diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index bfd0411798c9..c5576b7e7d44 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -567,9 +567,9 @@ dependencies = [ [[package]] name = "aws-sdk-sts" -version = "1.49.0" +version = "1.50.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53dcf5e7d9bd1517b8b998e170e650047cea8a2b85fe1835abe3210713e541b7" +checksum = "6ada54e5f26ac246dc79727def52f7f8ed38915cb47781e2a72213957dc3a7d5" dependencies = [ "aws-credential-types", "aws-runtime", @@ -857,9 +857,9 @@ dependencies = [ [[package]] name = "bstr" -version = "1.10.0" +version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "40723b8fb387abc38f4f4a37c09073622e41dd12327033091ef8950659e6dc0c" +checksum = "1a68f1f47cdf0ec8ee4b941b2eee2a80cb796db73118c0dd09ac63fbe405be22" dependencies = [ "memchr", "regex-automata", @@ -917,9 +917,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.0" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1aeb932158bd710538c73702db6945cb68a8fb08c519e6e12706b94263b36db8" +checksum = "fd9de9f2205d5ef3fd67e685b0df337994ddd4495e2a28d185500d0e1edfea47" dependencies = [ "jobserver", "libc", @@ -980,9 +980,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.20" +version = "4.5.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b97f376d85a664d5837dbae44bf546e6477a679ff6610010f17276f686d867e8" +checksum = "fb3b4b9e5a7c7514dfa52869339ee98b3156b0bfb4e8a77c4ff4babb64b1604f" dependencies = [ "clap_builder", "clap_derive", @@ -990,9 +990,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.20" +version = "4.5.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19bc80abd44e4bed93ca373a0704ccbd1b710dc5749406201bb018272808dc54" +checksum = "b17a95aa67cc7b5ebd32aa5370189aa0d79069ef1c64ce893bd30fb24bff20ec" dependencies = [ "anstream", "anstyle", @@ -1014,9 +1014,9 @@ dependencies = [ [[package]] name = "clap_lex" -version = "0.7.2" +version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1462739cb27611015575c0c11df5df7601141071f07518d56fcc1be504cbec97" +checksum = "afb84c814227b90d6895e01398aee0d8033c00e7466aca416fb6a8e0eb19d8a7" [[package]] name = "clipboard-win" @@ -1035,9 +1035,9 @@ checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990" [[package]] name = "comfy-table" -version = "7.1.2" +version = "7.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0d05af1e006a2407bedef5af410552494ce5be9090444dbbcb57258c1af3d56" +checksum = "24f165e7b643266ea80cb858aed492ad9280e3e05ce24d4a99d7d7b889b6a4d9" dependencies = [ "strum 0.26.3", "strum_macros 0.26.4", @@ -1537,9 +1537,11 @@ dependencies = [ "datafusion-common", "datafusion-execution", "datafusion-expr-common", + "datafusion-optimizer", "datafusion-physical-expr", "datafusion-physical-plan", "itertools", + "log", "recursive", ] @@ -1749,9 +1751,9 @@ dependencies = [ [[package]] name = "flate2" -version = "1.0.34" +version = "1.0.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1b589b4dc103969ad3cf85c950899926ec64300a1a46d76c03a6072957036f0" +checksum = "c936bfdafb507ebbf50b8074c54fa31c5be9a1e7e5f467dd659697041407d07c" dependencies = [ "crc32fast", "miniz_oxide", @@ -2162,7 +2164,7 @@ dependencies = [ "http 1.1.0", "hyper 1.5.0", "hyper-util", - "rustls 0.23.16", + "rustls 0.23.17", "rustls-native-certs 0.8.0", "rustls-pki-types", "tokio", @@ -2484,9 +2486,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.162" +version = "0.2.164" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "18d287de67fe55fd7e1581fe933d965a5a9477b38e949cfa9f8574ef01506398" +checksum = "433bfe06b8c75da9b2e3fbea6e5329ff87748f0b144ef75306e674c3f6f7c13f" [[package]] name = "libflate" @@ -3073,7 +3075,7 @@ dependencies = [ "quinn-proto", "quinn-udp", "rustc-hash", - "rustls 0.23.16", + "rustls 0.23.17", "socket2", "thiserror 2.0.3", "tokio", @@ -3091,7 +3093,7 @@ dependencies = [ "rand", "ring", "rustc-hash", - "rustls 0.23.16", + "rustls 0.23.17", "rustls-pki-types", "slab", "thiserror 2.0.3", @@ -3269,7 +3271,7 @@ dependencies = [ "percent-encoding", "pin-project-lite", "quinn", - "rustls 0.23.16", + "rustls 0.23.17", "rustls-native-certs 0.8.0", "rustls-pemfile 2.2.0", "rustls-pki-types", @@ -3363,9 +3365,9 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.40" +version = "0.38.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99e4ea3e1cdc4b559b8e5650f9c8e5998e3e5c1343b4eaf034565f32318d63c0" +checksum = "d7f649912bc1495e167a6edee79151c84b1bad49748cb4f1f1167f459f6224f6" dependencies = [ "bitflags 2.6.0", "errno", @@ -3388,9 +3390,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.23.16" +version = "0.23.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eee87ff5d9b36712a58574e12e9f0ea80f915a5b0ac518d322b24a465617925e" +checksum = "7f1a745511c54ba6d4465e8d5dfbd81b45791756de28d4981af70d6dca128f1e" dependencies = [ "once_cell", "ring", @@ -3518,9 +3520,9 @@ dependencies = [ [[package]] name = "schannel" -version = "0.1.26" +version = "0.1.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01227be5826fa0690321a2ba6c5cd57a19cf3f6a09e76973b58e61de6ab9d1c1" +checksum = "1f29ebaa345f945cec9fbbc532eb307f0fdad8161f281b6369539c8d84876b3d" dependencies = [ "windows-sys 0.59.0", ] @@ -3598,9 +3600,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.132" +version = "1.0.133" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d726bfaff4b320266d395898905d0eba0345aae23b54aee3a737e260fd46db03" +checksum = "c7fceb2473b9166b2294ef05efcb65a3db80803f0b03ef86a5fc88a2b85ee377" dependencies = [ "itoa", "memchr", @@ -4019,7 +4021,7 @@ version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0c7bc40d0e5a97695bb96e27995cd3a08538541b0a846f65bba7a359f36700d4" dependencies = [ - "rustls 0.23.16", + "rustls 0.23.17", "rustls-pki-types", "tokio", ] diff --git a/datafusion/core/src/physical_optimizer/mod.rs b/datafusion/core/src/physical_optimizer/mod.rs index a9f6f30dc175..000c27effdb6 100644 --- a/datafusion/core/src/physical_optimizer/mod.rs +++ b/datafusion/core/src/physical_optimizer/mod.rs @@ -27,7 +27,6 @@ pub mod enforce_sorting; pub mod join_selection; pub mod optimizer; pub mod projection_pushdown; -pub mod pruning; pub mod replace_with_order_preserving_variants; pub mod sanity_checker; #[cfg(test)] diff --git a/datafusion/physical-optimizer/Cargo.toml b/datafusion/physical-optimizer/Cargo.toml index 04f01f8badb8..718567de8df4 100644 --- a/datafusion/physical-optimizer/Cargo.toml +++ b/datafusion/physical-optimizer/Cargo.toml @@ -37,11 +37,15 @@ arrow-schema = { workspace = true } datafusion-common = { workspace = true, default-features = true } datafusion-execution = { workspace = true } datafusion-expr-common = { workspace = true, default-features = true } +datafusion-optimizer = { workspace = true } datafusion-physical-expr = { workspace = true } datafusion-physical-plan = { workspace = true } itertools = { workspace = true } +log = { workspace = true } recursive = { workspace = true } [dev-dependencies] +datafusion-expr = { workspace = true } datafusion-functions-aggregate = { workspace = true } +datafusion-functions-nested = { workspace = true } tokio = { workspace = true } diff --git a/datafusion/physical-optimizer/src/lib.rs b/datafusion/physical-optimizer/src/lib.rs index 5d0ccde9f8cd..c4f5fa74e122 100644 --- a/datafusion/physical-optimizer/src/lib.rs +++ b/datafusion/physical-optimizer/src/lib.rs @@ -25,6 +25,7 @@ pub mod limit_pushdown; pub mod limited_distinct_aggregation; mod optimizer; pub mod output_requirements; +pub mod pruning; pub mod topk_aggregation; pub mod update_aggr_exprs; diff --git a/datafusion/core/src/physical_optimizer/pruning.rs b/datafusion/physical-optimizer/src/pruning.rs similarity index 99% rename from datafusion/core/src/physical_optimizer/pruning.rs rename to datafusion/physical-optimizer/src/pruning.rs index 89b86471561e..3cfb03b7205a 100644 --- a/datafusion/core/src/physical_optimizer/pruning.rs +++ b/datafusion/physical-optimizer/src/pruning.rs @@ -18,33 +18,30 @@ //! [`PruningPredicate`] to apply filter [`Expr`] to prune "containers" //! based on statistics (e.g. Parquet Row Groups) //! -//! [`Expr`]: crate::prelude::Expr +//! [`Expr`]: https://docs.rs/datafusion/latest/datafusion/logical_expr/enum.Expr.html use std::collections::HashSet; use std::sync::Arc; -use crate::{ - common::{Column, DFSchema}, - error::{DataFusionError, Result}, - logical_expr::Operator, - physical_plan::{ColumnarValue, PhysicalExpr}, -}; - +use arrow::array::AsArray; use arrow::{ array::{new_null_array, ArrayRef, BooleanArray}, datatypes::{DataType, Field, Schema, SchemaRef}, record_batch::{RecordBatch, RecordBatchOptions}, }; -use arrow_array::cast::AsArray; +use log::trace; + +use datafusion_common::error::{DataFusionError, Result}; use datafusion_common::tree_node::TransformedResult; use datafusion_common::{ internal_err, plan_datafusion_err, plan_err, tree_node::{Transformed, TreeNode}, ScalarValue, }; +use datafusion_common::{Column, DFSchema}; +use datafusion_expr_common::operator::Operator; use datafusion_physical_expr::utils::{collect_columns, Guarantee, LiteralGuarantee}; use datafusion_physical_expr::{expressions as phys_expr, PhysicalExprRef}; - -use log::trace; +use datafusion_physical_plan::{ColumnarValue, PhysicalExpr}; /// A source of runtime statistical information to [`PruningPredicate`]s. /// @@ -567,7 +564,7 @@ impl PruningPredicate { /// expressions like `b = false`, but it does handle the /// simplified version `b`. See [`ExprSimplifier`] to simplify expressions. /// - /// [`ExprSimplifier`]: crate::optimizer::simplify_expressions::ExprSimplifier + /// [`ExprSimplifier`]: https://docs.rs/datafusion/latest/datafusion/optimizer/simplify_expressions/struct.ExprSimplifier.html pub fn prune(&self, statistics: &S) -> Result> { let mut builder = BoolVecBuilder::new(statistics.num_containers()); @@ -653,7 +650,7 @@ impl PruningPredicate { // this is only used by `parquet` feature right now #[allow(dead_code)] - pub(crate) fn required_columns(&self) -> &RequiredColumns { + pub fn required_columns(&self) -> &RequiredColumns { &self.required_columns } @@ -762,7 +759,7 @@ fn is_always_true(expr: &Arc) -> bool { /// Handles creating references to the min/max statistics /// for columns as well as recording which statistics are needed #[derive(Debug, Default, Clone)] -pub(crate) struct RequiredColumns { +pub struct RequiredColumns { /// The statistics required to evaluate this predicate: /// * The unqualified column in the input schema /// * Statistics type (e.g. Min or Max or Null_Count) @@ -786,7 +783,7 @@ impl RequiredColumns { /// * `true` returns None #[allow(dead_code)] // this fn is only used by `parquet` feature right now, thus the `allow(dead_code)` - pub(crate) fn single_column(&self) -> Option<&phys_expr::Column> { + pub fn single_column(&self) -> Option<&phys_expr::Column> { if self.columns.windows(2).all(|w| { // check if all columns are the same (ignoring statistics and field) let c1 = &w[0].0; @@ -1664,15 +1661,14 @@ mod tests { use std::ops::{Not, Rem}; use super::*; - use crate::assert_batches_eq; - use crate::logical_expr::{col, lit}; + use datafusion_common::assert_batches_eq; + use datafusion_expr::{col, lit}; use arrow::array::Decimal128Array; use arrow::{ - array::{BinaryArray, Int32Array, Int64Array, StringArray}, + array::{BinaryArray, Int32Array, Int64Array, StringArray, UInt64Array}, datatypes::TimeUnit, }; - use arrow_array::UInt64Array; use datafusion_expr::expr::InList; use datafusion_expr::{cast, is_null, try_cast, Expr}; use datafusion_functions_nested::expr_fn::{array_has, make_array}; @@ -3536,7 +3532,7 @@ mod tests { // more complex case with unknown column let input = known_expression.clone().and(input.clone()); let expected = phys_expr::BinaryExpr::new( - known_expression_transformed.clone(), + Arc::::clone(&known_expression_transformed), Operator::And, logical2physical(&lit(42), &schema), ); @@ -3552,7 +3548,7 @@ mod tests { // more complex case with unknown expression let input = known_expression.and(input); let expected = phys_expr::BinaryExpr::new( - known_expression_transformed.clone(), + Arc::::clone(&known_expression_transformed), Operator::And, logical2physical(&lit(42), &schema), ); @@ -4038,7 +4034,7 @@ mod tests { ) { println!("Pruning with expr: {}", expr); let expr = logical2physical(&expr, schema); - let p = PruningPredicate::try_new(expr, schema.clone()).unwrap(); + let p = PruningPredicate::try_new(expr, Arc::::clone(schema)).unwrap(); let result = p.prune(statistics).unwrap(); assert_eq!(result, expected); }