diff --git a/datafusion/functions/src/string/starts_with.rs b/datafusion/functions/src/string/starts_with.rs index 229982a9616a..74d0fbdc4033 100644 --- a/datafusion/functions/src/string/starts_with.rs +++ b/datafusion/functions/src/string/starts_with.rs @@ -20,10 +20,11 @@ use std::sync::Arc; use arrow::array::ArrayRef; use arrow::datatypes::DataType; +use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo}; use crate::utils::make_scalar_function; -use datafusion_common::{internal_err, Result}; -use datafusion_expr::{ColumnarValue, Documentation}; +use datafusion_common::{internal_err, Result, ScalarValue}; +use datafusion_expr::{ColumnarValue, Documentation, Expr, Like}; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; use datafusion_macros::user_doc; @@ -98,6 +99,48 @@ impl ScalarUDFImpl for StartsWithFunc { } } + fn simplify( + &self, + args: Vec, + _info: &dyn SimplifyInfo, + ) -> Result { + if let Expr::Literal(scalar_value) = &args[1] { + // Convert starts_with(col, 'prefix') to col LIKE 'prefix%' with proper escaping + // Example: starts_with(col, 'ja%') -> col LIKE 'ja\%%' + // 1. 'ja%' (input pattern) + // 2. 'ja\%' (escape special char '%') + // 3. 'ja\%%' (add suffix for starts_with) + let like_expr = match scalar_value { + ScalarValue::Utf8(Some(pattern)) => { + let escaped_pattern = pattern.replace("%", "\\%"); + let like_pattern = format!("{}%", escaped_pattern); + Expr::Literal(ScalarValue::Utf8(Some(like_pattern))) + } + ScalarValue::LargeUtf8(Some(pattern)) => { + let escaped_pattern = pattern.replace("%", "\\%"); + let like_pattern = format!("{}%", escaped_pattern); + Expr::Literal(ScalarValue::LargeUtf8(Some(like_pattern))) + } + ScalarValue::Utf8View(Some(pattern)) => { + let escaped_pattern = pattern.replace("%", "\\%"); + let like_pattern = format!("{}%", escaped_pattern); + Expr::Literal(ScalarValue::Utf8View(Some(like_pattern))) + } + _ => return Ok(ExprSimplifyResult::Original(args)), + }; + + return Ok(ExprSimplifyResult::Simplified(Expr::Like(Like { + negated: false, + expr: Box::new(args[0].clone()), + pattern: Box::new(like_expr), + escape_char: None, + case_insensitive: false, + }))); + } + + Ok(ExprSimplifyResult::Original(args)) + } + fn documentation(&self) -> Option<&Documentation> { self.doc() } diff --git a/datafusion/sqllogictest/test_files/parquet.slt b/datafusion/sqllogictest/test_files/parquet.slt index 0faa7bd6b0bf..28f1a6a3780d 100644 --- a/datafusion/sqllogictest/test_files/parquet.slt +++ b/datafusion/sqllogictest/test_files/parquet.slt @@ -598,3 +598,34 @@ drop table cpu; statement ok drop table cpu_parquet; + +# Test for parquet predicate pruning with `starts_with` function +query I +copy (values ('foo'), ('bar'), ('baz')) TO 'test_files/scratch/parquet/foo.parquet' +---- +3 + +statement ok +create external table foo +stored as parquet +location 'test_files/scratch/parquet/foo.parquet'; + + +# Expect that the pruning predicate contain a comparison on the min/max value of `column1): +# column1_min@0 <= g AND f <= column1_max@1` +# (the starts_with function is not supported in the parquet predicate pruning but DataFusion rewrites +# it to a like which is then handled by the PruningPredicate) +query TT +explain select * from foo where starts_with(column1, 'f'); +---- +logical_plan +01)Filter: foo.column1 LIKE Utf8View("f%") +02)--TableScan: foo projection=[column1], partial_filters=[foo.column1 LIKE Utf8View("f%")] +physical_plan +01)CoalesceBatchesExec: target_batch_size=8192 +02)--FilterExec: column1@0 LIKE f% +03)----RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +04)------ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/foo.parquet]]}, projection=[column1], predicate=column1@0 LIKE f%, pruning_predicate=column1_null_count@2 != column1_row_count@3 AND column1_min@0 <= g AND f <= column1_max@1, required_guarantees=[] + +statement ok +drop table foo diff --git a/datafusion/sqllogictest/test_files/string/string_view.slt b/datafusion/sqllogictest/test_files/string/string_view.slt index c37dd1ed3b4f..435b4bc3c5a8 100644 --- a/datafusion/sqllogictest/test_files/string/string_view.slt +++ b/datafusion/sqllogictest/test_files/string/string_view.slt @@ -344,9 +344,51 @@ EXPLAIN SELECT FROM test; ---- logical_plan -01)Projection: starts_with(test.column1_utf8view, Utf8View("äöüß")) AS c1, starts_with(test.column1_utf8view, Utf8View("")) AS c2, starts_with(test.column1_utf8view, Utf8View(NULL)) AS c3, starts_with(Utf8View(NULL), test.column1_utf8view) AS c4 +01)Projection: test.column1_utf8view LIKE Utf8View("äöüß%") AS c1, CASE test.column1_utf8view IS NOT NULL WHEN Boolean(true) THEN Boolean(true) END AS c2, starts_with(test.column1_utf8view, Utf8View(NULL)) AS c3, starts_with(Utf8View(NULL), test.column1_utf8view) AS c4 02)--TableScan: test projection=[column1_utf8view] +## Test STARTS_WITH is rewitten to LIKE when the pattern is a constant +query TT +EXPLAIN SELECT + STARTS_WITH(column1_utf8, 'foo%') as c1, + STARTS_WITH(column1_large_utf8, 'foo%') as c2, + STARTS_WITH(column1_utf8view, 'foo%') as c3, + STARTS_WITH(column1_utf8, 'f_o') as c4, + STARTS_WITH(column1_large_utf8, 'f_o') as c5, + STARTS_WITH(column1_utf8view, 'f_o') as c6 +FROM test; +---- +logical_plan +01)Projection: test.column1_utf8 LIKE Utf8("foo\%%") AS c1, test.column1_large_utf8 LIKE LargeUtf8("foo\%%") AS c2, test.column1_utf8view LIKE Utf8View("foo\%%") AS c3, test.column1_utf8 LIKE Utf8("f_o%") AS c4, test.column1_large_utf8 LIKE LargeUtf8("f_o%") AS c5, test.column1_utf8view LIKE Utf8View("f_o%") AS c6 +02)--TableScan: test projection=[column1_utf8, column1_large_utf8, column1_utf8view] + +## Test STARTS_WITH works with column arguments +query TT +EXPLAIN SELECT + STARTS_WITH(column1_utf8, substr(column1_utf8, 1, 2)) as c1, + STARTS_WITH(column1_large_utf8, substr(column1_large_utf8, 1, 2)) as c2, + STARTS_WITH(column1_utf8view, substr(column1_utf8view, 1, 2)) as c3 +FROM test; +---- +logical_plan +01)Projection: starts_with(test.column1_utf8, substr(test.column1_utf8, Int64(1), Int64(2))) AS c1, starts_with(test.column1_large_utf8, substr(test.column1_large_utf8, Int64(1), Int64(2))) AS c2, starts_with(test.column1_utf8view, substr(test.column1_utf8view, Int64(1), Int64(2))) AS c3 +02)--TableScan: test projection=[column1_utf8, column1_large_utf8, column1_utf8view] + +query BBB +SELECT + STARTS_WITH(column1_utf8, substr(column1_utf8, 1, 2)) as c1, + STARTS_WITH(column1_large_utf8, substr(column1_large_utf8, 1, 2)) as c2, + STARTS_WITH(column1_utf8view, substr(column1_utf8view, 1, 2)) as c3 +FROM test; +---- +true true true +true true true +true true true +true true true +NULL NULL NULL + + +# Ensure that INIT cap works with utf8view query TT EXPLAIN SELECT INITCAP(column1_utf8view) as c @@ -887,7 +929,7 @@ EXPLAIN SELECT FROM test; ---- logical_plan -01)Projection: starts_with(test.column1_utf8view, Utf8View("foo")) AS c, starts_with(test.column1_utf8view, test.column2_utf8view) AS c2 +01)Projection: test.column1_utf8view LIKE Utf8View("foo%") AS c, starts_with(test.column1_utf8view, test.column2_utf8view) AS c2 02)--TableScan: test projection=[column1_utf8view, column2_utf8view] ## Ensure no casts for TRANSLATE