Skip to content

Commit

Permalink
Implemented simplify for the starts_with function to convert it i…
Browse files Browse the repository at this point in the history
…nto a LIKE expression. (#14119)

* Implemented `simplify` for the `starts_with` function to convert it into a LIKE expression, enabling predicate pruning optimization.

* fix: escape special characters in starts_with to LIKE conversion

* updated simply function to handle utf8, largeutf8 and utf8view data type. and updated the coresponding test

* Add some more tests

* Add pruning test

---------

Co-authored-by: Andrew Lamb <[email protected]>
  • Loading branch information
jatin510 and alamb authored Jan 23, 2025
1 parent bd0b56f commit 49f95af
Show file tree
Hide file tree
Showing 3 changed files with 120 additions and 4 deletions.
47 changes: 45 additions & 2 deletions datafusion/functions/src/string/starts_with.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,11 @@ use std::sync::Arc;

use arrow::array::ArrayRef;
use arrow::datatypes::DataType;
use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo};

use crate::utils::make_scalar_function;
use datafusion_common::{internal_err, Result};
use datafusion_expr::{ColumnarValue, Documentation};
use datafusion_common::{internal_err, Result, ScalarValue};
use datafusion_expr::{ColumnarValue, Documentation, Expr, Like};
use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
use datafusion_macros::user_doc;

Expand Down Expand Up @@ -98,6 +99,48 @@ impl ScalarUDFImpl for StartsWithFunc {
}
}

fn simplify(
&self,
args: Vec<Expr>,
_info: &dyn SimplifyInfo,
) -> Result<ExprSimplifyResult> {
if let Expr::Literal(scalar_value) = &args[1] {
// Convert starts_with(col, 'prefix') to col LIKE 'prefix%' with proper escaping
// Example: starts_with(col, 'ja%') -> col LIKE 'ja\%%'
// 1. 'ja%' (input pattern)
// 2. 'ja\%' (escape special char '%')
// 3. 'ja\%%' (add suffix for starts_with)
let like_expr = match scalar_value {
ScalarValue::Utf8(Some(pattern)) => {
let escaped_pattern = pattern.replace("%", "\\%");
let like_pattern = format!("{}%", escaped_pattern);
Expr::Literal(ScalarValue::Utf8(Some(like_pattern)))
}
ScalarValue::LargeUtf8(Some(pattern)) => {
let escaped_pattern = pattern.replace("%", "\\%");
let like_pattern = format!("{}%", escaped_pattern);
Expr::Literal(ScalarValue::LargeUtf8(Some(like_pattern)))
}
ScalarValue::Utf8View(Some(pattern)) => {
let escaped_pattern = pattern.replace("%", "\\%");
let like_pattern = format!("{}%", escaped_pattern);
Expr::Literal(ScalarValue::Utf8View(Some(like_pattern)))
}
_ => return Ok(ExprSimplifyResult::Original(args)),
};

return Ok(ExprSimplifyResult::Simplified(Expr::Like(Like {
negated: false,
expr: Box::new(args[0].clone()),
pattern: Box::new(like_expr),
escape_char: None,
case_insensitive: false,
})));
}

Ok(ExprSimplifyResult::Original(args))
}

fn documentation(&self) -> Option<&Documentation> {
self.doc()
}
Expand Down
31 changes: 31 additions & 0 deletions datafusion/sqllogictest/test_files/parquet.slt
Original file line number Diff line number Diff line change
Expand Up @@ -598,3 +598,34 @@ drop table cpu;

statement ok
drop table cpu_parquet;

# Test for parquet predicate pruning with `starts_with` function
query I
copy (values ('foo'), ('bar'), ('baz')) TO 'test_files/scratch/parquet/foo.parquet'
----
3

statement ok
create external table foo
stored as parquet
location 'test_files/scratch/parquet/foo.parquet';


# Expect that the pruning predicate contain a comparison on the min/max value of `column1):
# column1_min@0 <= g AND f <= column1_max@1`
# (the starts_with function is not supported in the parquet predicate pruning but DataFusion rewrites
# it to a like which is then handled by the PruningPredicate)
query TT
explain select * from foo where starts_with(column1, 'f');
----
logical_plan
01)Filter: foo.column1 LIKE Utf8View("f%")
02)--TableScan: foo projection=[column1], partial_filters=[foo.column1 LIKE Utf8View("f%")]
physical_plan
01)CoalesceBatchesExec: target_batch_size=8192
02)--FilterExec: column1@0 LIKE f%
03)----RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
04)------ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/foo.parquet]]}, projection=[column1], predicate=column1@0 LIKE f%, pruning_predicate=column1_null_count@2 != column1_row_count@3 AND column1_min@0 <= g AND f <= column1_max@1, required_guarantees=[]

statement ok
drop table foo
46 changes: 44 additions & 2 deletions datafusion/sqllogictest/test_files/string/string_view.slt
Original file line number Diff line number Diff line change
Expand Up @@ -344,9 +344,51 @@ EXPLAIN SELECT
FROM test;
----
logical_plan
01)Projection: starts_with(test.column1_utf8view, Utf8View("äöüß")) AS c1, starts_with(test.column1_utf8view, Utf8View("")) AS c2, starts_with(test.column1_utf8view, Utf8View(NULL)) AS c3, starts_with(Utf8View(NULL), test.column1_utf8view) AS c4
01)Projection: test.column1_utf8view LIKE Utf8View("äöüß%") AS c1, CASE test.column1_utf8view IS NOT NULL WHEN Boolean(true) THEN Boolean(true) END AS c2, starts_with(test.column1_utf8view, Utf8View(NULL)) AS c3, starts_with(Utf8View(NULL), test.column1_utf8view) AS c4
02)--TableScan: test projection=[column1_utf8view]

## Test STARTS_WITH is rewitten to LIKE when the pattern is a constant
query TT
EXPLAIN SELECT
STARTS_WITH(column1_utf8, 'foo%') as c1,
STARTS_WITH(column1_large_utf8, 'foo%') as c2,
STARTS_WITH(column1_utf8view, 'foo%') as c3,
STARTS_WITH(column1_utf8, 'f_o') as c4,
STARTS_WITH(column1_large_utf8, 'f_o') as c5,
STARTS_WITH(column1_utf8view, 'f_o') as c6
FROM test;
----
logical_plan
01)Projection: test.column1_utf8 LIKE Utf8("foo\%%") AS c1, test.column1_large_utf8 LIKE LargeUtf8("foo\%%") AS c2, test.column1_utf8view LIKE Utf8View("foo\%%") AS c3, test.column1_utf8 LIKE Utf8("f_o%") AS c4, test.column1_large_utf8 LIKE LargeUtf8("f_o%") AS c5, test.column1_utf8view LIKE Utf8View("f_o%") AS c6
02)--TableScan: test projection=[column1_utf8, column1_large_utf8, column1_utf8view]

## Test STARTS_WITH works with column arguments
query TT
EXPLAIN SELECT
STARTS_WITH(column1_utf8, substr(column1_utf8, 1, 2)) as c1,
STARTS_WITH(column1_large_utf8, substr(column1_large_utf8, 1, 2)) as c2,
STARTS_WITH(column1_utf8view, substr(column1_utf8view, 1, 2)) as c3
FROM test;
----
logical_plan
01)Projection: starts_with(test.column1_utf8, substr(test.column1_utf8, Int64(1), Int64(2))) AS c1, starts_with(test.column1_large_utf8, substr(test.column1_large_utf8, Int64(1), Int64(2))) AS c2, starts_with(test.column1_utf8view, substr(test.column1_utf8view, Int64(1), Int64(2))) AS c3
02)--TableScan: test projection=[column1_utf8, column1_large_utf8, column1_utf8view]

query BBB
SELECT
STARTS_WITH(column1_utf8, substr(column1_utf8, 1, 2)) as c1,
STARTS_WITH(column1_large_utf8, substr(column1_large_utf8, 1, 2)) as c2,
STARTS_WITH(column1_utf8view, substr(column1_utf8view, 1, 2)) as c3
FROM test;
----
true true true
true true true
true true true
true true true
NULL NULL NULL


# Ensure that INIT cap works with utf8view
query TT
EXPLAIN SELECT
INITCAP(column1_utf8view) as c
Expand Down Expand Up @@ -887,7 +929,7 @@ EXPLAIN SELECT
FROM test;
----
logical_plan
01)Projection: starts_with(test.column1_utf8view, Utf8View("foo")) AS c, starts_with(test.column1_utf8view, test.column2_utf8view) AS c2
01)Projection: test.column1_utf8view LIKE Utf8View("foo%") AS c, starts_with(test.column1_utf8view, test.column2_utf8view) AS c2
02)--TableScan: test projection=[column1_utf8view, column2_utf8view]

## Ensure no casts for TRANSLATE
Expand Down

0 comments on commit 49f95af

Please sign in to comment.