diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 5140ae7eb0dc..9983e247f9ac 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -1482,6 +1482,7 @@ dependencies = [ "itertools", "log", "paste", + "regex", "regex-syntax", ] diff --git a/datafusion/optimizer/Cargo.toml b/datafusion/optimizer/Cargo.toml index 79a5bb24e918..2ea3ebf337eb 100644 --- a/datafusion/optimizer/Cargo.toml +++ b/datafusion/optimizer/Cargo.toml @@ -47,6 +47,7 @@ indexmap = { workspace = true } itertools = { workspace = true } log = { workspace = true } paste = "1.0.14" +regex = { workspace = true } regex-syntax = "0.8.0" [dev-dependencies] diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index e0df6a3a68ce..d8ca246bb635 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -49,6 +49,7 @@ use crate::analyzer::type_coercion::TypeCoercionRewriter; use crate::simplify_expressions::guarantees::GuaranteeRewriter; use crate::simplify_expressions::regex::simplify_regex_expr; use crate::simplify_expressions::SimplifyInfo; +use regex::Regex; use super::inlist_simplifier::ShortenInListSimplifier; use super::utils::*; @@ -1470,34 +1471,70 @@ impl<'a, S: SimplifyInfo> TreeNodeRewriter for Simplifier<'a, S> { }) => Transformed::yes(simplify_regex_expr(left, op, right)?), // Rules for Like - Expr::Like(Like { - expr, - pattern, - negated, - escape_char: _, - case_insensitive: _, - }) if matches!( - pattern.as_ref(), - Expr::Literal(ScalarValue::Utf8(Some(pattern_str))) if pattern_str == "%" - ) || matches!( - pattern.as_ref(), - Expr::Literal(ScalarValue::LargeUtf8(Some(pattern_str))) if pattern_str == "%" - ) || matches!( - pattern.as_ref(), - Expr::Literal(ScalarValue::Utf8View(Some(pattern_str))) if pattern_str == "%" - ) => - { - // exp LIKE '%' is - // - when exp is not 
NULL, it's true - // - when exp is NULL, it's NULL - // exp NOT LIKE '%' is - // - when exp is not NULL, it's false - // - when exp is NULL, it's NULL - Transformed::yes(Expr::Case(Case { - expr: Some(Box::new(Expr::IsNotNull(expr))), - when_then_expr: vec![(Box::new(lit(true)), Box::new(lit(!negated)))], - else_expr: None, - })) + Expr::Like(like) => { + // `\` is implicit escape, see https://github.com/apache/datafusion/issues/13291 + let escape_char = like.escape_char.unwrap_or('\\'); + match as_string_scalar(&like.pattern) { + Some((data_type, pattern_str)) => { + match pattern_str { + None => return Ok(Transformed::yes(lit_bool_null())), + Some(pattern_str) if pattern_str == "%" => { + // exp LIKE '%' is + // - when exp is not NULL, it's true + // - when exp is NULL, it's NULL + // exp NOT LIKE '%' is + // - when exp is not NULL, it's false + // - when exp is NULL, it's NULL + let result_for_non_null = lit(!like.negated); + Transformed::yes(if !info.nullable(&like.expr)? { + result_for_non_null + } else { + Expr::Case(Case { + expr: Some(Box::new(Expr::IsNotNull(like.expr))), + when_then_expr: vec![( + Box::new(lit(true)), + Box::new(result_for_non_null), + )], + else_expr: None, + }) + }) + } + Some(pattern_str) + if pattern_str.contains("%%") + && !pattern_str.contains(escape_char) => + { + // Repeated occurrences of wildcard are redundant so remove them + // exp LIKE '%%' --> exp LIKE '%' + let simplified_pattern = Regex::new("%%+") + .unwrap() + .replace_all(pattern_str, "%") + .to_string(); + Transformed::yes(Expr::Like(Like { + pattern: Box::new(to_string_scalar( + data_type, + Some(simplified_pattern), + )), + ..like + })) + } + Some(pattern_str) + if !like.case_insensitive && !pattern_str + .contains(['%', '_', escape_char].as_ref()) => + { + // If the pattern does not contain any wildcards, a case-sensitive LIKE can be simplified to an equality expression (ILIKE is excluded by the guard: `=` / `!=` would not compare case-insensitively) + // TODO: handle escape characters + Transformed::yes(Expr::BinaryExpr(BinaryExpr { + left: like.expr.clone(), + op: if 
like.negated { NotEq } else { Eq }, + right: like.pattern.clone(), + })) + } + + Some(_pattern_str) => Transformed::no(Expr::Like(like)), + } + } + None => Transformed::no(Expr::Like(like)), + } } // a is not null/unknown --> true (if a is not nullable) @@ -1696,6 +1733,24 @@ impl<'a, S: SimplifyInfo> TreeNodeRewriter for Simplifier<'a, S> { } } +fn as_string_scalar(expr: &Expr) -> Option<(DataType, &Option<String>)> { + match expr { + Expr::Literal(ScalarValue::Utf8(s)) => Some((DataType::Utf8, s)), + Expr::Literal(ScalarValue::LargeUtf8(s)) => Some((DataType::LargeUtf8, s)), + Expr::Literal(ScalarValue::Utf8View(s)) => Some((DataType::Utf8View, s)), + _ => None, + } +} + +fn to_string_scalar(data_type: DataType, value: Option<String>) -> Expr { + match data_type { + DataType::Utf8 => Expr::Literal(ScalarValue::Utf8(value)), + DataType::LargeUtf8 => Expr::Literal(ScalarValue::LargeUtf8(value)), + DataType::Utf8View => Expr::Literal(ScalarValue::Utf8View(value)), + _ => unreachable!(), + } +} + fn has_common_conjunction(lhs: &Expr, rhs: &Expr) -> bool { let lhs_set: HashSet<&Expr> = iter_conjunction(lhs).collect(); iter_conjunction(rhs).any(|e| lhs_set.contains(&e) && !e.is_volatile()) @@ -2810,10 +2865,16 @@ mod tests { ); // single character - assert_change(regex_match(col("c1"), lit("x")), like(col("c1"), "%x%")); + assert_change( + regex_match(col("c1"), lit("x")), + like(col("c1"), lit("%x%")), + ); // single word - assert_change(regex_match(col("c1"), lit("foo")), like(col("c1"), "%foo%")); + assert_change( + regex_match(col("c1"), lit("foo")), + like(col("c1"), lit("%foo%")), + ); // regular expressions that match an exact literal assert_change(regex_match(col("c1"), lit("^$")), col("c1").eq(lit(""))); @@ -2900,44 +2961,50 @@ mod tests { assert_no_change(regex_match(col("c1"), lit("$foo^"))); // regular expressions that match a partial literal - assert_change(regex_match(col("c1"), lit("^foo")), like(col("c1"), "foo%")); - assert_change(regex_match(col("c1"), lit("foo$")), 
like(col("c1"), "%foo")); + assert_change( + regex_match(col("c1"), lit("^foo")), + like(col("c1"), lit("foo%")), + ); + assert_change( + regex_match(col("c1"), lit("foo$")), + like(col("c1"), lit("%foo")), + ); assert_change( regex_match(col("c1"), lit("^foo|bar$")), - like(col("c1"), "foo%").or(like(col("c1"), "%bar")), + like(col("c1"), lit("foo%")).or(like(col("c1"), lit("%bar"))), ); // OR-chain assert_change( regex_match(col("c1"), lit("foo|bar|baz")), - like(col("c1"), "%foo%") - .or(like(col("c1"), "%bar%")) - .or(like(col("c1"), "%baz%")), + like(col("c1"), lit("%foo%")) + .or(like(col("c1"), lit("%bar%"))) + .or(like(col("c1"), lit("%baz%"))), ); assert_change( regex_match(col("c1"), lit("foo|x|baz")), - like(col("c1"), "%foo%") - .or(like(col("c1"), "%x%")) - .or(like(col("c1"), "%baz%")), + like(col("c1"), lit("%foo%")) + .or(like(col("c1"), lit("%x%"))) + .or(like(col("c1"), lit("%baz%"))), ); assert_change( regex_not_match(col("c1"), lit("foo|bar|baz")), - not_like(col("c1"), "%foo%") - .and(not_like(col("c1"), "%bar%")) - .and(not_like(col("c1"), "%baz%")), + not_like(col("c1"), lit("%foo%")) + .and(not_like(col("c1"), lit("%bar%"))) + .and(not_like(col("c1"), lit("%baz%"))), ); // both anchored expressions (translated to equality) and unanchored assert_change( regex_match(col("c1"), lit("foo|^x$|baz")), - like(col("c1"), "%foo%") + like(col("c1"), lit("%foo%")) .or(col("c1").eq(lit("x"))) - .or(like(col("c1"), "%baz%")), + .or(like(col("c1"), lit("%baz%"))), ); assert_change( regex_not_match(col("c1"), lit("foo|^bar$|baz")), - not_like(col("c1"), "%foo%") + not_like(col("c1"), lit("%foo%")) .and(col("c1").not_eq(lit("bar"))) - .and(not_like(col("c1"), "%baz%")), + .and(not_like(col("c1"), lit("%baz%"))), ); // Too many patterns (MAX_REGEX_ALTERNATIONS_EXPANSION) assert_no_change(regex_match(col("c1"), lit("foo|bar|baz|blarg|bozo|etc"))); @@ -2987,41 +3054,41 @@ mod tests { }) } - fn like(expr: Expr, pattern: &str) -> Expr { + fn like(expr: Expr, 
pattern: impl Into<Expr>) -> Expr { Expr::Like(Like { negated: false, expr: Box::new(expr), - pattern: Box::new(lit(pattern)), + pattern: Box::new(pattern.into()), escape_char: None, case_insensitive: false, }) } - fn not_like(expr: Expr, pattern: &str) -> Expr { + fn not_like(expr: Expr, pattern: impl Into<Expr>) -> Expr { Expr::Like(Like { negated: true, expr: Box::new(expr), - pattern: Box::new(lit(pattern)), + pattern: Box::new(pattern.into()), escape_char: None, case_insensitive: false, }) } - fn ilike(expr: Expr, pattern: &str) -> Expr { + fn ilike(expr: Expr, pattern: impl Into<Expr>) -> Expr { Expr::Like(Like { negated: false, expr: Box::new(expr), - pattern: Box::new(lit(pattern)), + pattern: Box::new(pattern.into()), escape_char: None, case_insensitive: true, }) } - fn not_ilike(expr: Expr, pattern: &str) -> Expr { + fn not_ilike(expr: Expr, pattern: impl Into<Expr>) -> Expr { Expr::Like(Like { negated: true, expr: Box::new(expr), - pattern: Box::new(lit(pattern)), + pattern: Box::new(pattern.into()), escape_char: None, case_insensitive: true, }) @@ -3633,32 +3700,123 @@ mod tests { #[test] fn test_like_and_ilke() { - // LIKE '%' - let expr = like(col("c1"), "%"); + let null = lit(ScalarValue::Utf8(None)); + + // expr [NOT] [I]LIKE NULL + let expr = like(col("c1"), null.clone()); + assert_eq!(simplify(expr), lit_bool_null()); + + let expr = not_like(col("c1"), null.clone()); + assert_eq!(simplify(expr), lit_bool_null()); + + let expr = ilike(col("c1"), null.clone()); + assert_eq!(simplify(expr), lit_bool_null()); + + let expr = not_ilike(col("c1"), null.clone()); + assert_eq!(simplify(expr), lit_bool_null()); + + // expr [NOT] [I]LIKE '%' + let expr = like(col("c1"), lit("%")); + assert_eq!(simplify(expr), if_not_null(col("c1"), true)); + + let expr = not_like(col("c1"), lit("%")); + assert_eq!(simplify(expr), if_not_null(col("c1"), false)); + + let expr = ilike(col("c1"), lit("%")); + assert_eq!(simplify(expr), if_not_null(col("c1"), true)); + + let expr = not_ilike(col("c1"), 
lit("%")); + assert_eq!(simplify(expr), if_not_null(col("c1"), false)); + + // expr [NOT] [I]LIKE '%%' + let expr = like(col("c1"), lit("%%")); assert_eq!(simplify(expr), if_not_null(col("c1"), true)); - let expr = not_like(col("c1"), "%"); + let expr = not_like(col("c1"), lit("%%")); assert_eq!(simplify(expr), if_not_null(col("c1"), false)); - let expr = ilike(col("c1"), "%"); + let expr = ilike(col("c1"), lit("%%")); assert_eq!(simplify(expr), if_not_null(col("c1"), true)); - let expr = not_ilike(col("c1"), "%"); + let expr = not_ilike(col("c1"), lit("%%")); assert_eq!(simplify(expr), if_not_null(col("c1"), false)); - // null_constant LIKE '%' + // not_null_expr [NOT] [I]LIKE '%' + let expr = like(col("c1_non_null"), lit("%")); + assert_eq!(simplify(expr), lit(true)); + + let expr = not_like(col("c1_non_null"), lit("%")); + assert_eq!(simplify(expr), lit(false)); + + let expr = ilike(col("c1_non_null"), lit("%")); + assert_eq!(simplify(expr), lit(true)); + + let expr = not_ilike(col("c1_non_null"), lit("%")); + assert_eq!(simplify(expr), lit(false)); + + // not_null_expr [NOT] [I]LIKE '%%' + let expr = like(col("c1_non_null"), lit("%%")); + assert_eq!(simplify(expr), lit(true)); + + let expr = not_like(col("c1_non_null"), lit("%%")); + assert_eq!(simplify(expr), lit(false)); + + let expr = ilike(col("c1_non_null"), lit("%%")); + assert_eq!(simplify(expr), lit(true)); + + let expr = not_ilike(col("c1_non_null"), lit("%%")); + assert_eq!(simplify(expr), lit(false)); + + // null_constant [NOT] [I]LIKE '%' + let expr = like(null.clone(), lit("%")); + assert_eq!(simplify(expr), lit_bool_null()); + + let expr = not_like(null.clone(), lit("%")); + assert_eq!(simplify(expr), lit_bool_null()); + + let expr = ilike(null.clone(), lit("%")); + assert_eq!(simplify(expr), lit_bool_null()); + + let expr = not_ilike(null, lit("%")); + assert_eq!(simplify(expr), lit_bool_null()); + + // null_constant [NOT] [I]LIKE '%%' + let null = lit(ScalarValue::Utf8(None)); + let expr = 
like(null.clone(), lit("%%")); + assert_eq!(simplify(expr), lit_bool_null()); + + let expr = not_like(null.clone(), lit("%%")); + assert_eq!(simplify(expr), lit_bool_null()); + + let expr = ilike(null.clone(), lit("%%")); + assert_eq!(simplify(expr), lit_bool_null()); + + let expr = not_ilike(null, lit("%%")); + assert_eq!(simplify(expr), lit_bool_null()); + + // null_constant [NOT] [I]LIKE 'a%' let null = lit(ScalarValue::Utf8(None)); - let expr = like(null.clone(), "%"); + let expr = like(null.clone(), lit("a%")); assert_eq!(simplify(expr), lit_bool_null()); - let expr = not_like(null.clone(), "%"); + let expr = not_like(null.clone(), lit("a%")); assert_eq!(simplify(expr), lit_bool_null()); - let expr = ilike(null.clone(), "%"); + let expr = ilike(null.clone(), lit("a%")); assert_eq!(simplify(expr), lit_bool_null()); - let expr = not_ilike(null, "%"); + let expr = not_ilike(null, lit("a%")); assert_eq!(simplify(expr), lit_bool_null()); + + // expr [NOT] [I]LIKE with pattern without wildcards + let expr = like(col("c1"), lit("a")); + assert_eq!(simplify(expr), col("c1").eq(lit("a"))); + let expr = not_like(col("c1"), lit("a")); + assert_eq!(simplify(expr), col("c1").not_eq(lit("a"))); + let expr = like(col("c1"), lit("a_")); + assert_eq!(simplify(expr), col("c1").like(lit("a_"))); + let expr = not_like(col("c1"), lit("a_")); + assert_eq!(simplify(expr), col("c1").not_like(lit("a_"))); } #[test] @@ -4034,6 +4192,7 @@ mod tests { Ok(DataType::Int16) } } + #[test] fn test_optimize_volatile_conditions() { let fun = Arc::new(ScalarUDF::new_from_impl(VolatileUdf::new())); diff --git a/datafusion/sqllogictest/test_files/string/init_data.slt.part b/datafusion/sqllogictest/test_files/string/init_data.slt.part index e1248c73da5f..9cdeff1977ee 100644 --- a/datafusion/sqllogictest/test_files/string/init_data.slt.part +++ b/datafusion/sqllogictest/test_files/string/init_data.slt.part @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the 
License. +# TODO (https://github.com/apache/datafusion/issues/12637): add a row with '%%' pattern statement ok create table test_source as values ('Andrew', 'X', 'datafusion📊🔥', '🔥'), diff --git a/datafusion/sqllogictest/test_files/string/string_query.slt.part b/datafusion/sqllogictest/test_files/string/string_query.slt.part index 9536cfee9359..f781b9dc33ca 100644 --- a/datafusion/sqllogictest/test_files/string/string_query.slt.part +++ b/datafusion/sqllogictest/test_files/string/string_query.slt.part @@ -953,45 +953,43 @@ NULL NULL NULL NULL NULL #Raphael datafusionДатаФусион false false false false #NULL NULL NULL NULL NULL NULL -# TODO (https://github.com/apache/datafusion/issues/12637) uncomment additional test projections -query TTBB +query TTBBBB SELECT ascii_1, unicode_1, ascii_1 LIKE '%' AS ascii_1_like_percent, - unicode_1 LIKE '%' AS unicode_1_like_percent - -- ascii_1 LIKE '%%' AS ascii_1_like_percent_percent, -- TODO enable after fixing https://github.com/apache/datafusion/issues/12637 - -- unicode_1 LIKE '%%' AS unicode_1_like_percent_percent -- TODO enable after fixing https://github.com/apache/datafusion/issues/12637 + unicode_1 LIKE '%' AS unicode_1_like_percent, + ascii_1 LIKE '%%' AS ascii_1_like_percent_percent, + unicode_1 LIKE '%%' AS unicode_1_like_percent_percent FROM test_basic_operator ---- -Andrew datafusion📊🔥 true true -Xiangpeng datafusion数据融合 true true -Raphael datafusionДатаФусион true true -under_score un iść core true true -percent pan Tadeusz ma iść w kąt true true -(empty) (empty) true true -% (empty) true true -_ (empty) true true -NULL NULL NULL NULL -NULL NULL NULL NULL +Andrew datafusion📊🔥 true true true true +Xiangpeng datafusion数据融合 true true true true +Raphael datafusionДатаФусион true true true true +under_score un iść core true true true true +percent pan Tadeusz ma iść w kąt true true true true +(empty) (empty) true true true true +% (empty) true true true true +_ (empty) true true true true +NULL NULL NULL NULL NULL NULL 
+NULL NULL NULL NULL NULL NULL -# TODO (https://github.com/apache/datafusion/issues/12637) uncomment additional test projections -query TTBB +query TTBBBB SELECT ascii_1, unicode_1, ascii_1 NOT LIKE '%' AS ascii_1_not_like_percent, - unicode_1 NOT LIKE '%' AS unicode_1_not_like_percent - -- ascii_1 NOT LIKE '%%' AS ascii_1_not_like_percent_percent, -- TODO enable after fixing https://github.com/apache/datafusion/issues/12637 - -- unicode_1 NOT LIKE '%%' AS unicode_1_not_like_percent_percent -- TODO enable after fixing https://github.com/apache/datafusion/issues/12637 + unicode_1 NOT LIKE '%' AS unicode_1_not_like_percent, + ascii_1 NOT LIKE '%%' AS ascii_1_not_like_percent_percent, + unicode_1 NOT LIKE '%%' AS unicode_1_not_like_percent_percent FROM test_basic_operator ---- -Andrew datafusion📊🔥 false false -Xiangpeng datafusion数据融合 false false -Raphael datafusionДатаФусион false false -under_score un iść core false false -percent pan Tadeusz ma iść w kąt false false -(empty) (empty) false false -% (empty) false false -_ (empty) false false -NULL NULL NULL NULL -NULL NULL NULL NULL +Andrew datafusion📊🔥 false false false false +Xiangpeng datafusion数据融合 false false false false +Raphael datafusionДатаФусион false false false false +under_score un iść core false false false false +percent pan Tadeusz ma iść w kąt false false false false +(empty) (empty) false false false false +% (empty) false false false false +_ (empty) false false false false +NULL NULL NULL NULL NULL NULL +NULL NULL NULL NULL NULL NULL query T SELECT ascii_1 FROM test_basic_operator WHERE ascii_1 LIKE '%' diff --git a/datafusion/sqllogictest/test_files/string/string_view.slt b/datafusion/sqllogictest/test_files/string/string_view.slt index 43b08cb25f3f..dec5488d7319 100644 --- a/datafusion/sqllogictest/test_files/string/string_view.slt +++ b/datafusion/sqllogictest/test_files/string/string_view.slt @@ -391,12 +391,12 @@ drop table test_lowercase ## Ensure no casts for LIKE/ILIKE query TT EXPLAIN 
SELECT - column1_utf8view like 'foo' as "like", - column1_utf8view ilike 'foo' as "ilike" + column1_utf8view like '%foo%' as "like", + column1_utf8view ilike '%foo%' as "ilike" FROM test; ---- logical_plan -01)Projection: test.column1_utf8view LIKE Utf8View("foo") AS like, test.column1_utf8view ILIKE Utf8View("foo") AS ilike +01)Projection: test.column1_utf8view LIKE Utf8View("%foo%") AS like, test.column1_utf8view ILIKE Utf8View("%foo%") AS ilike 02)--TableScan: test projection=[column1_utf8view]