From c79156f082ca223eda77bd314eba75051064e525 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Sat, 27 Apr 2024 23:04:11 +0800 Subject: [PATCH 01/37] remove casting for coalesce Signed-off-by: jayzhan211 --- datafusion/expr/src/signature.rs | 24 +- .../expr/src/type_coercion/functions.rs | 51 +++-- datafusion/functions/src/core/coalesce.rs | 11 +- datafusion/sqllogictest/test_files/expr.slt | 2 +- .../sqllogictest/test_files/functions.slt | 215 ++++++++++++++++++ datafusion/sqllogictest/test_files/joins.slt | 4 +- datafusion/sqllogictest/test_files/scalar.slt | 159 ------------- datafusion/sqllogictest/test_files/select.slt | 14 +- 8 files changed, 285 insertions(+), 195 deletions(-) diff --git a/datafusion/expr/src/signature.rs b/datafusion/expr/src/signature.rs index e2505d6fd65f..a877979ead2c 100644 --- a/datafusion/expr/src/signature.rs +++ b/datafusion/expr/src/signature.rs @@ -92,7 +92,7 @@ pub enum TypeSignature { /// A function such as `concat` is `Variadic(vec![DataType::Utf8, DataType::LargeUtf8])` Variadic(Vec), /// One or more arguments of an arbitrary but equal type. - /// DataFusion attempts to coerce all argument types to match the first argument's type + /// DataFusion attempts to coerce all argument types to match to the common type with comparision coercion. /// /// # Examples /// Given types in signature should be coercible to the same final type. @@ -100,6 +100,11 @@ pub enum TypeSignature { /// /// `make_array(i32, i64) -> make_array(i64, i64)` VariadicEqual, + /// One or more arguments of an arbitrary but equal type or Null. + /// No coercion is attempted. + /// + /// Functions like `coalesce` is `VariadicEqual`. + VariadicEqualOrNull, /// One or more arguments with arbitrary types VariadicAny, /// Fixed number of arguments of an arbitrary but equal type out of a list of valid types. @@ -193,6 +198,9 @@ impl TypeSignature { TypeSignature::VariadicEqual => { vec!["CoercibleT, .., CoercibleT".to_string()] } + TypeSignature::VariadicEqualOrNull => { + vec!["T or Null, .., T or Null".to_string()] + } TypeSignature::VariadicAny => vec!["Any, .., Any".to_string()], TypeSignature::OneOf(sigs) => { sigs.iter().flat_map(|s| s.to_string_repr()).collect() @@ -226,6 +234,11 @@ impl TypeSignature { _ => false, } } + + /// Skip coercion to match the signature. + pub fn skip_coercion(&self) -> bool { + matches!(self, TypeSignature::VariadicEqualOrNull) + } } /// Defines the supported argument types ([`TypeSignature`]) and [`Volatility`] for a function. @@ -255,13 +268,20 @@ impl Signature { volatility, } } - /// An arbitrary number of arguments of the same type. + /// One or more number of arguments to the same type. pub fn variadic_equal(volatility: Volatility) -> Self { Self { type_signature: TypeSignature::VariadicEqual, volatility, } } + /// One or more number of arguments of the same type. + pub fn variadic_equal_or_null(volatility: Volatility) -> Self { + Self { + type_signature: TypeSignature::VariadicEqualOrNull, + volatility, + } + } /// An arbitrary number of arguments of any type. pub fn variadic_any(volatility: Volatility) -> Self { Self { diff --git a/datafusion/expr/src/type_coercion/functions.rs b/datafusion/expr/src/type_coercion/functions.rs index eb4f325ff818..3184753ff581 100644 --- a/datafusion/expr/src/type_coercion/functions.rs +++ b/datafusion/expr/src/type_coercion/functions.rs @@ -28,7 +28,7 @@ use arrow::{ use datafusion_common::utils::{coerced_fixed_size_list_to_list, list_ndims}; use datafusion_common::{internal_datafusion_err, internal_err, plan_err, Result}; -use super::binary::{comparison_binary_numeric_coercion, comparison_coercion}; +use super::binary::comparison_coercion; /// Performs type coercion for function arguments. /// @@ -62,11 +62,13 @@ pub fn data_types( return Ok(current_types.to_vec()); } - // Try and coerce the argument types to match the signature, returning the - // coerced types from the first matching signature. - for valid_types in valid_types { - if let Some(types) = maybe_data_types(&valid_types, current_types) { - return Ok(types); + if !signature.type_signature.skip_coercion() { + // Try and coerce the argument types to match the signature, returning the + // coerced types from the first matching signature. + for valid_types in valid_types { + if let Some(types) = maybe_data_types(&valid_types, current_types) { + return Ok(types); + } } } @@ -184,6 +186,29 @@ fn get_valid_types( .iter() .map(|valid_type| (0..*number).map(|_| valid_type.clone()).collect()) .collect(), + TypeSignature::VariadicEqualOrNull => { + current_types + .iter() + .find(|&t| t != &DataType::Null) + .map_or_else( + || vec![vec![DataType::Null; current_types.len()]], + |t| { + let valid_types = current_types + .iter() + .map(|d| { + if d != &DataType::Null { + t.clone() + } else { + DataType::Null + } + }) + .collect::>(); + vec![valid_types] + }, + ) + + // vec![vec![current_types.first().unwrap().clone(); current_types.len()]] + } TypeSignature::VariadicEqual => { let new_type = current_types.iter().skip(1).try_fold( current_types.first().unwrap().clone(), @@ -450,19 +475,7 @@ fn coerced_from<'a>( { Some(type_into.clone()) } - // More coerce rules. - // Note that not all rules in `comparison_coercion` can be reused here. - // For example, all numeric types can be coerced into Utf8 for comparison, - // but not for function arguments. - _ => comparison_binary_numeric_coercion(type_into, type_from).and_then( - |coerced_type| { - if *type_into == coerced_type { - Some(coerced_type) - } else { - None - } - }, - ), + _ => None, } } diff --git a/datafusion/functions/src/core/coalesce.rs b/datafusion/functions/src/core/coalesce.rs index 76f2a3ed741b..2ef657b6e263 100644 --- a/datafusion/functions/src/core/coalesce.rs +++ b/datafusion/functions/src/core/coalesce.rs @@ -23,7 +23,6 @@ use arrow::compute::{and, is_not_null, is_null}; use arrow::datatypes::DataType; use datafusion_common::{exec_err, Result}; -use datafusion_expr::type_coercion::functions::data_types; use datafusion_expr::ColumnarValue; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; @@ -41,7 +40,7 @@ impl Default for CoalesceFunc { impl CoalesceFunc { pub fn new() -> Self { Self { - signature: Signature::variadic_equal(Volatility::Immutable), + signature: Signature::variadic_equal_or_null(Volatility::Immutable), } } } @@ -60,9 +59,11 @@ impl ScalarUDFImpl for CoalesceFunc { } fn return_type(&self, arg_types: &[DataType]) -> Result { - // COALESCE has multiple args and they might get coerced, get a preview of this - let coerced_types = data_types(arg_types, self.signature()); - coerced_types.map(|types| types[0].clone()) + Ok(arg_types + .iter() + .find(|&t| t != &DataType::Null) + .unwrap_or(&DataType::Null) + .clone()) } /// coalesce evaluates to the first value which is not NULL diff --git a/datafusion/sqllogictest/test_files/expr.slt b/datafusion/sqllogictest/test_files/expr.slt index ff63416b3a10..7e7ebd8529da 100644 --- a/datafusion/sqllogictest/test_files/expr.slt +++ b/datafusion/sqllogictest/test_files/expr.slt @@ -1897,7 +1897,7 @@ SELECT substring('alphabet' for 1); ---- a -# The 'from' and 'for' parameters don't support string types, because they should be treated as +# The 'from' and 'for' parameters don't support string types, because they should be treated as # regular expressions, which we have not implemented yet. query error DataFusion error: Error during planning: No function matches the given name and argument types SELECT substring('alphabet' FROM '3') diff --git a/datafusion/sqllogictest/test_files/functions.slt b/datafusion/sqllogictest/test_files/functions.slt index c8675a5d9c54..bd0ce3e7e8ea 100644 --- a/datafusion/sqllogictest/test_files/functions.slt +++ b/datafusion/sqllogictest/test_files/functions.slt @@ -1159,3 +1159,218 @@ drop table uuid_table statement ok drop table t + +# Test coalesce function +query I +select coalesce(1, 2, 3); +---- +1 + +# test with first null +query I +select coalesce(null, 3, 2, 1); +---- +3 + +# test with null values +query ? +select coalesce(null, null); +---- +NULL + +# test with different types +query error +select coalesce(arrow_cast(1, 'Int32'), arrow_cast(1, 'Int64')); +---- +DataFusion error: Error during planning: No function matches the given name and argument types 'coalesce(Int32, Int64)'. You might need to add explicit type casts. + Candidate functions: + coalesce(T or Null, .., T or Null) + + +# first non-null value is int32, but there is mixed types with value null, this should be error too. +# expect `skip_coercion` works +statement ok +create table t1 (a bigint, b int, c int) as values (null, null, 1), (null, 2, null); + +query error +select coalesce(a, b, c) from t1; +---- +DataFusion error: Error during planning: No function matches the given name and argument types 'coalesce(Int64, Int32, Int32)'. You might need to add explicit type casts. + Candidate functions: + coalesce(T or Null, .., T or Null) + + +statement ok +drop table t1; + + +# coalesce static empty value +query T +SELECT COALESCE('', 'test') +---- +(empty) + +# coalesce static value with null +query T +SELECT COALESCE(NULL, 'test') +---- +test + + +statement ok +create table test1 as values (arrow_cast('foo', 'Dictionary(Int32, Utf8)')), (null); + +# test coercion string +query error +select coalesce(column1, 'none_set') from test1; +---- +DataFusion error: Error during planning: No function matches the given name and argument types 'coalesce(Dictionary(Int32, Utf8), Utf8)'. You might need to add explicit type casts. + Candidate functions: + coalesce(T or Null, .., T or Null) + + +# test coercion Int +query error +select coalesce(34, arrow_cast(123, 'Dictionary(Int32, Int8)')); +---- +DataFusion error: Error during planning: No function matches the given name and argument types 'coalesce(Int64, Dictionary(Int32, Int8))'. You might need to add explicit type casts. + Candidate functions: + coalesce(T or Null, .., T or Null) + + +# test with Int +query error +select coalesce(arrow_cast(123, 'Dictionary(Int32, Int8)'), 34); +---- +DataFusion error: Error during planning: No function matches the given name and argument types 'coalesce(Dictionary(Int32, Int8), Int64)'. You might need to add explicit type casts. + Candidate functions: + coalesce(T or Null, .., T or Null) + + +# test with null +query error +select coalesce(null, 34, arrow_cast(123, 'Dictionary(Int32, Int8)')); +---- +DataFusion error: Error during planning: No function matches the given name and argument types 'coalesce(Null, Int64, Dictionary(Int32, Int8))'. You might need to add explicit type casts. + Candidate functions: + coalesce(T or Null, .., T or Null) + + +# test with null +query error +select coalesce(null, column1, 'none_set') from test1; +---- +DataFusion error: Error during planning: No function matches the given name and argument types 'coalesce(Null, Dictionary(Int32, Utf8), Utf8)'. You might need to add explicit type casts. + Candidate functions: + coalesce(T or Null, .., T or Null) + + +statement ok +drop table test1 + + +statement ok +CREATE TABLE test( + c1 INT, + c2 INT +) as VALUES +(0, 1), +(NULL, 1), +(1, 0), +(NULL, 1), +(NULL, NULL); + +# coalesce result +query I rowsort +SELECT COALESCE(c1, c2) FROM test +---- +0 +1 +1 +1 +NULL + +# coalesce result with default value +query error +SELECT COALESCE(c1, c2, '-1') FROM test +---- +DataFusion error: Error during planning: No function matches the given name and argument types 'coalesce(Int32, Int32, Utf8)'. You might need to add explicit type casts. + Candidate functions: + coalesce(T or Null, .., T or Null) + + +statement ok +drop table test + +statement ok +CREATE TABLE test( + c1 BIGINT, + c2 BIGINT, +) as VALUES +(1, 2), +(NULL, 2), +(1, NULL), +(NULL, NULL); + +# coalesce sum with default value +query I +SELECT SUM(COALESCE(c1, c2, 0)) FROM test +---- +4 + +# coalesce mul with default value +query I +SELECT COALESCE(c1 * c2, 0) FROM test +---- +2 +0 +0 +0 + +statement ok +drop table test + +# coalesce date32 + +statement ok +CREATE TABLE test( + d1_date DATE, + d2_date DATE, + d3_date DATE +) as VALUES + ('2022-12-12','2022-12-12','2022-12-12'), + (NULL,'2022-12-11','2022-12-12'), + ('2022-12-12','2022-12-10','2022-12-12'), + ('2022-12-12',NULL,'2022-12-12'), + ('2022-12-12','2022-12-8','2022-12-12'), + ('2022-12-12','2022-12-7',NULL), + ('2022-12-12',NULL,'2022-12-12'), + (NULL,'2022-12-5','2022-12-12') +; + +query D +SELECT COALESCE(d1_date, d2_date, d3_date) FROM test +---- +2022-12-12 +2022-12-11 +2022-12-12 +2022-12-12 +2022-12-12 +2022-12-12 +2022-12-12 +2022-12-05 + +query T +SELECT arrow_typeof(COALESCE(d1_date, d2_date, d3_date)) FROM test +---- +Date32 +Date32 +Date32 +Date32 +Date32 +Date32 +Date32 +Date32 + +statement ok +drop table test diff --git a/datafusion/sqllogictest/test_files/joins.slt b/datafusion/sqllogictest/test_files/joins.slt index 5ef33307b521..1d6effb53fdb 100644 --- a/datafusion/sqllogictest/test_files/joins.slt +++ b/datafusion/sqllogictest/test_files/joins.slt @@ -1778,7 +1778,7 @@ AS VALUES ('BB', 6, 1); query TII -select col1, col2, coalesce(sum_col3, 0) as sum_col3 +select col1, col2, coalesce(sum_col3, arrow_cast(0, 'UInt64')) as sum_col3 from (select distinct col2 from tbl) AS q1 cross join (select distinct col1 from tbl) AS q2 left outer join (SELECT col1, col2, sum(col3) as sum_col3 FROM tbl GROUP BY col1, col2) AS q3 @@ -2010,7 +2010,7 @@ set datafusion.explain.logical_plan_only = false; statement ok set datafusion.execution.target_partitions = 4; -# Planning inner nested loop join +# Planning inner nested loop join # inputs are swapped due to inexact statistics + join reordering caused additional projection query TT diff --git a/datafusion/sqllogictest/test_files/scalar.slt b/datafusion/sqllogictest/test_files/scalar.slt index 0c3fca446526..7fb2d55ff84a 100644 --- a/datafusion/sqllogictest/test_files/scalar.slt +++ b/datafusion/sqllogictest/test_files/scalar.slt @@ -1767,165 +1767,6 @@ SELECT make_array(1, 2, 3); ---- [1, 2, 3] -# coalesce static empty value -query T -SELECT COALESCE('', 'test') ----- -(empty) - -# coalesce static value with null -query T -SELECT COALESCE(NULL, 'test') ----- -test - - -statement ok -create table test1 as values (arrow_cast('foo', 'Dictionary(Int32, Utf8)')), (null); - -# test coercion string -query ? -select coalesce(column1, 'none_set') from test1; ----- -foo -none_set - -# test coercion Int -query I -select coalesce(34, arrow_cast(123, 'Dictionary(Int32, Int8)')); ----- -34 - -# test with Int -query I -select coalesce(arrow_cast(123, 'Dictionary(Int32, Int8)'),34); ----- -123 - -# test with null -query I -select coalesce(null, 34, arrow_cast(123, 'Dictionary(Int32, Int8)')); ----- -34 - -# test with null -query T -select coalesce(null, column1, 'none_set') from test1; ----- -foo -none_set - -statement ok -drop table test1 - - -statement ok -CREATE TABLE test( - c1 INT, - c2 INT -) as VALUES -(0, 1), -(NULL, 1), -(1, 0), -(NULL, 1), -(NULL, NULL); - -# coalesce result -query I rowsort -SELECT COALESCE(c1, c2) FROM test ----- -0 -1 -1 -1 -NULL - -# coalesce result with default value -query T rowsort -SELECT COALESCE(c1, c2, '-1') FROM test ----- --1 -0 -1 -1 -1 - -statement ok -drop table test - -statement ok -CREATE TABLE test( - c1 INT, - c2 INT -) as VALUES -(1, 2), -(NULL, 2), -(1, NULL), -(NULL, NULL); - -# coalesce sum with default value -query I -SELECT SUM(COALESCE(c1, c2, 0)) FROM test ----- -4 - -# coalesce mul with default value -query I -SELECT COALESCE(c1 * c2, 0) FROM test ----- -2 -0 -0 -0 - -statement ok -drop table test - -# coalesce date32 - -statement ok -CREATE TABLE test( - d1_date DATE, - d2_date DATE, - d3_date DATE -) as VALUES - ('2022-12-12','2022-12-12','2022-12-12'), - (NULL,'2022-12-11','2022-12-12'), - ('2022-12-12','2022-12-10','2022-12-12'), - ('2022-12-12',NULL,'2022-12-12'), - ('2022-12-12','2022-12-8','2022-12-12'), - ('2022-12-12','2022-12-7',NULL), - ('2022-12-12',NULL,'2022-12-12'), - (NULL,'2022-12-5','2022-12-12') -; - -query D -SELECT COALESCE(d1_date, d2_date, d3_date) FROM test ----- -2022-12-12 -2022-12-11 -2022-12-12 -2022-12-12 -2022-12-12 -2022-12-12 -2022-12-12 -2022-12-05 - -query T -SELECT arrow_typeof(COALESCE(d1_date, d2_date, d3_date)) FROM test ----- -Date32 -Date32 -Date32 -Date32 -Date32 -Date32 -Date32 -Date32 - -statement ok -drop table test - statement ok CREATE TABLE test( i32 INT, diff --git a/datafusion/sqllogictest/test_files/select.slt b/datafusion/sqllogictest/test_files/select.slt index a3a4b3bfc584..003cdcaf8bd2 100644 --- a/datafusion/sqllogictest/test_files/select.slt +++ b/datafusion/sqllogictest/test_files/select.slt @@ -1463,7 +1463,7 @@ DROP TABLE t; # related to https://github.com/apache/datafusion/issues/8814 statement ok -create table t(x int, y int) as values (1,1), (2,2), (3,3), (0,0), (4,0); +create table t(x bigint, y bigint) as values (1,1), (2,2), (3,3), (0,0), (4,0); query II SELECT @@ -1484,30 +1484,30 @@ query TT explain select coalesce(1, y/x), coalesce(2, y/x) from t; ---- logical_plan -01)Projection: coalesce(Int64(1), CAST(t.y / t.x AS Int64)), coalesce(Int64(2), CAST(t.y / t.x AS Int64)) +01)Projection: coalesce(Int64(1), t.y / t.x), coalesce(Int64(2), t.y / t.x) 02)--TableScan: t projection=[x, y] physical_plan -01)ProjectionExec: expr=[coalesce(1, CAST(y@1 / x@0 AS Int64)) as coalesce(Int64(1),t.y / t.x), coalesce(2, CAST(y@1 / x@0 AS Int64)) as coalesce(Int64(2),t.y / t.x)] +01)ProjectionExec: expr=[coalesce(1, y@1 / x@0) as coalesce(Int64(1),t.y / t.x), coalesce(2, y@1 / x@0) as coalesce(Int64(2),t.y / t.x)] 02)--MemoryExec: partitions=1, partition_sizes=[1] query TT EXPLAIN SELECT y > 0 and 1 / y < 1, x > 0 and y > 0 and 1 / y < 1 / x from t; ---- logical_plan -01)Projection: t.y > Int32(0) AND Int64(1) / CAST(t.y AS Int64) < Int64(1) AS t.y > Int64(0) AND Int64(1) / t.y < Int64(1), t.x > Int32(0) AND t.y > Int32(0) AND Int64(1) / CAST(t.y AS Int64) < Int64(1) / CAST(t.x AS Int64) AS t.x > Int64(0) AND t.y > Int64(0) AND Int64(1) / t.y < Int64(1) / t.x +01)Projection: t.y > Int64(0) AND Int64(1) / t.y < Int64(1), t.x > Int64(0) AND t.y > Int64(0) AND Int64(1) / t.y < Int64(1) / t.x 02)--TableScan: t projection=[x, y] physical_plan -01)ProjectionExec: expr=[y@1 > 0 AND 1 / CAST(y@1 AS Int64) < 1 as t.y > Int64(0) AND Int64(1) / t.y < Int64(1), x@0 > 0 AND y@1 > 0 AND 1 / CAST(y@1 AS Int64) < 1 / CAST(x@0 AS Int64) as t.x > Int64(0) AND t.y > Int64(0) AND Int64(1) / t.y < Int64(1) / t.x] +01)ProjectionExec: expr=[y@1 > 0 AND 1 / y@1 < 1 as t.y > Int64(0) AND Int64(1) / t.y < Int64(1), x@0 > 0 AND y@1 > 0 AND 1 / y@1 < 1 / x@0 as t.x > Int64(0) AND t.y > Int64(0) AND Int64(1) / t.y < Int64(1) / t.x] 02)--MemoryExec: partitions=1, partition_sizes=[1] query TT EXPLAIN SELECT y = 0 or 1 / y < 1, x = 0 or y = 0 or 1 / y < 1 / x from t; ---- logical_plan -01)Projection: t.y = Int32(0) OR Int64(1) / CAST(t.y AS Int64) < Int64(1) AS t.y = Int64(0) OR Int64(1) / t.y < Int64(1), t.x = Int32(0) OR t.y = Int32(0) OR Int64(1) / CAST(t.y AS Int64) < Int64(1) / CAST(t.x AS Int64) AS t.x = Int64(0) OR t.y = Int64(0) OR Int64(1) / t.y < Int64(1) / t.x +01)Projection: t.y = Int64(0) OR Int64(1) / t.y < Int64(1), t.x = Int64(0) OR t.y = Int64(0) OR Int64(1) / t.y < Int64(1) / t.x 02)--TableScan: t projection=[x, y] physical_plan -01)ProjectionExec: expr=[y@1 = 0 OR 1 / CAST(y@1 AS Int64) < 1 as t.y = Int64(0) OR Int64(1) / t.y < Int64(1), x@0 = 0 OR y@1 = 0 OR 1 / CAST(y@1 AS Int64) < 1 / CAST(x@0 AS Int64) as t.x = Int64(0) OR t.y = Int64(0) OR Int64(1) / t.y < Int64(1) / t.x] +01)ProjectionExec: expr=[y@1 = 0 OR 1 / y@1 < 1 as t.y = Int64(0) OR Int64(1) / t.y < Int64(1), x@0 = 0 OR y@1 = 0 OR 1 / y@1 < 1 / x@0 as t.x = Int64(0) OR t.y = Int64(0) OR Int64(1) / t.y < Int64(1) / t.x] 02)--MemoryExec: partitions=1, partition_sizes=[1] # due to the reason describe in https://github.com/apache/datafusion/issues/8927, From a36e6b28ec877f4f570573070d6c92c1cc09a621 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Sat, 27 Apr 2024 23:07:37 +0800 Subject: [PATCH 02/37] add more test Signed-off-by: jayzhan211 --- .../sqllogictest/test_files/functions.slt | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/datafusion/sqllogictest/test_files/functions.slt b/datafusion/sqllogictest/test_files/functions.slt index bd0ce3e7e8ea..9830b4e19380 100644 --- a/datafusion/sqllogictest/test_files/functions.slt +++ b/datafusion/sqllogictest/test_files/functions.slt @@ -1203,6 +1203,28 @@ DataFusion error: Error during planning: No function matches the given name and statement ok drop table t1; +# test multi rows +statement ok +CREATE TABLE t1( + c1 int, + c2 int +) as VALUES +(1, 2), +(NULL, 2), +(1, NULL), +(NULL, NULL); + +query I +SELECT COALESCE(c1, c2) FROM t1 +---- +1 +2 +1 +NULL + +statement ok +drop table t1; + # coalesce static empty value query T From bf16c9240e2b1f541131619a756ce5b03745a6c2 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Sun, 28 Apr 2024 07:29:29 +0800 Subject: [PATCH 03/37] add more test Signed-off-by: jayzhan211 --- datafusion/sqllogictest/test_files/functions.slt | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/datafusion/sqllogictest/test_files/functions.slt b/datafusion/sqllogictest/test_files/functions.slt index 9830b4e19380..993cbf90229a 100644 --- a/datafusion/sqllogictest/test_files/functions.slt +++ b/datafusion/sqllogictest/test_files/functions.slt @@ -1200,6 +1200,13 @@ DataFusion error: Error during planning: No function matches the given name and coalesce(T or Null, .., T or Null) +query I +select coalesce(b, c) from t1; +---- +1 +2 + + statement ok drop table t1; From 407e3c7a341a2ce64ada5370e935d7db54b4ba46 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Sun, 28 Apr 2024 07:34:58 +0800 Subject: [PATCH 04/37] crate only visibility Signed-off-by: jayzhan211 --- datafusion/expr/src/signature.rs | 2 +- datafusion/expr/src/type_coercion/functions.rs | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/datafusion/expr/src/signature.rs b/datafusion/expr/src/signature.rs index a877979ead2c..5df2625a3aea 100644 --- a/datafusion/expr/src/signature.rs +++ b/datafusion/expr/src/signature.rs @@ -236,7 +236,7 @@ impl TypeSignature { } /// Skip coercion to match the signature. - pub fn skip_coercion(&self) -> bool { + pub(crate) fn skip_coercion(&self) -> bool { matches!(self, TypeSignature::VariadicEqualOrNull) } } diff --git a/datafusion/expr/src/type_coercion/functions.rs b/datafusion/expr/src/type_coercion/functions.rs index 3184753ff581..d0443703d3d4 100644 --- a/datafusion/expr/src/type_coercion/functions.rs +++ b/datafusion/expr/src/type_coercion/functions.rs @@ -206,8 +206,6 @@ fn get_valid_types( vec![valid_types] }, ) - - // vec![vec![current_types.first().unwrap().clone(); current_types.len()]] } TypeSignature::VariadicEqual => { let new_type = current_types.iter().skip(1).try_fold( From 03b916243466be9e09f849cfc9b65651a77a168d Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Sun, 28 Apr 2024 07:40:09 +0800 Subject: [PATCH 05/37] polish comment Signed-off-by: jayzhan211 --- datafusion/sqllogictest/test_files/functions.slt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/sqllogictest/test_files/functions.slt b/datafusion/sqllogictest/test_files/functions.slt index 993cbf90229a..61f75937d80e 100644 --- a/datafusion/sqllogictest/test_files/functions.slt +++ b/datafusion/sqllogictest/test_files/functions.slt @@ -1187,11 +1187,11 @@ DataFusion error: Error during planning: No function matches the given name and coalesce(T or Null, .., T or Null) -# first non-null value is int32, but there is mixed types with value null, this should be error too. -# expect `skip_coercion` works statement ok create table t1 (a bigint, b int, c int) as values (null, null, 1), (null, 2, null); +# first non-null value is int32, but there are mixed types (i64, i32, i32), so error is expected +# This test also validate `skip_coercion` works, without skipping, i32 is coerced to i64 and signature error is not thrown query error select coalesce(a, b, c) from t1; ---- From 4abf29d83022dc8ef087f1db11c0ba5a54374352 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Sun, 28 Apr 2024 07:55:52 +0800 Subject: [PATCH 06/37] improve test Signed-off-by: jayzhan211 --- .../sqllogictest/test_files/functions.slt | 23 ++++++++++++++----- 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/datafusion/sqllogictest/test_files/functions.slt b/datafusion/sqllogictest/test_files/functions.slt index 61f75937d80e..9a5cf8772f0f 100644 --- a/datafusion/sqllogictest/test_files/functions.slt +++ b/datafusion/sqllogictest/test_files/functions.slt @@ -1178,6 +1178,15 @@ select coalesce(null, null); ---- NULL +# test with empty args +query error +select coalesce(); +---- +DataFusion error: Error during planning: No function matches the given name and argument types 'coalesce()'. You might need to add explicit type casts. + Candidate functions: + coalesce(T or Null, .., T or Null) + + # test with different types query error select coalesce(arrow_cast(1, 'Int32'), arrow_cast(1, 'Int64')); @@ -1200,12 +1209,15 @@ DataFusion error: Error during planning: No function matches the given name and coalesce(T or Null, .., T or Null) -query I -select coalesce(b, c) from t1; +# b, c has the same type i32 +query IT +select + coalesce(b, c), + arrow_typeof(coalesce(b, c)) +from t1; ---- -1 -2 - +1 Int32 +2 Int32 statement ok drop table t1; @@ -1232,7 +1244,6 @@ NULL statement ok drop table t1; - # coalesce static empty value query T SELECT COALESCE('', 'test') From 4965e8da146d88fbb9d43c4eb3ef32f353f86360 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Sun, 28 Apr 2024 12:17:05 +0800 Subject: [PATCH 07/37] backup Signed-off-by: jayzhan211 --- datafusion/expr/src/type_coercion/binary.rs | 163 ++++++++++++++++-- .../expr/src/type_coercion/functions.rs | 74 +++++--- datafusion/functions/src/core/coalesce.rs | 21 ++- .../physical-expr/src/scalar_function.rs | 1 + datafusion/physical-expr/src/udf.rs | 2 + .../sqllogictest/test_files/functions.slt | 39 +++-- 6 files changed, 248 insertions(+), 52 deletions(-) diff --git a/datafusion/expr/src/type_coercion/binary.rs b/datafusion/expr/src/type_coercion/binary.rs index 7eec606658f4..8582f2e5927c 100644 --- a/datafusion/expr/src/type_coercion/binary.rs +++ b/datafusion/expr/src/type_coercion/binary.rs @@ -30,6 +30,8 @@ use arrow::datatypes::{ use datafusion_common::{exec_datafusion_err, plan_datafusion_err, plan_err, Result}; +use super::functions::coerced_from; + /// The type signature of an instantiation of binary operator expression such as /// `lhs + rhs` /// @@ -289,7 +291,134 @@ fn bitwise_coercion(left_type: &DataType, right_type: &DataType) -> Option TypeCategory { + if data_type.is_numeric() { + return TypeCategory::Numeric; + } + + if matches!(data_type, DataType::Boolean) { + return TypeCategory::Boolean; + } + + if matches!( + data_type, + DataType::List(_) | DataType::FixedSizeList(_, _) | DataType::LargeList(_) + ) { + return TypeCategory::Array; + } + + if matches!(data_type, DataType::Utf8 | DataType::LargeUtf8) { + return TypeCategory::String; + } + + if matches!( + data_type, + DataType::Date32 + | DataType::Date64 + | DataType::Time32(_) + | DataType::Time64(_) + | DataType::Timestamp(_, _) + | DataType::Interval(_) + | DataType::Duration(_) + ) { + return TypeCategory::DateTime; + } + + if matches!( + data_type, + DataType::Dictionary(_, _) | DataType::Struct(_) | DataType::Union(_, _) + ) { + return TypeCategory::Composite; + } + + return TypeCategory::Unknown; +} + +/// Coerce `lhs_type` and `rhs_type` to a common type for the purposes of constructs including +/// CASE, ARRAY, VALUES, and the GREATEST and LEAST functions. +/// See https://www.postgresql.org/docs/current/typeconv-union-case.html for more information. +pub fn type_resolution(data_types: &[DataType]) -> Option { + if data_types.is_empty() { + return None; + } + + // if all the data_types is the same return first one + if data_types.iter().all(|t| t == &data_types[0]) { + return Some(data_types[0].clone()); + } + + // if all the data_types are null, return string + if data_types.iter().all(|t| t == &DataType::Null) { + return Some(DataType::Utf8); + } + + // Ignore Nulls, if any data_type category is not the same, return None + let data_types_category: Vec = data_types + .iter() + .filter(|&t| t != &DataType::Null) + .map(data_type_category) + .collect(); + if data_types_category + .iter() + .any(|t| t != &data_types_category[0]) + { + return None; + } + + // Ignore Nulls + let mut candidate_type: Option = None; + for data_type in data_types.iter() { + if data_type == &DataType::Null { + continue; + } + if let Some(ref candidate_t) = candidate_type { + println!("data_Type: {:?}", data_type); + println!("candidate_type: {:?}", candidate_t); + if let Some(t) = coerced_from(data_type, candidate_t) { + // if let Some(t) = type_resolution_coercion(data_type, &candidate_type) { + candidate_type = Some(t); + } else if coerced_from(candidate_t, data_type).is_some() { + // keep the candidate type + } else { + // Not coercible, return None + return None; + } + } else { + candidate_type = Some(data_type.clone()); + } + } + + println!("candidate_type: {:?}", candidate_type); + + candidate_type +} + +/// Coerce `lhs_type` and `rhs_type` to a common type for the purposes of type resolution +/// Unlike [comparison_coercion], usually the coerced type is more `wider`. +// fn type_resolution_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { +// if lhs_type == rhs_type { +// // same type => all good +// return Some(lhs_type.clone()); +// } + +// // binary numeric is able to reuse +// comparison_binary_numeric_coercion(lhs_type, rhs_type) +// } + /// Coerce `lhs_type` and `rhs_type` to a common type for the purposes of a comparison operation +/// Unlike [type_resolution_coercion], usually the coerced type is for comparison only. +/// For example, compare with Dictionary and Dictionary, only value type is what we care about pub fn comparison_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { if lhs_type == rhs_type { // same type => equality is possible @@ -375,20 +504,13 @@ pub(crate) fn comparison_binary_numeric_coercion( return Some(lhs_type.clone()); } + if let Some(t) = decimal_coercion(lhs_type, rhs_type) { + return Some(t); + } + // these are ordered from most informative to least informative so // that the coercion does not lose information via truncation match (lhs_type, rhs_type) { - // Prefer decimal data type over floating point for comparison operation - (Decimal128(_, _), Decimal128(_, _)) => { - get_wider_decimal_type(lhs_type, rhs_type) - } - (Decimal128(_, _), _) => get_comparison_common_decimal_type(lhs_type, rhs_type), - (_, Decimal128(_, _)) => get_comparison_common_decimal_type(rhs_type, lhs_type), - (Decimal256(_, _), Decimal256(_, _)) => { - get_wider_decimal_type(lhs_type, rhs_type) - } - (Decimal256(_, _), _) => get_comparison_common_decimal_type(lhs_type, rhs_type), - (_, Decimal256(_, _)) => get_comparison_common_decimal_type(rhs_type, lhs_type), (Float64, _) | (_, Float64) => Some(Float64), (_, Float32) | (Float32, _) => Some(Float32), // The following match arms encode the following logic: Given the two @@ -426,6 +548,25 @@ pub(crate) fn comparison_binary_numeric_coercion( } } +pub fn decimal_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { + use arrow::datatypes::DataType::*; + + match (lhs_type, rhs_type) { + // Prefer decimal data type over floating point for comparison operation + (Decimal128(_, _), Decimal128(_, _)) => { + get_wider_decimal_type(lhs_type, rhs_type) + } + (Decimal128(_, _), _) => get_comparison_common_decimal_type(lhs_type, rhs_type), + (_, Decimal128(_, _)) => get_comparison_common_decimal_type(rhs_type, lhs_type), + (Decimal256(_, _), Decimal256(_, _)) => { + get_wider_decimal_type(lhs_type, rhs_type) + } + (Decimal256(_, _), _) => get_comparison_common_decimal_type(lhs_type, rhs_type), + (_, Decimal256(_, _)) => get_comparison_common_decimal_type(rhs_type, lhs_type), + (_, _) => None, + } +} + /// Coerce `lhs_type` and `rhs_type` to a common type for the purposes of /// a comparison operation where one is a decimal fn get_comparison_common_decimal_type( diff --git a/datafusion/expr/src/type_coercion/functions.rs b/datafusion/expr/src/type_coercion/functions.rs index d0443703d3d4..6eb8214bc3f1 100644 --- a/datafusion/expr/src/type_coercion/functions.rs +++ b/datafusion/expr/src/type_coercion/functions.rs @@ -20,6 +20,7 @@ use std::sync::Arc; use crate::signature::{ ArrayFunctionSignature, FIXED_SIZE_LIST_WILDCARD, TIMEZONE_WILDCARD, }; +use crate::type_coercion::binary::{decimal_coercion, type_resolution}; use crate::{Signature, TypeSignature}; use arrow::{ compute::can_cast_types, @@ -53,8 +54,12 @@ pub fn data_types( } } + println!("current_types: {:?}", current_types); + println!("type_signature: {:?}", &signature.type_signature); let valid_types = get_valid_types(&signature.type_signature, current_types)?; + println!("valid_types: {:?}", valid_types); + if valid_types .iter() .any(|data_type| data_type == current_types) @@ -62,7 +67,9 @@ pub fn data_types( return Ok(current_types.to_vec()); } - if !signature.type_signature.skip_coercion() { + println!("valid_types: {:?}", valid_types); + + if true { // Try and coerce the argument types to match the signature, returning the // coerced types from the first matching signature. for valid_types in valid_types { @@ -187,25 +194,31 @@ fn get_valid_types( .map(|valid_type| (0..*number).map(|_| valid_type.clone()).collect()) .collect(), TypeSignature::VariadicEqualOrNull => { - current_types - .iter() - .find(|&t| t != &DataType::Null) - .map_or_else( - || vec![vec![DataType::Null; current_types.len()]], - |t| { - let valid_types = current_types - .iter() - .map(|d| { - if d != &DataType::Null { - t.clone() - } else { - DataType::Null - } - }) - .collect::>(); - vec![valid_types] - }, - ) + if let Some(common_type) = type_resolution(current_types) { + vec![vec![common_type; current_types.len()]] + } else { + vec![] + } + + // current_types + // .iter() + // .find(|&t| t != &DataType::Null) + // .map_or_else( + // || vec![vec![DataType::Null; current_types.len()]], + // |t| { + // let valid_types = current_types + // .iter() + // .map(|d| { + // if d != &DataType::Null { + // t.clone() + // } else { + // DataType::Null + // } + // }) + // .collect::>(); + // vec![valid_types] + // }, + // ) } TypeSignature::VariadicEqual => { let new_type = current_types.iter().skip(1).try_fold( @@ -330,11 +343,12 @@ pub fn can_coerce_from(type_into: &DataType, type_from: &DataType) -> bool { false } -fn coerced_from<'a>( +pub(crate) fn coerced_from<'a>( type_into: &'a DataType, type_from: &'a DataType, ) -> Option { use self::DataType::*; + // match Dictionary first match (type_into, type_from) { // coerced dictionary first @@ -348,6 +362,18 @@ fn coerced_from<'a>( { Some(type_into.clone()) } + // coerce decimal + // (Decimal128(_, _) | Decimal256(_, _), Null) => { + // Some(type_into.clone()) + // } + (Decimal128(_, _), Decimal128(_, _)) | (Decimal256(_, _), Decimal256(_, _)) => { + decimal_coercion(type_into, type_from) + } + (Decimal128(_, _) | Decimal256(_, _), _) + if matches!(type_from, Int8 | Int16 | Int32 | Int64) => + { + decimal_coercion(type_into, type_from) + } // coerced into type_into (Int8, _) if matches!(type_from, Null | Int8) => Some(type_into.clone()), (Int16, _) if matches!(type_from, Null | Int8 | Int16 | UInt8) => { @@ -473,6 +499,12 @@ fn coerced_from<'a>( { Some(type_into.clone()) } + // (Decimal128(_, _), _) if matches!(type_from, Null | Int8 | Int16 | Int32 | Int64) => { + // Some(type_into.clone()) + // } + // (Decimal256(_, _), _) if matches!(type_from, Null | Int8 | Int16 | Int32 | Int64) => { + // Some(type_into.clone()) + // } _ => None, } } diff --git a/datafusion/functions/src/core/coalesce.rs b/datafusion/functions/src/core/coalesce.rs index 2ef657b6e263..b24c708e8988 100644 --- a/datafusion/functions/src/core/coalesce.rs +++ b/datafusion/functions/src/core/coalesce.rs @@ -22,7 +22,8 @@ use arrow::compute::kernels::zip::zip; use arrow::compute::{and, is_not_null, is_null}; use arrow::datatypes::DataType; -use datafusion_common::{exec_err, Result}; +use datafusion_common::{exec_err, internal_err, Result}; +use datafusion_expr::type_coercion::binary::type_resolution; use datafusion_expr::ColumnarValue; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; @@ -59,11 +60,19 @@ impl ScalarUDFImpl for CoalesceFunc { } fn return_type(&self, arg_types: &[DataType]) -> Result { - Ok(arg_types - .iter() - .find(|&t| t != &DataType::Null) - .unwrap_or(&DataType::Null) - .clone()) + if let Some(common_type) = type_resolution(arg_types) { + println!("args: {:?}", arg_types); + println!("common_type: {:?}", common_type); + return Ok(common_type); + } else { + return internal_err!("Error should be thrown via signature validation"); + } + + // Ok(arg_types + // .iter() + // .find(|&t| t != &DataType::Null) + // .unwrap_or(&DataType::Null) + // .clone()) } /// coalesce evaluates to the first value which is not NULL diff --git a/datafusion/physical-expr/src/scalar_function.rs b/datafusion/physical-expr/src/scalar_function.rs index 3b360fc20c39..789d7d07ca30 100644 --- a/datafusion/physical-expr/src/scalar_function.rs +++ b/datafusion/physical-expr/src/scalar_function.rs @@ -129,6 +129,7 @@ impl PhysicalExpr for ScalarFunctionExpr { } fn data_type(&self, _input_schema: &Schema) -> Result { + println!("self.return_type: {:?}", self.return_type); Ok(self.return_type.clone()) } diff --git a/datafusion/physical-expr/src/udf.rs b/datafusion/physical-expr/src/udf.rs index aad78b7c2f90..54281412d27d 100644 --- a/datafusion/physical-expr/src/udf.rs +++ b/datafusion/physical-expr/src/udf.rs @@ -50,6 +50,8 @@ pub fn create_physical_expr( let return_type = fun.return_type_from_exprs(args, input_dfschema, &input_expr_types)?; + println!("return_type: {:?}", return_type); + println!("input_expr_types: {:?}", input_expr_types); let fun_def = ScalarFunctionDefinition::UDF(Arc::new(fun.clone())); Ok(Arc::new(ScalarFunctionExpr::new( fun.name(), diff --git a/datafusion/sqllogictest/test_files/functions.slt b/datafusion/sqllogictest/test_files/functions.slt index 9a5cf8772f0f..7c96e8c91712 100644 --- a/datafusion/sqllogictest/test_files/functions.slt +++ b/datafusion/sqllogictest/test_files/functions.slt @@ -1188,28 +1188,25 @@ DataFusion error: Error during planning: No function matches the given name and # test with different types -query error +query I select coalesce(arrow_cast(1, 'Int32'), arrow_cast(1, 'Int64')); ---- -DataFusion error: Error during planning: No function matches the given name and argument types 'coalesce(Int32, Int64)'. You might need to add explicit type casts. - Candidate functions: - coalesce(T or Null, .., T or Null) - +1 statement ok create table t1 (a bigint, b int, c int) as values (null, null, 1), (null, 2, null); -# first non-null value is int32, but there are mixed types (i64, i32, i32), so error is expected -# This test also validate `skip_coercion` works, without skipping, i32 is coerced to i64 and signature error is not thrown -query error -select coalesce(a, b, c) from t1; +# Follow Postgres and DuckDB behavior, since a is bigint, although the value is null, all args are coerced to bigint +query IT +select + coalesce(a, b, c), + arrow_typeof(coalesce(a, b, c)) +from t1; ---- -DataFusion error: Error during planning: No function matches the given name and argument types 'coalesce(Int64, Int32, Int32)'. You might need to add explicit type casts. - Candidate functions: - coalesce(T or Null, .., T or Null) - +1 Int64 +2 Int64 -# b, c has the same type i32 +# b, c has the same type int, so the result is int query IT select coalesce(b, c), @@ -1244,6 +1241,20 @@ NULL statement ok drop table t1; +query RT +select + coalesce(arrow_cast(2, 'Decimal128(7, 2)'), 0), + arrow_typeof(coalesce(arrow_cast(2, 'Decimal128(7, 2)'), 0)) +---- +2 Decimal128(7, 2) + +query RT +select + coalesce(arrow_cast(2, 'Decimal256(7, 2)'), 0), + arrow_typeof(coalesce(arrow_cast(2, 'Decimal256(7, 2)'), 0)); +---- +2 Decimal256(7, 2) + # coalesce static empty value query T SELECT COALESCE('', 'test') From 81f0235f470359587e88958ffb32f7e069d3347d Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Sun, 28 Apr 2024 13:23:32 +0800 Subject: [PATCH 08/37] introduce new signautre for coalesce Signed-off-by: jayzhan211 --- datafusion/expr/src/type_coercion/binary.rs | 10 ++++----- datafusion/functions/src/core/coalesce.rs | 5 +++-- .../optimizer/src/analyzer/type_coercion.rs | 1 + datafusion/physical-expr/src/udf.rs | 4 ++-- .../sqllogictest/test_files/functions.slt | 21 ++++++++++++++++--- 5 files changed, 29 insertions(+), 12 deletions(-) diff --git a/datafusion/expr/src/type_coercion/binary.rs b/datafusion/expr/src/type_coercion/binary.rs index 8582f2e5927c..2c7296c6d626 100644 --- a/datafusion/expr/src/type_coercion/binary.rs +++ b/datafusion/expr/src/type_coercion/binary.rs @@ -383,13 +383,13 @@ pub fn type_resolution(data_types: &[DataType]) -> Option { continue; } if let Some(ref candidate_t) = candidate_type { - println!("data_Type: {:?}", data_type); - println!("candidate_type: {:?}", candidate_t); + // println!("data_Type: {:?}", data_type); + // println!("candidate_type: {:?}", candidate_t); + // coerced_from is mostly uni-directional, so we need to check both directions if let Some(t) = coerced_from(data_type, candidate_t) { - // if let Some(t) = type_resolution_coercion(data_type, &candidate_type) { candidate_type = Some(t); - } else if coerced_from(candidate_t, data_type).is_some() { - // keep the candidate type + } else if let Some(t) = coerced_from(candidate_t, data_type) { + candidate_type = Some(t); } else { // Not coercible, return None return None; diff --git a/datafusion/functions/src/core/coalesce.rs b/datafusion/functions/src/core/coalesce.rs index b24c708e8988..a56a1d9e51ba 100644 --- a/datafusion/functions/src/core/coalesce.rs +++ b/datafusion/functions/src/core/coalesce.rs @@ -61,8 +61,9 @@ impl ScalarUDFImpl for CoalesceFunc { fn return_type(&self, arg_types: &[DataType]) -> Result { if let Some(common_type) = type_resolution(arg_types) { - println!("args: {:?}", arg_types); - println!("common_type: {:?}", common_type); + // println!("args: {:?}", arg_types); + // println!("common_type: {:?}", common_type); + // Ok(DataType::Decimal128(22, 2)) return Ok(common_type); } else { return internal_err!("Error should be thrown via signature validation"); diff --git a/datafusion/optimizer/src/analyzer/type_coercion.rs b/datafusion/optimizer/src/analyzer/type_coercion.rs index f96a359f9d47..615b07fb1037 100644 --- a/datafusion/optimizer/src/analyzer/type_coercion.rs +++ b/datafusion/optimizer/src/analyzer/type_coercion.rs @@ -559,6 +559,7 @@ fn coerce_arguments_for_signature( .collect::>>()?; let new_types = data_types(¤t_types, signature)?; + println!("new_types: {:?}", new_types); expressions .into_iter() diff --git a/datafusion/physical-expr/src/udf.rs b/datafusion/physical-expr/src/udf.rs index 54281412d27d..8ca12ecdf3de 100644 --- a/datafusion/physical-expr/src/udf.rs +++ b/datafusion/physical-expr/src/udf.rs @@ -50,8 +50,8 @@ pub fn create_physical_expr( let return_type = fun.return_type_from_exprs(args, input_dfschema, &input_expr_types)?; - println!("return_type: {:?}", return_type); - println!("input_expr_types: {:?}", input_expr_types); + // println!("return_type: {:?}", return_type); + // println!("input_expr_types: {:?}", input_expr_types); let fun_def = ScalarFunctionDefinition::UDF(Arc::new(fun.clone())); Ok(Arc::new(ScalarFunctionExpr::new( fun.name(), diff --git a/datafusion/sqllogictest/test_files/functions.slt b/datafusion/sqllogictest/test_files/functions.slt index 7c96e8c91712..aa9f03aad6cb 100644 --- a/datafusion/sqllogictest/test_files/functions.slt +++ b/datafusion/sqllogictest/test_files/functions.slt @@ -1241,19 +1241,20 @@ NULL statement ok drop table t1; +# Decimal128(7, 2) and int64 are coerced to common wider type Decimal128(22, 2) query RT select coalesce(arrow_cast(2, 'Decimal128(7, 2)'), 0), arrow_typeof(coalesce(arrow_cast(2, 'Decimal128(7, 2)'), 0)) ---- -2 Decimal128(7, 2) +2 Decimal128(22, 2) query RT select coalesce(arrow_cast(2, 'Decimal256(7, 2)'), 0), arrow_typeof(coalesce(arrow_cast(2, 'Decimal256(7, 2)'), 0)); ---- -2 Decimal256(7, 2) +2 Decimal256(22, 2) # coalesce static empty value query T @@ -1267,7 +1268,6 @@ SELECT COALESCE(NULL, 'test') ---- test - statement ok create table test1 as values (arrow_cast('foo', 'Dictionary(Int32, Utf8)')), (null); @@ -1280,6 +1280,13 @@ DataFusion error: Error during planning: No function matches the given name and coalesce(T or Null, .., T or Null) +# explicit cast to Utf8 +query T +select coalesce(arrow_cast(column1, 'Utf8'), 'none_set') from test1; +---- +foo +none_set + # test coercion Int query error select coalesce(34, arrow_cast(123, 'Dictionary(Int32, Int8)')); @@ -1289,6 +1296,14 @@ DataFusion error: Error during planning: No function matches the given name and coalesce(T or Null, .., T or Null) +# explicit cast to Int8, and it will implicitly cast to Int64 +query IT +select + coalesce(arrow_cast(123, 'Int8'), 34), + arrow_typeof(coalesce(arrow_cast(123, 'Int8'), 34)); +---- +123 Int64 + # test with Int query error select coalesce(arrow_cast(123, 'Dictionary(Int32, Int8)'), 34); From bae996c1cb1a3928d7cf7d04bece61aba0099468 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Sun, 28 Apr 2024 13:37:24 +0800 Subject: [PATCH 09/37] cleanup Signed-off-by: jayzhan211 --- datafusion/expr/src/signature.rs | 8 ++- datafusion/expr/src/type_coercion/binary.rs | 22 ++------ .../expr/src/type_coercion/functions.rs | 51 +++---------------- datafusion/functions/src/core/coalesce.rs | 13 +---- 4 files changed, 15 insertions(+), 79 deletions(-) diff --git a/datafusion/expr/src/signature.rs b/datafusion/expr/src/signature.rs index 5df2625a3aea..c010eade350e 100644 --- a/datafusion/expr/src/signature.rs +++ b/datafusion/expr/src/signature.rs @@ -104,6 +104,9 @@ pub enum TypeSignature { /// No coercion is attempted. /// /// Functions like `coalesce` is `VariadicEqual`. + // TODO: Temporary Signature, to differentiate existing VariadicEqual. + // After we swtich `make_array` to VariadicEqualOrNull, + // we can reuse VariadicEqual. VariadicEqualOrNull, /// One or more arguments with arbitrary types VariadicAny, @@ -234,11 +237,6 @@ impl TypeSignature { _ => false, } } - - /// Skip coercion to match the signature. - pub(crate) fn skip_coercion(&self) -> bool { - matches!(self, TypeSignature::VariadicEqualOrNull) - } } /// Defines the supported argument types ([`TypeSignature`]) and [`Volatility`] for a function. diff --git a/datafusion/expr/src/type_coercion/binary.rs b/datafusion/expr/src/type_coercion/binary.rs index 2c7296c6d626..8a66652a61db 100644 --- a/datafusion/expr/src/type_coercion/binary.rs +++ b/datafusion/expr/src/type_coercion/binary.rs @@ -342,12 +342,12 @@ fn data_type_category(data_type: &DataType) -> TypeCategory { return TypeCategory::Composite; } - return TypeCategory::Unknown; + TypeCategory::Unknown } /// Coerce `lhs_type` and `rhs_type` to a common type for the purposes of constructs including /// CASE, ARRAY, VALUES, and the GREATEST and LEAST functions. -/// See https://www.postgresql.org/docs/current/typeconv-union-case.html for more information. +/// See for more information. pub fn type_resolution(data_types: &[DataType]) -> Option { if data_types.is_empty() { return None; @@ -383,8 +383,6 @@ pub fn type_resolution(data_types: &[DataType]) -> Option { continue; } if let Some(ref candidate_t) = candidate_type { - // println!("data_Type: {:?}", data_type); - // println!("candidate_type: {:?}", candidate_t); // coerced_from is mostly uni-directional, so we need to check both directions if let Some(t) = coerced_from(data_type, candidate_t) { candidate_type = Some(t); @@ -399,25 +397,11 @@ pub fn type_resolution(data_types: &[DataType]) -> Option { } } - println!("candidate_type: {:?}", candidate_type); - candidate_type } -/// Coerce `lhs_type` and `rhs_type` to a common type for the purposes of type resolution -/// Unlike [comparison_coercion], usually the coerced type is more `wider`. -// fn type_resolution_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { -// if lhs_type == rhs_type { -// // same type => all good -// return Some(lhs_type.clone()); -// } - -// // binary numeric is able to reuse -// comparison_binary_numeric_coercion(lhs_type, rhs_type) -// } - /// Coerce `lhs_type` and `rhs_type` to a common type for the purposes of a comparison operation -/// Unlike [type_resolution_coercion], usually the coerced type is for comparison only. +/// Unlike [coerced_from], usually the coerced type is for comparison only. /// For example, compare with Dictionary and Dictionary, only value type is what we care about pub fn comparison_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { if lhs_type == rhs_type { diff --git a/datafusion/expr/src/type_coercion/functions.rs b/datafusion/expr/src/type_coercion/functions.rs index 6eb8214bc3f1..8dafc2240748 100644 --- a/datafusion/expr/src/type_coercion/functions.rs +++ b/datafusion/expr/src/type_coercion/functions.rs @@ -54,12 +54,7 @@ pub fn data_types( } } - println!("current_types: {:?}", current_types); - println!("type_signature: {:?}", &signature.type_signature); let valid_types = get_valid_types(&signature.type_signature, current_types)?; - - println!("valid_types: {:?}", valid_types); - if valid_types .iter() .any(|data_type| data_type == current_types) @@ -67,15 +62,11 @@ pub fn data_types( return Ok(current_types.to_vec()); } - println!("valid_types: {:?}", valid_types); - - if true { - // Try and coerce the argument types to match the signature, returning the - // coerced types from the first matching signature. - for valid_types in valid_types { - if let Some(types) = maybe_data_types(&valid_types, current_types) { - return Ok(types); - } + // Try and coerce the argument types to match the signature, returning the + // coerced types from the first matching signature. + for valid_types in valid_types { + if let Some(types) = maybe_data_types(&valid_types, current_types) { + return Ok(types); } } @@ -199,26 +190,6 @@ fn get_valid_types( } else { vec![] } - - // current_types - // .iter() - // .find(|&t| t != &DataType::Null) - // .map_or_else( - // || vec![vec![DataType::Null; current_types.len()]], - // |t| { - // let valid_types = current_types - // .iter() - // .map(|d| { - // if d != &DataType::Null { - // t.clone() - // } else { - // DataType::Null - // } - // }) - // .collect::>(); - // vec![valid_types] - // }, - // ) } TypeSignature::VariadicEqual => { let new_type = current_types.iter().skip(1).try_fold( @@ -343,6 +314,8 @@ pub fn can_coerce_from(type_into: &DataType, type_from: &DataType) -> bool { false } +/// Coerced_from implicitly casts between types. +/// Unlike [comparison_coercion], the coerced type is usually `wider` for lossless conversion. pub(crate) fn coerced_from<'a>( type_into: &'a DataType, type_from: &'a DataType, @@ -362,10 +335,6 @@ pub(crate) fn coerced_from<'a>( { Some(type_into.clone()) } - // coerce decimal - // (Decimal128(_, _) | Decimal256(_, _), Null) => { - // Some(type_into.clone()) - // } (Decimal128(_, _), Decimal128(_, _)) | (Decimal256(_, _), Decimal256(_, _)) => { decimal_coercion(type_into, type_from) } @@ -499,12 +468,6 @@ pub(crate) fn coerced_from<'a>( { Some(type_into.clone()) } - // (Decimal128(_, _), _) if matches!(type_from, Null | Int8 | Int16 | Int32 | Int64) => { - // Some(type_into.clone()) - // } - // (Decimal256(_, _), _) if matches!(type_from, Null | Int8 | Int16 | Int32 | Int64) => { - // Some(type_into.clone()) - // } _ => None, } } diff --git a/datafusion/functions/src/core/coalesce.rs b/datafusion/functions/src/core/coalesce.rs index a56a1d9e51ba..e27b4fd04414 100644 --- a/datafusion/functions/src/core/coalesce.rs +++ b/datafusion/functions/src/core/coalesce.rs @@ -61,19 +61,10 @@ impl ScalarUDFImpl for CoalesceFunc { fn return_type(&self, arg_types: &[DataType]) -> Result { if let Some(common_type) = type_resolution(arg_types) { - // println!("args: {:?}", arg_types); - // println!("common_type: {:?}", common_type); - // Ok(DataType::Decimal128(22, 2)) - return Ok(common_type); + Ok(common_type) } else { - return internal_err!("Error should be thrown via signature validation"); + internal_err!("Error should be thrown via signature validation") } - - // Ok(arg_types - // .iter() - // .find(|&t| t != &DataType::Null) - // .unwrap_or(&DataType::Null) - // .clone()) } /// coalesce evaluates to the first value which is not NULL From ddf9b1c52f0b28672c21ab056d91ae8b69097945 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Sun, 28 Apr 2024 13:49:04 +0800 Subject: [PATCH 10/37] cleanup Signed-off-by: jayzhan211 --- datafusion/expr/src/signature.rs | 2 +- datafusion/expr/src/type_coercion/binary.rs | 2 +- datafusion/expr/src/type_coercion/functions.rs | 1 - datafusion/optimizer/src/analyzer/type_coercion.rs | 1 - datafusion/physical-expr/src/scalar_function.rs | 1 - datafusion/sqllogictest/test_files/functions.slt | 14 +++++++------- 6 files changed, 9 insertions(+), 12 deletions(-) diff --git a/datafusion/expr/src/signature.rs b/datafusion/expr/src/signature.rs index c010eade350e..7c5873bc8387 100644 --- a/datafusion/expr/src/signature.rs +++ b/datafusion/expr/src/signature.rs @@ -202,7 +202,7 @@ impl TypeSignature { vec!["CoercibleT, .., CoercibleT".to_string()] } TypeSignature::VariadicEqualOrNull => { - vec!["T or Null, .., T or Null".to_string()] + vec!["CoercibleT or NULL, .., CoercibleT or NULL".to_string()] } TypeSignature::VariadicAny => vec!["Any, .., Any".to_string()], TypeSignature::OneOf(sigs) => { diff --git a/datafusion/expr/src/type_coercion/binary.rs b/datafusion/expr/src/type_coercion/binary.rs index 8a66652a61db..8ee447a92a95 100644 --- a/datafusion/expr/src/type_coercion/binary.rs +++ b/datafusion/expr/src/type_coercion/binary.rs @@ -383,7 +383,7 @@ pub fn type_resolution(data_types: &[DataType]) -> Option { continue; } if let Some(ref candidate_t) = candidate_type { - // coerced_from is mostly uni-directional, so we need to check both directions + // `coerced_from` is designed uni-directional for `can_coerced_from` so we need to check both directions if let Some(t) = coerced_from(data_type, candidate_t) { candidate_type = Some(t); } else if let Some(t) = coerced_from(candidate_t, data_type) { diff --git a/datafusion/expr/src/type_coercion/functions.rs b/datafusion/expr/src/type_coercion/functions.rs index 8dafc2240748..a5f79becc1ec 100644 --- a/datafusion/expr/src/type_coercion/functions.rs +++ b/datafusion/expr/src/type_coercion/functions.rs @@ -447,7 +447,6 @@ pub(crate) fn coerced_from<'a>( } _ => None, }, - (Timestamp(unit, Some(tz)), _) if tz.as_ref() == TIMEZONE_WILDCARD => { match type_from { Timestamp(_, Some(from_tz)) => { diff --git a/datafusion/optimizer/src/analyzer/type_coercion.rs b/datafusion/optimizer/src/analyzer/type_coercion.rs index 615b07fb1037..f96a359f9d47 100644 --- a/datafusion/optimizer/src/analyzer/type_coercion.rs +++ b/datafusion/optimizer/src/analyzer/type_coercion.rs @@ -559,7 +559,6 @@ fn coerce_arguments_for_signature( .collect::>>()?; let new_types = data_types(¤t_types, signature)?; - println!("new_types: {:?}", new_types); expressions .into_iter() diff --git a/datafusion/physical-expr/src/scalar_function.rs b/datafusion/physical-expr/src/scalar_function.rs index 789d7d07ca30..3b360fc20c39 100644 --- a/datafusion/physical-expr/src/scalar_function.rs +++ b/datafusion/physical-expr/src/scalar_function.rs @@ -129,7 +129,6 @@ impl PhysicalExpr for ScalarFunctionExpr { } fn data_type(&self, _input_schema: &Schema) -> Result { - println!("self.return_type: {:?}", self.return_type); Ok(self.return_type.clone()) } diff --git a/datafusion/sqllogictest/test_files/functions.slt b/datafusion/sqllogictest/test_files/functions.slt index aa9f03aad6cb..20ca94193b38 100644 --- a/datafusion/sqllogictest/test_files/functions.slt +++ b/datafusion/sqllogictest/test_files/functions.slt @@ -1184,7 +1184,7 @@ select coalesce(); ---- DataFusion error: Error during planning: No function matches the given name and argument types 'coalesce()'. You might need to add explicit type casts. Candidate functions: - coalesce(T or Null, .., T or Null) + coalesce(CoercibleT or NULL, .., CoercibleT or NULL) # test with different types @@ -1277,7 +1277,7 @@ select coalesce(column1, 'none_set') from test1; ---- DataFusion error: Error during planning: No function matches the given name and argument types 'coalesce(Dictionary(Int32, Utf8), Utf8)'. You might need to add explicit type casts. Candidate functions: - coalesce(T or Null, .., T or Null) + coalesce(CoercibleT or NULL, .., CoercibleT or NULL) # explicit cast to Utf8 @@ -1293,7 +1293,7 @@ select coalesce(34, arrow_cast(123, 'Dictionary(Int32, Int8)')); ---- DataFusion error: Error during planning: No function matches the given name and argument types 'coalesce(Int64, Dictionary(Int32, Int8))'. You might need to add explicit type casts. Candidate functions: - coalesce(T or Null, .., T or Null) + coalesce(CoercibleT or NULL, .., CoercibleT or NULL) # explicit cast to Int8, and it will implicitly cast to Int64 @@ -1310,7 +1310,7 @@ select coalesce(arrow_cast(123, 'Dictionary(Int32, Int8)'), 34); ---- DataFusion error: Error during planning: No function matches the given name and argument types 'coalesce(Dictionary(Int32, Int8), Int64)'. You might need to add explicit type casts. Candidate functions: - coalesce(T or Null, .., T or Null) + coalesce(CoercibleT or NULL, .., CoercibleT or NULL) # test with null @@ -1319,7 +1319,7 @@ select coalesce(null, 34, arrow_cast(123, 'Dictionary(Int32, Int8)')); ---- DataFusion error: Error during planning: No function matches the given name and argument types 'coalesce(Null, Int64, Dictionary(Int32, Int8))'. You might need to add explicit type casts. Candidate functions: - coalesce(T or Null, .., T or Null) + coalesce(CoercibleT or NULL, .., CoercibleT or NULL) # test with null @@ -1328,7 +1328,7 @@ select coalesce(null, column1, 'none_set') from test1; ---- DataFusion error: Error during planning: No function matches the given name and argument types 'coalesce(Null, Dictionary(Int32, Utf8), Utf8)'. You might need to add explicit type casts. Candidate functions: - coalesce(T or Null, .., T or Null) + coalesce(CoercibleT or NULL, .., CoercibleT or NULL) statement ok @@ -1362,7 +1362,7 @@ SELECT COALESCE(c1, c2, '-1') FROM test ---- DataFusion error: Error during planning: No function matches the given name and argument types 'coalesce(Int32, Int32, Utf8)'. You might need to add explicit type casts. Candidate functions: - coalesce(T or Null, .., T or Null) + coalesce(CoercibleT or NULL, .., CoercibleT or NULL) statement ok From c2799ead9e93f36a6464708ea17adf806fd4b2b4 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Sun, 28 Apr 2024 14:04:19 +0800 Subject: [PATCH 11/37] ignore err msg Signed-off-by: jayzhan211 --- datafusion/expr/src/type_coercion/binary.rs | 16 ++-- .../sqllogictest/test_files/functions.slt | 84 +++++++------------ 2 files changed, 39 insertions(+), 61 deletions(-) diff --git a/datafusion/expr/src/type_coercion/binary.rs b/datafusion/expr/src/type_coercion/binary.rs index 8ee447a92a95..c8987b256a82 100644 --- a/datafusion/expr/src/type_coercion/binary.rs +++ b/datafusion/expr/src/type_coercion/binary.rs @@ -532,7 +532,8 @@ pub(crate) fn comparison_binary_numeric_coercion( } } -pub fn decimal_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { +/// Decimal coercion rules. +pub(crate) fn decimal_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { use arrow::datatypes::DataType::*; match (lhs_type, rhs_type) { @@ -540,20 +541,19 @@ pub fn decimal_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { get_wider_decimal_type(lhs_type, rhs_type) } - (Decimal128(_, _), _) => get_comparison_common_decimal_type(lhs_type, rhs_type), - (_, Decimal128(_, _)) => get_comparison_common_decimal_type(rhs_type, lhs_type), + (Decimal128(_, _), _) => get_common_decimal_type(lhs_type, rhs_type), + (_, Decimal128(_, _)) => get_common_decimal_type(rhs_type, lhs_type), (Decimal256(_, _), Decimal256(_, _)) => { get_wider_decimal_type(lhs_type, rhs_type) } - (Decimal256(_, _), _) => get_comparison_common_decimal_type(lhs_type, rhs_type), - (_, Decimal256(_, _)) => get_comparison_common_decimal_type(rhs_type, lhs_type), + (Decimal256(_, _), _) => get_common_decimal_type(lhs_type, rhs_type), + (_, Decimal256(_, _)) => get_common_decimal_type(rhs_type, lhs_type), (_, _) => None, } } -/// Coerce `lhs_type` and `rhs_type` to a common type for the purposes of -/// a comparison operation where one is a decimal -fn get_comparison_common_decimal_type( +/// Coerce `lhs_type` and `rhs_type` to a common type. +fn get_common_decimal_type( decimal_type: &DataType, other_type: &DataType, ) -> Option { diff --git a/datafusion/sqllogictest/test_files/functions.slt b/datafusion/sqllogictest/test_files/functions.slt index 20ca94193b38..a75b04e27f15 100644 --- a/datafusion/sqllogictest/test_files/functions.slt +++ b/datafusion/sqllogictest/test_files/functions.slt @@ -1178,14 +1178,26 @@ select coalesce(null, null); ---- NULL +# cast to float +query RT +select + coalesce(1, 2.0), + arrow_typeof(coalesce(1, 2.0)) +; +---- +1 Float64 + +query RT +select + coalesce(1, arrow_cast(2.0, 'Float32')), + arrow_typeof(coalesce(1, arrow_cast(2.0, 'Float32'))) +; +---- +1 Float32 + # test with empty args query error select coalesce(); ----- -DataFusion error: Error during planning: No function matches the given name and argument types 'coalesce()'. You might need to add explicit type casts. - Candidate functions: - coalesce(CoercibleT or NULL, .., CoercibleT or NULL) - # test with different types query I @@ -1271,14 +1283,12 @@ test statement ok create table test1 as values (arrow_cast('foo', 'Dictionary(Int32, Utf8)')), (null); -# test coercion string +# Dictionary and String are not coercible query error select coalesce(column1, 'none_set') from test1; ----- -DataFusion error: Error during planning: No function matches the given name and argument types 'coalesce(Dictionary(Int32, Utf8), Utf8)'. You might need to add explicit type casts. - Candidate functions: - coalesce(CoercibleT or NULL, .., CoercibleT or NULL) +query error +select coalesce(null, column1, 'none_set') from test1; # explicit cast to Utf8 query T @@ -1287,14 +1297,18 @@ select coalesce(arrow_cast(column1, 'Utf8'), 'none_set') from test1; foo none_set -# test coercion Int +statement ok +drop table test1 + +# Numeric and Dictionary are not coercible query error select coalesce(34, arrow_cast(123, 'Dictionary(Int32, Int8)')); ----- -DataFusion error: Error during planning: No function matches the given name and argument types 'coalesce(Int64, Dictionary(Int32, Int8))'. You might need to add explicit type casts. - Candidate functions: - coalesce(CoercibleT or NULL, .., CoercibleT or NULL) +query error +select coalesce(arrow_cast(123, 'Dictionary(Int32, Int8)'), 34); + +query error +select coalesce(null, 34, arrow_cast(123, 'Dictionary(Int32, Int8)')); # explicit cast to Int8, and it will implicitly cast to Int64 query IT @@ -1304,37 +1318,6 @@ select ---- 123 Int64 -# test with Int -query error -select coalesce(arrow_cast(123, 'Dictionary(Int32, Int8)'), 34); ----- -DataFusion error: Error during planning: No function matches the given name and argument types 'coalesce(Dictionary(Int32, Int8), Int64)'. You might need to add explicit type casts. - Candidate functions: - coalesce(CoercibleT or NULL, .., CoercibleT or NULL) - - -# test with null -query error -select coalesce(null, 34, arrow_cast(123, 'Dictionary(Int32, Int8)')); ----- -DataFusion error: Error during planning: No function matches the given name and argument types 'coalesce(Null, Int64, Dictionary(Int32, Int8))'. You might need to add explicit type casts. - Candidate functions: - coalesce(CoercibleT or NULL, .., CoercibleT or NULL) - - -# test with null -query error -select coalesce(null, column1, 'none_set') from test1; ----- -DataFusion error: Error during planning: No function matches the given name and argument types 'coalesce(Null, Dictionary(Int32, Utf8), Utf8)'. You might need to add explicit type casts. - Candidate functions: - coalesce(CoercibleT or NULL, .., CoercibleT or NULL) - - -statement ok -drop table test1 - - statement ok CREATE TABLE test( c1 INT, @@ -1356,14 +1339,9 @@ SELECT COALESCE(c1, c2) FROM test 1 NULL -# coalesce result with default value +# int and string are not coercible query error -SELECT COALESCE(c1, c2, '-1') FROM test ----- -DataFusion error: Error during planning: No function matches the given name and argument types 'coalesce(Int32, Int32, Utf8)'. You might need to add explicit type casts. - Candidate functions: - coalesce(CoercibleT or NULL, .., CoercibleT or NULL) - +SELECT COALESCE(c1, c2, '-1') FROM test; statement ok drop table test From 25748964cff9ddb28dc3bb47a17a48b62c3cb216 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Sun, 28 Apr 2024 14:25:02 +0800 Subject: [PATCH 12/37] fmt Signed-off-by: jayzhan211 --- datafusion/expr/src/type_coercion/binary.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/datafusion/expr/src/type_coercion/binary.rs b/datafusion/expr/src/type_coercion/binary.rs index c8987b256a82..71a27c069eb1 100644 --- a/datafusion/expr/src/type_coercion/binary.rs +++ b/datafusion/expr/src/type_coercion/binary.rs @@ -533,7 +533,10 @@ pub(crate) fn comparison_binary_numeric_coercion( } /// Decimal coercion rules. -pub(crate) fn decimal_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { +pub(crate) fn decimal_coercion( + lhs_type: &DataType, + rhs_type: &DataType, +) -> Option { use arrow::datatypes::DataType::*; match (lhs_type, rhs_type) { From 6a17e57dec7ce52aa3c476290a8b585d8a325b13 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Sun, 28 Apr 2024 14:26:00 +0800 Subject: [PATCH 13/37] fix doc Signed-off-by: jayzhan211 --- datafusion/expr/src/signature.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/expr/src/signature.rs b/datafusion/expr/src/signature.rs index 7c5873bc8387..3b48656b2c9c 100644 --- a/datafusion/expr/src/signature.rs +++ b/datafusion/expr/src/signature.rs @@ -101,7 +101,7 @@ pub enum TypeSignature { /// `make_array(i32, i64) -> make_array(i64, i64)` VariadicEqual, /// One or more arguments of an arbitrary but equal type or Null. - /// No coercion is attempted. + /// Non-comparison coercion is attempted to match the signatures. /// /// Functions like `coalesce` is `VariadicEqual`. // TODO: Temporary Signature, to differentiate existing VariadicEqual. From 4cba8c535c996eeaca878a387889d313eb298eb9 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Sun, 28 Apr 2024 14:28:43 +0800 Subject: [PATCH 14/37] cleanup Signed-off-by: jayzhan211 --- datafusion/physical-expr/src/udf.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/datafusion/physical-expr/src/udf.rs b/datafusion/physical-expr/src/udf.rs index 8ca12ecdf3de..aad78b7c2f90 100644 --- a/datafusion/physical-expr/src/udf.rs +++ b/datafusion/physical-expr/src/udf.rs @@ -50,8 +50,6 @@ pub fn create_physical_expr( let return_type = fun.return_type_from_exprs(args, input_dfschema, &input_expr_types)?; - // println!("return_type: {:?}", return_type); - // println!("input_expr_types: {:?}", input_expr_types); let fun_def = ScalarFunctionDefinition::UDF(Arc::new(fun.clone())); Ok(Arc::new(ScalarFunctionExpr::new( fun.name(), From f1cfb8dd192220fc08d8c5bbf81612b4fa48cb89 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Sun, 28 Apr 2024 15:25:08 +0800 Subject: [PATCH 15/37] add more test Signed-off-by: jayzhan211 --- datafusion/expr/src/type_coercion/binary.rs | 22 ++++++++++- .../expr/src/type_coercion/functions.rs | 9 ++++- .../sqllogictest/test_files/functions.slt | 37 +++++++++++++++++++ 3 files changed, 65 insertions(+), 3 deletions(-) diff --git a/datafusion/expr/src/type_coercion/binary.rs b/datafusion/expr/src/type_coercion/binary.rs index 71a27c069eb1..0f8b267320e3 100644 --- a/datafusion/expr/src/type_coercion/binary.rs +++ b/datafusion/expr/src/type_coercion/binary.rs @@ -383,10 +383,16 @@ pub fn type_resolution(data_types: &[DataType]) -> Option { continue; } if let Some(ref candidate_t) = candidate_type { - // `coerced_from` is designed uni-directional for `can_coerced_from` so we need to check both directions + // We need to find a new possible candidate type that candidate type is able to + // coerced to, but NOT vice versa, so uni-directional coercion `coerced_from` is required if let Some(t) = coerced_from(data_type, candidate_t) { candidate_type = Some(t); } else if let Some(t) = coerced_from(candidate_t, data_type) { + // The reason why we check another direction is that: + // 1. Ensure that current type is able to coerced to candidate type + // 2. Coerced type may be different from the candidate and current data type + // For example, I64 and Decimal(7, 2) are expect to get coerced type Decimal(22, 2) + candidate_type = Some(t); } else { // Not coercible, return None @@ -987,7 +993,19 @@ fn null_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { mod tests { use super::*; - use datafusion_common::assert_contains; + use datafusion_common::{assert_contains, ScalarValue}; + + #[test] + fn test_string_numeric() { + let a = ScalarValue::Utf8(Some(String::from("2"))); + let dt = a.data_type(); + let can_cast = can_cast_types(&dt, &DataType::Int32); + println!("{:?}", can_cast); + let a = ScalarValue::Utf8(Some(String::from("apple"))); + let dt = a.data_type(); + let can_cast = can_cast_types(&dt, &DataType::Int32); + println!("{:?}", can_cast); + } #[test] fn test_coercion_error() -> Result<()> { diff --git a/datafusion/expr/src/type_coercion/functions.rs b/datafusion/expr/src/type_coercion/functions.rs index a5f79becc1ec..eb02296a50ab 100644 --- a/datafusion/expr/src/type_coercion/functions.rs +++ b/datafusion/expr/src/type_coercion/functions.rs @@ -314,7 +314,11 @@ pub fn can_coerce_from(type_into: &DataType, type_from: &DataType) -> bool { false } -/// Coerced_from implicitly casts between types. +/// Find the coerced type for the given `type_into` and `type_from`. +/// Returns `None` if coercion is not possible. +/// +/// Expect uni-directional coercion, for example, i32 is coerced to i64, but i64 is not coerced to i32. +/// /// Unlike [comparison_coercion], the coerced type is usually `wider` for lossless conversion. pub(crate) fn coerced_from<'a>( type_into: &'a DataType, @@ -362,12 +366,15 @@ pub(crate) fn coerced_from<'a>( Some(type_into.clone()) } (UInt8, _) if matches!(type_from, Null | UInt8) => Some(type_into.clone()), + (UInt8, _) if matches!(type_from, Null | Int8) => Some(Int16), (UInt16, _) if matches!(type_from, Null | UInt8 | UInt16) => { Some(type_into.clone()) } + (UInt16, _) if matches!(type_from, Null | Int8 | Int16) => Some(Int32), (UInt32, _) if matches!(type_from, Null | UInt8 | UInt16 | UInt32) => { Some(type_into.clone()) } + (UInt32, _) if matches!(type_from, Null | Int8 | Int16 | Int32) => Some(Int64), (UInt64, _) if matches!(type_from, Null | UInt8 | UInt16 | UInt32 | UInt64) => { Some(type_into.clone()) } diff --git a/datafusion/sqllogictest/test_files/functions.slt b/datafusion/sqllogictest/test_files/functions.slt index a75b04e27f15..a21dfb03e9b0 100644 --- a/datafusion/sqllogictest/test_files/functions.slt +++ b/datafusion/sqllogictest/test_files/functions.slt @@ -1205,6 +1205,43 @@ select coalesce(arrow_cast(1, 'Int32'), arrow_cast(1, 'Int64')); ---- 1 +# i32 and u32, cast to wider type i64 +query IT +select + coalesce(arrow_cast(2, 'Int32'), arrow_cast(3, 'UInt32')), + arrow_typeof(coalesce(arrow_cast(2, 'Int32'), arrow_cast(3, 'UInt32'))); +---- +2 Int64 + +query IT +select + coalesce(arrow_cast(2, 'Int16'), arrow_cast(3, 'UInt16')), + arrow_typeof(coalesce(arrow_cast(2, 'Int16'), arrow_cast(3, 'UInt16'))); +---- +2 Int32 + +query IT +select + coalesce(arrow_cast(2, 'Int8'), arrow_cast(3, 'UInt8')), + arrow_typeof(coalesce(arrow_cast(2, 'Int8'), arrow_cast(3, 'UInt8'))); +---- +2 Int16 + +# i64 and u32, cast to wider type i64 +query IT +select + coalesce(2, arrow_cast(3, 'UInt32')), + arrow_typeof(coalesce(2, arrow_cast(3, 'UInt32'))); +---- +2 Int64 + +# TODO: +# i64 and u64, should cast to i128 or decimal, not yet support +query error +select + coalesce(2, arrow_cast(3, 'UInt64')), + arrow_typeof(coalesce(2, arrow_cast(3, 'UInt64'))); + statement ok create table t1 (a bigint, b int, c int) as values (null, null, 1), (null, 2, null); From d2e83d3e71df5540be092ec522c747b0c8b76c00 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Sun, 28 Apr 2024 16:44:30 +0800 Subject: [PATCH 16/37] switch to type_resolution coercion Signed-off-by: jayzhan211 --- datafusion/expr/src/type_coercion/binary.rs | 96 ++++++++++++++----- .../expr/src/type_coercion/functions.rs | 18 +++- .../sqllogictest/test_files/functions.slt | 8 +- 3 files changed, 94 insertions(+), 28 deletions(-) diff --git a/datafusion/expr/src/type_coercion/binary.rs b/datafusion/expr/src/type_coercion/binary.rs index 0f8b267320e3..ba1070c1baa9 100644 --- a/datafusion/expr/src/type_coercion/binary.rs +++ b/datafusion/expr/src/type_coercion/binary.rs @@ -17,6 +17,7 @@ //! Coercion rules for matching argument types for binary operators +use std::collections::HashSet; use std::sync::Arc; use crate::Operator; @@ -30,8 +31,6 @@ use arrow::datatypes::{ use datafusion_common::{exec_datafusion_err, plan_datafusion_err, plan_err, Result}; -use super::functions::coerced_from; - /// The type signature of an instantiation of binary operator expression such as /// `lhs + rhs` /// @@ -291,15 +290,16 @@ fn bitwise_coercion(left_type: &DataType, right_type: &DataType) -> Option TypeCategory { @@ -318,8 +318,10 @@ fn data_type_category(data_type: &DataType) -> TypeCategory { return TypeCategory::Array; } - if matches!(data_type, DataType::Utf8 | DataType::LargeUtf8) { - return TypeCategory::String; + // String literal is possible to cast to many other types like numeric or datetime, + // therefore, it is categorized as a unknown type + if matches!(data_type, DataType::Utf8 | DataType::LargeUtf8 | DataType::Null) { + return TypeCategory::Unknown; } if matches!( @@ -342,7 +344,7 @@ fn data_type_category(data_type: &DataType) -> TypeCategory { return TypeCategory::Composite; } - TypeCategory::Unknown + TypeCategory::NotSupported } /// Coerce `lhs_type` and `rhs_type` to a common type for the purposes of constructs including @@ -369,10 +371,16 @@ pub fn type_resolution(data_types: &[DataType]) -> Option { .filter(|&t| t != &DataType::Null) .map(data_type_category) .collect(); - if data_types_category + + if data_types_category.iter().any(|t| t == &TypeCategory::NotSupported) { + return None; + } + + // check if there is only one category excluding Unknown + let categories : HashSet= HashSet::from_iter(data_types_category .iter() - .any(|t| t != &data_types_category[0]) - { + .filter(|&c|c != &TypeCategory::Unknown).cloned()); + if categories.len() > 1 { return None; } @@ -383,29 +391,68 @@ pub fn type_resolution(data_types: &[DataType]) -> Option { continue; } if let Some(ref candidate_t) = candidate_type { - // We need to find a new possible candidate type that candidate type is able to - // coerced to, but NOT vice versa, so uni-directional coercion `coerced_from` is required - if let Some(t) = coerced_from(data_type, candidate_t) { - candidate_type = Some(t); - } else if let Some(t) = coerced_from(candidate_t, data_type) { - // The reason why we check another direction is that: - // 1. Ensure that current type is able to coerced to candidate type - // 2. Coerced type may be different from the candidate and current data type - // For example, I64 and Decimal(7, 2) are expect to get coerced type Decimal(22, 2) + if let Some(t) = type_resolution_coercion(data_type, candidate_t) { candidate_type = Some(t); } else { - // Not coercible, return None return None; - } + } + + // // We need to find a new possible candidate type that candidate type is able to + // // coerced to, but NOT vice versa, so uni-directional coercion `coerced_from` is required + // if let Some(t) = coerced_from(data_type, candidate_t) { + // candidate_type = Some(t); + // } else if let Some(t) = coerced_from(candidate_t, data_type) { + // // The reason why we check another direction is that: + // // 1. Ensure that current type is able to coerced to candidate type + // // 2. Coerced type may be different from the candidate and current data type + // // For example, I64 and Decimal(7, 2) are expect to get coerced type Decimal(22, 2) + + // candidate_type = Some(t); + // } else { + // // Not coercible, return None + // return None; + // } } else { candidate_type = Some(data_type.clone()); } } + println!("candidate_type: {:?}", candidate_type); + candidate_type } +fn type_resolution_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { + use arrow::datatypes::DataType::*; + if lhs_type == rhs_type { + return Some(lhs_type.clone()); + } + + if let Some(t) = binary_numeric_coercion(lhs_type, rhs_type) { + return Some(t); + } + + match (lhs_type, rhs_type) { + // (Decimal128(_, _), Decimal128(_, _)) | (Decimal256(_, _), Decimal256(_, _)) => { + // decimal_coercion(lhs_type, rhs_type) + // } + // (Decimal128(_, _) | Decimal256(_, _), other_type) + // | (other_type, Decimal128(_, _) | Decimal256(_, _)) + // if matches!(other_type, Int8 | Int16 | Int32 | Int64) => + // { + // decimal_coercion(lhs_type, rhs_type) + // } + (data_type, Utf8 | LargeUtf8) | + (Utf8 | LargeUtf8, data_type) if data_type.is_numeric() => { + // let arrow_cast handle the actual casting + Some(data_type.clone()) + } + // numeric types + _ => None, + } +} + /// Coerce `lhs_type` and `rhs_type` to a common type for the purposes of a comparison operation /// Unlike [coerced_from], usually the coerced type is for comparison only. /// For example, compare with Dictionary and Dictionary, only value type is what we care about @@ -414,7 +461,7 @@ pub fn comparison_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option equality is possible return Some(lhs_type.clone()); } - comparison_binary_numeric_coercion(lhs_type, rhs_type) + binary_numeric_coercion(lhs_type, rhs_type) .or_else(|| dictionary_coercion(lhs_type, rhs_type, true)) .or_else(|| temporal_coercion(lhs_type, rhs_type)) .or_else(|| string_coercion(lhs_type, rhs_type)) @@ -478,9 +525,8 @@ fn string_temporal_coercion( match_rule(lhs_type, rhs_type).or_else(|| match_rule(rhs_type, lhs_type)) } -/// Coerce `lhs_type` and `rhs_type` to a common type for the purposes of a comparison operation -/// where one both are numeric -pub(crate) fn comparison_binary_numeric_coercion( +/// Coerce `lhs_type` and `rhs_type` to a common type where both are numeric +pub(crate) fn binary_numeric_coercion( lhs_type: &DataType, rhs_type: &DataType, ) -> Option { diff --git a/datafusion/expr/src/type_coercion/functions.rs b/datafusion/expr/src/type_coercion/functions.rs index eb02296a50ab..b41e236a9a10 100644 --- a/datafusion/expr/src/type_coercion/functions.rs +++ b/datafusion/expr/src/type_coercion/functions.rs @@ -185,7 +185,9 @@ fn get_valid_types( .map(|valid_type| (0..*number).map(|_| valid_type.clone()).collect()) .collect(), TypeSignature::VariadicEqualOrNull => { + println!("current_types: {:?}", current_types); if let Some(common_type) = type_resolution(current_types) { + println!("common_Type: {:?}", common_type); vec![vec![common_type; current_types.len()]] } else { vec![] @@ -281,6 +283,9 @@ fn maybe_data_types( return None; } + println!("current_types: {:?}", current_types); + println!("valid_types: {:?}", valid_types); + let mut new_type = Vec::with_capacity(valid_types.len()); for (i, valid_type) in valid_types.iter().enumerate() { let current_type = ¤t_types[i]; @@ -289,12 +294,21 @@ fn maybe_data_types( new_type.push(current_type.clone()) } else { // attempt to coerce. - if let Some(coerced_type) = coerced_from(valid_type, current_type) { - new_type.push(coerced_type) + + if can_cast_types(current_type, valid_type) { + new_type.push(valid_type.clone()); + } else { // not possible return None; } + + // if let Some(coerced_type) = coerced_from(valid_type, current_type) { + // new_type.push(coerced_type) + // } else { + // // not possible + // return None; + // } } } Some(new_type) diff --git a/datafusion/sqllogictest/test_files/functions.slt b/datafusion/sqllogictest/test_files/functions.slt index a21dfb03e9b0..b792cbf4fc2d 100644 --- a/datafusion/sqllogictest/test_files/functions.slt +++ b/datafusion/sqllogictest/test_files/functions.slt @@ -1377,8 +1377,14 @@ SELECT COALESCE(c1, c2) FROM test NULL # int and string are not coercible -query error +query I SELECT COALESCE(c1, c2, '-1') FROM test; +---- +0 +1 +1 +1 +-1 statement ok drop table test From 03880a3050ce8e871001aead362535f86cf3e4b8 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Mon, 29 Apr 2024 20:07:41 +0800 Subject: [PATCH 17/37] fix i64 and u64 case Signed-off-by: jayzhan211 --- datafusion/expr/src/type_coercion/binary.rs | 18 +++-- .../expr/src/type_coercion/functions.rs | 66 ++++++++++++++----- .../sqllogictest/test_files/functions.slt | 7 +- 3 files changed, 62 insertions(+), 29 deletions(-) diff --git a/datafusion/expr/src/type_coercion/binary.rs b/datafusion/expr/src/type_coercion/binary.rs index ba1070c1baa9..23ea18e0f008 100644 --- a/datafusion/expr/src/type_coercion/binary.rs +++ b/datafusion/expr/src/type_coercion/binary.rs @@ -418,7 +418,7 @@ pub fn type_resolution(data_types: &[DataType]) -> Option { } } - println!("candidate_type: {:?}", candidate_type); + // println!("candidate_type: {:?}", candidate_type); candidate_type } @@ -429,20 +429,13 @@ fn type_resolution_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option< return Some(lhs_type.clone()); } + // numeric coercion is the same as comparison coercion, both find the narrowest type + // that can accommodate both types if let Some(t) = binary_numeric_coercion(lhs_type, rhs_type) { return Some(t); } match (lhs_type, rhs_type) { - // (Decimal128(_, _), Decimal128(_, _)) | (Decimal256(_, _), Decimal256(_, _)) => { - // decimal_coercion(lhs_type, rhs_type) - // } - // (Decimal128(_, _) | Decimal256(_, _), other_type) - // | (other_type, Decimal128(_, _) | Decimal256(_, _)) - // if matches!(other_type, Int8 | Int16 | Int32 | Int64) => - // { - // decimal_coercion(lhs_type, rhs_type) - // } (data_type, Utf8 | LargeUtf8) | (Utf8 | LargeUtf8, data_type) if data_type.is_numeric() => { // let arrow_cast handle the actual casting @@ -554,6 +547,11 @@ pub(crate) fn binary_numeric_coercion( // accommodates all values of both types. Note that some information // loss is inevitable when we have a signed type and a `UInt64`, in // which case we use `Int64`;i.e. the widest signed integral type. + + // TODO: For i64 and u64, we can use decimal or float64 + // Postgres has no unsigned type :( + // DuckDB v.0.10.0 has double (double precision floating-point number (8 bytes)) + // for largest signed (signed sixteen-byte integer) and unsigned integer (unsigned sixteen-byte integer) (Int64, _) | (_, Int64) | (UInt64, Int8) diff --git a/datafusion/expr/src/type_coercion/functions.rs b/datafusion/expr/src/type_coercion/functions.rs index b41e236a9a10..cb8f704102a3 100644 --- a/datafusion/expr/src/type_coercion/functions.rs +++ b/datafusion/expr/src/type_coercion/functions.rs @@ -54,7 +54,7 @@ pub fn data_types( } } - let valid_types = get_valid_types(&signature.type_signature, current_types)?; + let mut valid_types = get_valid_types(&signature.type_signature, current_types)?; if valid_types .iter() .any(|data_type| data_type == current_types) @@ -62,11 +62,21 @@ pub fn data_types( return Ok(current_types.to_vec()); } - // Try and coerce the argument types to match the signature, returning the - // coerced types from the first matching signature. - for valid_types in valid_types { - if let Some(types) = maybe_data_types(&valid_types, current_types) { - return Ok(types); + // Well-supported signature that returns exact valid types. + if !valid_types.is_empty() && matches!(signature.type_signature,TypeSignature::VariadicEqualOrNull) { + // exact valid types + assert_eq!(valid_types.len(), 1); + let valid_types = valid_types.swap_remove(0); + if let Some(t) = maybe_data_types_without_coercion(&valid_types, current_types) { + return Ok(t); + } + } else { + // Try and coerce the argument types to match the signature, returning the + // coerced types from the first matching signature. + for valid_types in valid_types { + if let Some(types) = maybe_data_types(&valid_types, current_types) { + return Ok(types); + } } } @@ -294,21 +304,42 @@ fn maybe_data_types( new_type.push(current_type.clone()) } else { // attempt to coerce. - - if can_cast_types(current_type, valid_type) { - new_type.push(valid_type.clone()); - + // TODO: Replace with `can_cast_types` after failing cases are resolved + // (they need new signature that returns exactly valid types instead of list of possible valid types). + if let Some(coerced_type) = coerced_from(valid_type, current_type) { + new_type.push(coerced_type) } else { // not possible return None; } + } + } + Some(new_type) +} + +/// Check if the current argument types can be coerced to match the given `valid_types` +/// unlike `maybe_data_types`, this function does not coerce the types. +/// TODO: I think this function should replace `maybe_data_types` after signature are well-supported. +fn maybe_data_types_without_coercion( + valid_types: &[DataType], + current_types: &[DataType], +) -> Option> { + if valid_types.len() != current_types.len() { + return None; + } - // if let Some(coerced_type) = coerced_from(valid_type, current_type) { - // new_type.push(coerced_type) - // } else { - // // not possible - // return None; - // } + let mut new_type = Vec::with_capacity(valid_types.len()); + for (i, valid_type) in valid_types.iter().enumerate() { + let current_type = ¤t_types[i]; + + if current_type == valid_type { + new_type.push(current_type.clone()) + } else { + if can_cast_types(current_type, valid_type) { + new_type.push(valid_type.clone()) + } else { + return None; + } } } Some(new_type) @@ -439,6 +470,9 @@ pub(crate) fn coerced_from<'a>( } // Any type can be coerced into strings (Utf8 | LargeUtf8, _) => Some(type_into.clone()), + // convert string numeric to numeric, let arrow::cast handle the actual conversion + // expect arrow error for non-numeric strings + // (data_type, Utf8 | LargeUtf8) if data_type.is_numeric() => Some(type_into.clone()), (Null, _) if can_cast_types(type_from, type_into) => Some(type_into.clone()), (List(_), _) if matches!(type_from, FixedSizeList(_, _)) => { diff --git a/datafusion/sqllogictest/test_files/functions.slt b/datafusion/sqllogictest/test_files/functions.slt index b792cbf4fc2d..0e882460753a 100644 --- a/datafusion/sqllogictest/test_files/functions.slt +++ b/datafusion/sqllogictest/test_files/functions.slt @@ -1235,12 +1235,13 @@ select ---- 2 Int64 -# TODO: -# i64 and u64, should cast to i128 or decimal, not yet support -query error +# TODO: Got types (i64, u64), casting to decimal or double or even i128 if supported +query IT select coalesce(2, arrow_cast(3, 'UInt64')), arrow_typeof(coalesce(2, arrow_cast(3, 'UInt64'))); +---- +2 Int64 statement ok create table t1 (a bigint, b int, c int) as values (null, null, 1), (null, 2, null); From 481f54812684c3352068d1dd8cf3c6959a53facb Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Mon, 29 Apr 2024 20:49:10 +0800 Subject: [PATCH 18/37] add more tests Signed-off-by: jayzhan211 --- datafusion/expr/src/type_coercion/binary.rs | 122 +++++++++--------- .../expr/src/type_coercion/functions.rs | 19 +-- .../sqllogictest/test_files/functions.slt | 48 +++++-- 3 files changed, 105 insertions(+), 84 deletions(-) diff --git a/datafusion/expr/src/type_coercion/binary.rs b/datafusion/expr/src/type_coercion/binary.rs index 23ea18e0f008..3aa786e27b69 100644 --- a/datafusion/expr/src/type_coercion/binary.rs +++ b/datafusion/expr/src/type_coercion/binary.rs @@ -320,7 +320,10 @@ fn data_type_category(data_type: &DataType) -> TypeCategory { // String literal is possible to cast to many other types like numeric or datetime, // therefore, it is categorized as a unknown type - if matches!(data_type, DataType::Utf8 | DataType::LargeUtf8 | DataType::Null) { + if matches!( + data_type, + DataType::Utf8 | DataType::LargeUtf8 | DataType::Null + ) { return TypeCategory::Unknown; } @@ -350,6 +353,7 @@ fn data_type_category(data_type: &DataType) -> TypeCategory { /// Coerce `lhs_type` and `rhs_type` to a common type for the purposes of constructs including /// CASE, ARRAY, VALUES, and the GREATEST and LEAST functions. /// See for more information. +/// The actual rules follows the behavior of Postgres and DuckDB pub fn type_resolution(data_types: &[DataType]) -> Option { if data_types.is_empty() { return None; @@ -371,15 +375,21 @@ pub fn type_resolution(data_types: &[DataType]) -> Option { .filter(|&t| t != &DataType::Null) .map(data_type_category) .collect(); - - if data_types_category.iter().any(|t| t == &TypeCategory::NotSupported) { + + if data_types_category + .iter() + .any(|t| t == &TypeCategory::NotSupported) + { return None; } // check if there is only one category excluding Unknown - let categories : HashSet= HashSet::from_iter(data_types_category - .iter() - .filter(|&c|c != &TypeCategory::Unknown).cloned()); + let categories: HashSet = HashSet::from_iter( + data_types_category + .iter() + .filter(|&c| c != &TypeCategory::Unknown) + .cloned(), + ); if categories.len() > 1 { return None; } @@ -391,64 +401,46 @@ pub fn type_resolution(data_types: &[DataType]) -> Option { continue; } if let Some(ref candidate_t) = candidate_type { - + // Find candidate type that all the data types can be coerced to + // Follows the behavior of Postgres and DuckDB + // Coerced type may be different from the candidate and current data type + // For example, + // i64 and decimal(7, 2) are expect to get coerced type decimal(22, 2) + // numeric string ('1') and numeric (2) are expect to get coerced type numeric (1, 2) if let Some(t) = type_resolution_coercion(data_type, candidate_t) { candidate_type = Some(t); } else { return None; - } - - // // We need to find a new possible candidate type that candidate type is able to - // // coerced to, but NOT vice versa, so uni-directional coercion `coerced_from` is required - // if let Some(t) = coerced_from(data_type, candidate_t) { - // candidate_type = Some(t); - // } else if let Some(t) = coerced_from(candidate_t, data_type) { - // // The reason why we check another direction is that: - // // 1. Ensure that current type is able to coerced to candidate type - // // 2. Coerced type may be different from the candidate and current data type - // // For example, I64 and Decimal(7, 2) are expect to get coerced type Decimal(22, 2) - - // candidate_type = Some(t); - // } else { - // // Not coercible, return None - // return None; - // } + } } else { candidate_type = Some(data_type.clone()); } } - // println!("candidate_type: {:?}", candidate_type); - candidate_type } -fn type_resolution_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { - use arrow::datatypes::DataType::*; +/// See [type_resolution] for more information. +fn type_resolution_coercion( + lhs_type: &DataType, + rhs_type: &DataType, +) -> Option { if lhs_type == rhs_type { return Some(lhs_type.clone()); } - + // numeric coercion is the same as comparison coercion, both find the narrowest type // that can accommodate both types - if let Some(t) = binary_numeric_coercion(lhs_type, rhs_type) { - return Some(t); - } - - match (lhs_type, rhs_type) { - (data_type, Utf8 | LargeUtf8) | - (Utf8 | LargeUtf8, data_type) if data_type.is_numeric() => { - // let arrow_cast handle the actual casting - Some(data_type.clone()) - } - // numeric types - _ => None, - } + binary_numeric_coercion(lhs_type, rhs_type) + .or_else(|| pure_string_coercion(lhs_type, rhs_type)) + .or_else(|| numeric_string_coercion(lhs_type, rhs_type)) } /// Coerce `lhs_type` and `rhs_type` to a common type for the purposes of a comparison operation /// Unlike [coerced_from], usually the coerced type is for comparison only. /// For example, compare with Dictionary and Dictionary, only value type is what we care about +/// +/// [coerced_from]: super::functions::coerced_from pub fn comparison_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { if lhs_type == rhs_type { // same type => equality is possible @@ -457,6 +449,7 @@ pub fn comparison_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option Option { use arrow::datatypes::DataType::*; match (lhs_type, rhs_type) { - (Utf8, Utf8) => Some(Utf8), - (LargeUtf8, Utf8) => Some(LargeUtf8), - (Utf8, LargeUtf8) => Some(LargeUtf8), - (LargeUtf8, LargeUtf8) => Some(LargeUtf8), // TODO: cast between array elements (#6558) (List(_), List(_)) => Some(lhs_type.clone()), (List(_), _) => Some(lhs_type.clone()), @@ -892,6 +881,29 @@ fn string_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option } } +fn pure_string_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { + use arrow::datatypes::DataType::*; + match (lhs_type, rhs_type) { + (Utf8, Utf8) => Some(Utf8), + (LargeUtf8, Utf8) => Some(LargeUtf8), + (Utf8, LargeUtf8) => Some(LargeUtf8), + (LargeUtf8, LargeUtf8) => Some(LargeUtf8), + _ => None, + } +} + +fn numeric_string_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { + use arrow::datatypes::DataType::*; + match (lhs_type, rhs_type) { + (Utf8 | LargeUtf8, other_type) | (other_type, Utf8 | LargeUtf8) + if other_type.is_numeric() => + { + Some(other_type.clone()) + } + _ => None, + } +} + /// Coercion rules for binary (Binary/LargeBinary) to string (Utf8/LargeUtf8): /// If one argument is binary and the other is a string then coerce to string /// (e.g. for `like`) @@ -1037,19 +1049,7 @@ fn null_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { mod tests { use super::*; - use datafusion_common::{assert_contains, ScalarValue}; - - #[test] - fn test_string_numeric() { - let a = ScalarValue::Utf8(Some(String::from("2"))); - let dt = a.data_type(); - let can_cast = can_cast_types(&dt, &DataType::Int32); - println!("{:?}", can_cast); - let a = ScalarValue::Utf8(Some(String::from("apple"))); - let dt = a.data_type(); - let can_cast = can_cast_types(&dt, &DataType::Int32); - println!("{:?}", can_cast); - } + use datafusion_common::assert_contains; #[test] fn test_coercion_error() -> Result<()> { diff --git a/datafusion/expr/src/type_coercion/functions.rs b/datafusion/expr/src/type_coercion/functions.rs index cb8f704102a3..e4969a052774 100644 --- a/datafusion/expr/src/type_coercion/functions.rs +++ b/datafusion/expr/src/type_coercion/functions.rs @@ -63,7 +63,9 @@ pub fn data_types( } // Well-supported signature that returns exact valid types. - if !valid_types.is_empty() && matches!(signature.type_signature,TypeSignature::VariadicEqualOrNull) { + if !valid_types.is_empty() + && matches!(signature.type_signature, TypeSignature::VariadicEqualOrNull) + { // exact valid types assert_eq!(valid_types.len(), 1); let valid_types = valid_types.swap_remove(0); @@ -195,9 +197,7 @@ fn get_valid_types( .map(|valid_type| (0..*number).map(|_| valid_type.clone()).collect()) .collect(), TypeSignature::VariadicEqualOrNull => { - println!("current_types: {:?}", current_types); if let Some(common_type) = type_resolution(current_types) { - println!("common_Type: {:?}", common_type); vec![vec![common_type; current_types.len()]] } else { vec![] @@ -293,9 +293,6 @@ fn maybe_data_types( return None; } - println!("current_types: {:?}", current_types); - println!("valid_types: {:?}", valid_types); - let mut new_type = Vec::with_capacity(valid_types.len()); for (i, valid_type) in valid_types.iter().enumerate() { let current_type = ¤t_types[i]; @@ -304,7 +301,7 @@ fn maybe_data_types( new_type.push(current_type.clone()) } else { // attempt to coerce. - // TODO: Replace with `can_cast_types` after failing cases are resolved + // TODO: Replace with `can_cast_types` after failing cases are resolved // (they need new signature that returns exactly valid types instead of list of possible valid types). if let Some(coerced_type) = coerced_from(valid_type, current_type) { new_type.push(coerced_type) @@ -334,12 +331,10 @@ fn maybe_data_types_without_coercion( if current_type == valid_type { new_type.push(current_type.clone()) + } else if can_cast_types(current_type, valid_type) { + new_type.push(valid_type.clone()) } else { - if can_cast_types(current_type, valid_type) { - new_type.push(valid_type.clone()) - } else { - return None; - } + return None; } } Some(new_type) diff --git a/datafusion/sqllogictest/test_files/functions.slt b/datafusion/sqllogictest/test_files/functions.slt index 0e882460753a..44eb21431e2e 100644 --- a/datafusion/sqllogictest/test_files/functions.slt +++ b/datafusion/sqllogictest/test_files/functions.slt @@ -1306,17 +1306,43 @@ select ---- 2 Decimal256(22, 2) -# coalesce static empty value -query T -SELECT COALESCE('', 'test') +# coalesce string +query TT +select + coalesce('', 'test'), + coalesce(null, 'test'); ---- -(empty) +(empty) test -# coalesce static value with null -query T -SELECT COALESCE(NULL, 'test') +# coalesce utf8 and large utf8 +query TT +select + coalesce('a', arrow_cast('b', 'LargeUtf8')), + arrow_typeof(coalesce('a', arrow_cast('b', 'LargeUtf8'))) +; ---- -test +a LargeUtf8 + +# coalesce array +query ?T +select + coalesce(array[1, 2], array[3, 4]), + arrow_typeof(coalesce(array[1, 2], array[3, 4])); +---- +[1, 2] List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) + +query ?T +select + coalesce(null, array[3, 4]), + arrow_typeof(coalesce(array[1, 2], array[3, 4])); +---- +[3, 4] List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) + +# TODO: after switch signature of array to the same with coalesce, this query should be fixed +query error +select + coalesce(array[1, 2], array[arrow_cast(3, 'Int32'), arrow_cast(4, 'Int32')]), + arrow_typeof(coalesce(array[1, 2], array[arrow_cast(3, 'Int32'), arrow_cast(4, 'Int32')])); statement ok create table test1 as values (arrow_cast('foo', 'Dictionary(Int32, Utf8)')), (null); @@ -1328,7 +1354,7 @@ select coalesce(column1, 'none_set') from test1; query error select coalesce(null, column1, 'none_set') from test1; -# explicit cast to Utf8 +# explicitly cast to Utf8 query T select coalesce(arrow_cast(column1, 'Utf8'), 'none_set') from test1; ---- @@ -1348,7 +1374,7 @@ select coalesce(arrow_cast(123, 'Dictionary(Int32, Int8)'), 34); query error select coalesce(null, 34, arrow_cast(123, 'Dictionary(Int32, Int8)')); -# explicit cast to Int8, and it will implicitly cast to Int64 +# explicitly cast to Int8, and it will implicitly cast to Int64 query IT select coalesce(arrow_cast(123, 'Int8'), 34), @@ -1377,7 +1403,7 @@ SELECT COALESCE(c1, c2) FROM test 1 NULL -# int and string are not coercible +# numeric string is coerced to numeric in both Postgres and DuckDB query I SELECT COALESCE(c1, c2, '-1') FROM test; ---- From dfc41766bf154a84e0ed11aa57b80acc3f38373c Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Mon, 29 Apr 2024 21:06:49 +0800 Subject: [PATCH 19/37] cleanup Signed-off-by: jayzhan211 --- datafusion/expr/src/type_coercion/binary.rs | 5 +++++ datafusion/expr/src/type_coercion/functions.rs | 15 ++------------- datafusion/functions/src/core/coalesce.rs | 11 ++++------- .../sqllogictest/test_files/functions.slt | 18 +++++++++++++----- 4 files changed, 24 insertions(+), 25 deletions(-) diff --git a/datafusion/expr/src/type_coercion/binary.rs b/datafusion/expr/src/type_coercion/binary.rs index 3aa786e27b69..f69aa45d14f8 100644 --- a/datafusion/expr/src/type_coercion/binary.rs +++ b/datafusion/expr/src/type_coercion/binary.rs @@ -872,6 +872,11 @@ fn string_concat_internal_coercion( /// to string type. fn string_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { use arrow::datatypes::DataType::*; + + if let Some(t) = pure_string_coercion(lhs_type, rhs_type) { + return Some(t); + } + match (lhs_type, rhs_type) { // TODO: cast between array elements (#6558) (List(_), List(_)) => Some(lhs_type.clone()), diff --git a/datafusion/expr/src/type_coercion/functions.rs b/datafusion/expr/src/type_coercion/functions.rs index e4969a052774..da9ef621bb0d 100644 --- a/datafusion/expr/src/type_coercion/functions.rs +++ b/datafusion/expr/src/type_coercion/functions.rs @@ -20,7 +20,7 @@ use std::sync::Arc; use crate::signature::{ ArrayFunctionSignature, FIXED_SIZE_LIST_WILDCARD, TIMEZONE_WILDCARD, }; -use crate::type_coercion::binary::{decimal_coercion, type_resolution}; +use crate::type_coercion::binary::type_resolution; use crate::{Signature, TypeSignature}; use arrow::{ compute::can_cast_types, @@ -360,7 +360,7 @@ pub fn can_coerce_from(type_into: &DataType, type_from: &DataType) -> bool { /// Expect uni-directional coercion, for example, i32 is coerced to i64, but i64 is not coerced to i32. /// /// Unlike [comparison_coercion], the coerced type is usually `wider` for lossless conversion. -pub(crate) fn coerced_from<'a>( +fn coerced_from<'a>( type_into: &'a DataType, type_from: &'a DataType, ) -> Option { @@ -379,14 +379,6 @@ pub(crate) fn coerced_from<'a>( { Some(type_into.clone()) } - (Decimal128(_, _), Decimal128(_, _)) | (Decimal256(_, _), Decimal256(_, _)) => { - decimal_coercion(type_into, type_from) - } - (Decimal128(_, _) | Decimal256(_, _), _) - if matches!(type_from, Int8 | Int16 | Int32 | Int64) => - { - decimal_coercion(type_into, type_from) - } // coerced into type_into (Int8, _) if matches!(type_from, Null | Int8) => Some(type_into.clone()), (Int16, _) if matches!(type_from, Null | Int8 | Int16 | UInt8) => { @@ -406,15 +398,12 @@ pub(crate) fn coerced_from<'a>( Some(type_into.clone()) } (UInt8, _) if matches!(type_from, Null | UInt8) => Some(type_into.clone()), - (UInt8, _) if matches!(type_from, Null | Int8) => Some(Int16), (UInt16, _) if matches!(type_from, Null | UInt8 | UInt16) => { Some(type_into.clone()) } - (UInt16, _) if matches!(type_from, Null | Int8 | Int16) => Some(Int32), (UInt32, _) if matches!(type_from, Null | UInt8 | UInt16 | UInt32) => { Some(type_into.clone()) } - (UInt32, _) if matches!(type_from, Null | Int8 | Int16 | Int32) => Some(Int64), (UInt64, _) if matches!(type_from, Null | UInt8 | UInt16 | UInt32 | UInt64) => { Some(type_into.clone()) } diff --git a/datafusion/functions/src/core/coalesce.rs b/datafusion/functions/src/core/coalesce.rs index e27b4fd04414..61885103f169 100644 --- a/datafusion/functions/src/core/coalesce.rs +++ b/datafusion/functions/src/core/coalesce.rs @@ -22,8 +22,8 @@ use arrow::compute::kernels::zip::zip; use arrow::compute::{and, is_not_null, is_null}; use arrow::datatypes::DataType; -use datafusion_common::{exec_err, internal_err, Result}; -use datafusion_expr::type_coercion::binary::type_resolution; +use datafusion_common::{exec_err, Result}; +// use datafusion_expr::type_coercion::binary::type_resolution; use datafusion_expr::ColumnarValue; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; @@ -60,11 +60,8 @@ impl ScalarUDFImpl for CoalesceFunc { } fn return_type(&self, arg_types: &[DataType]) -> Result { - if let Some(common_type) = type_resolution(arg_types) { - Ok(common_type) - } else { - internal_err!("Error should be thrown via signature validation") - } + // In theroy arg_types are coerced types, so we can just return the first one. + Ok(arg_types[0].clone()) } /// coalesce evaluates to the first value which is not NULL diff --git a/datafusion/sqllogictest/test_files/functions.slt b/datafusion/sqllogictest/test_files/functions.slt index 44eb21431e2e..a0d495d34f70 100644 --- a/datafusion/sqllogictest/test_files/functions.slt +++ b/datafusion/sqllogictest/test_files/functions.slt @@ -1167,10 +1167,10 @@ select coalesce(1, 2, 3); 1 # test with first null -query I -select coalesce(null, 3, 2, 1); +query ?T +select coalesce(null, 3, 2, 1), arrow_typeof(coalesce(null, 3, 2, 1)); ---- -3 +3 Int64 # test with null values query ? @@ -1179,7 +1179,7 @@ select coalesce(null, null); NULL # cast to float -query RT +query IT select coalesce(1, 2.0), arrow_typeof(coalesce(1, 2.0)) @@ -1188,6 +1188,14 @@ select 1 Float64 query RT +select + coalesce(2.0, 1), + arrow_typeof(coalesce(2.0, 1)) +; +---- +2 Float64 + +query IT select coalesce(1, arrow_cast(2.0, 'Float32')), arrow_typeof(coalesce(1, arrow_cast(2.0, 'Float32'))) @@ -1307,7 +1315,7 @@ select 2 Decimal256(22, 2) # coalesce string -query TT +query T? select coalesce('', 'test'), coalesce(null, 'test'); From 46a906065db808e3232720cc2990acfe45ac13ab Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Mon, 29 Apr 2024 21:11:12 +0800 Subject: [PATCH 20/37] add null case Signed-off-by: jayzhan211 --- datafusion/sqllogictest/test_files/functions.slt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/datafusion/sqllogictest/test_files/functions.slt b/datafusion/sqllogictest/test_files/functions.slt index a0d495d34f70..8af5563b8689 100644 --- a/datafusion/sqllogictest/test_files/functions.slt +++ b/datafusion/sqllogictest/test_files/functions.slt @@ -1213,6 +1213,12 @@ select coalesce(arrow_cast(1, 'Int32'), arrow_cast(1, 'Int64')); ---- 1 +# test with nulls +query ?T +select coalesce(null, null, null), arrow_typeof(coalesce(null, null)); +---- +NULL Null + # i32 and u32, cast to wider type i64 query IT select From d6566454abaf9fcfab4a6e8c05e3f29e4583f094 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Mon, 29 Apr 2024 21:13:16 +0800 Subject: [PATCH 21/37] fmt Signed-off-by: jayzhan211 --- datafusion/expr/src/type_coercion/binary.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/expr/src/type_coercion/binary.rs b/datafusion/expr/src/type_coercion/binary.rs index f69aa45d14f8..c77b8f5d66c5 100644 --- a/datafusion/expr/src/type_coercion/binary.rs +++ b/datafusion/expr/src/type_coercion/binary.rs @@ -439,7 +439,7 @@ fn type_resolution_coercion( /// Coerce `lhs_type` and `rhs_type` to a common type for the purposes of a comparison operation /// Unlike [coerced_from], usually the coerced type is for comparison only. /// For example, compare with Dictionary and Dictionary, only value type is what we care about -/// +/// /// [coerced_from]: super::functions::coerced_from pub fn comparison_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { if lhs_type == rhs_type { From 568344739dbc43e7dd1015b1bb1da49ca12b7382 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Mon, 29 Apr 2024 21:24:05 +0800 Subject: [PATCH 22/37] fix Signed-off-by: jayzhan211 --- datafusion/expr/src/type_coercion/binary.rs | 4 +--- datafusion/expr/src/type_coercion/functions.rs | 3 --- datafusion/functions/src/core/coalesce.rs | 3 +-- 3 files changed, 2 insertions(+), 8 deletions(-) diff --git a/datafusion/expr/src/type_coercion/binary.rs b/datafusion/expr/src/type_coercion/binary.rs index c77b8f5d66c5..f03594c1b16d 100644 --- a/datafusion/expr/src/type_coercion/binary.rs +++ b/datafusion/expr/src/type_coercion/binary.rs @@ -437,10 +437,8 @@ fn type_resolution_coercion( } /// Coerce `lhs_type` and `rhs_type` to a common type for the purposes of a comparison operation -/// Unlike [coerced_from], usually the coerced type is for comparison only. +/// Unlike `coerced_from`, usually the coerced type is for comparison only. /// For example, compare with Dictionary and Dictionary, only value type is what we care about -/// -/// [coerced_from]: super::functions::coerced_from pub fn comparison_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { if lhs_type == rhs_type { // same type => equality is possible diff --git a/datafusion/expr/src/type_coercion/functions.rs b/datafusion/expr/src/type_coercion/functions.rs index da9ef621bb0d..175d3cc58eb4 100644 --- a/datafusion/expr/src/type_coercion/functions.rs +++ b/datafusion/expr/src/type_coercion/functions.rs @@ -454,9 +454,6 @@ fn coerced_from<'a>( } // Any type can be coerced into strings (Utf8 | LargeUtf8, _) => Some(type_into.clone()), - // convert string numeric to numeric, let arrow::cast handle the actual conversion - // expect arrow error for non-numeric strings - // (data_type, Utf8 | LargeUtf8) if data_type.is_numeric() => Some(type_into.clone()), (Null, _) if can_cast_types(type_from, type_into) => Some(type_into.clone()), (List(_), _) if matches!(type_from, FixedSizeList(_, _)) => { diff --git a/datafusion/functions/src/core/coalesce.rs b/datafusion/functions/src/core/coalesce.rs index 61885103f169..06a79b87ad15 100644 --- a/datafusion/functions/src/core/coalesce.rs +++ b/datafusion/functions/src/core/coalesce.rs @@ -23,7 +23,6 @@ use arrow::compute::{and, is_not_null, is_null}; use arrow::datatypes::DataType; use datafusion_common::{exec_err, Result}; -// use datafusion_expr::type_coercion::binary::type_resolution; use datafusion_expr::ColumnarValue; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; @@ -60,7 +59,7 @@ impl ScalarUDFImpl for CoalesceFunc { } fn return_type(&self, arg_types: &[DataType]) -> Result { - // In theroy arg_types are coerced types, so we can just return the first one. + // arg_types are coerced, so we can just return the first one. Ok(arg_types[0].clone()) } From b949fae5854e3168f686e6563c98e6433687aeae Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Tue, 30 Apr 2024 07:52:35 +0800 Subject: [PATCH 23/37] rename to type_union_resolution Signed-off-by: jayzhan211 --- datafusion/expr/src/type_coercion/binary.rs | 19 ++++++++++++------- .../expr/src/type_coercion/functions.rs | 4 ++-- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/datafusion/expr/src/type_coercion/binary.rs b/datafusion/expr/src/type_coercion/binary.rs index f03594c1b16d..dbbb0fa72973 100644 --- a/datafusion/expr/src/type_coercion/binary.rs +++ b/datafusion/expr/src/type_coercion/binary.rs @@ -350,11 +350,15 @@ fn data_type_category(data_type: &DataType) -> TypeCategory { TypeCategory::NotSupported } -/// Coerce `lhs_type` and `rhs_type` to a common type for the purposes of constructs including -/// CASE, ARRAY, VALUES, and the GREATEST and LEAST functions. +/// Coerce dissimilar data types to a single data type. +/// UNION, INTERSECT, EXCEPT, CASE, ARRAY, VALUES, and the GREATEST and LEAST functions are +/// examples that has the similar resolution rules. /// See for more information. -/// The actual rules follows the behavior of Postgres and DuckDB -pub fn type_resolution(data_types: &[DataType]) -> Option { +/// The rules in the document provide a clue, but adhering strictly to them doesn't precisely +/// align with the behavior of Postgres. Therefore, we've made slight adjustments to the rules +/// to better match the behavior of both Postgres and DuckDB. For example, we expect adjusted +/// decimal percision and scale when coercing decimal types. +pub(super) fn type_union_resolution(data_types: &[DataType]) -> Option { if data_types.is_empty() { return None; } @@ -407,7 +411,7 @@ pub fn type_resolution(data_types: &[DataType]) -> Option { // For example, // i64 and decimal(7, 2) are expect to get coerced type decimal(22, 2) // numeric string ('1') and numeric (2) are expect to get coerced type numeric (1, 2) - if let Some(t) = type_resolution_coercion(data_type, candidate_t) { + if let Some(t) = type_union_resolution_coercion(data_type, candidate_t) { candidate_type = Some(t); } else { return None; @@ -420,8 +424,9 @@ pub fn type_resolution(data_types: &[DataType]) -> Option { candidate_type } -/// See [type_resolution] for more information. -fn type_resolution_coercion( +/// Coerce `lhs_type` and `rhs_type` to a common type for [type_union_resolution] +/// See [type_union_resolution] for more information. +fn type_union_resolution_coercion( lhs_type: &DataType, rhs_type: &DataType, ) -> Option { diff --git a/datafusion/expr/src/type_coercion/functions.rs b/datafusion/expr/src/type_coercion/functions.rs index 175d3cc58eb4..9d2a303d3f55 100644 --- a/datafusion/expr/src/type_coercion/functions.rs +++ b/datafusion/expr/src/type_coercion/functions.rs @@ -20,7 +20,7 @@ use std::sync::Arc; use crate::signature::{ ArrayFunctionSignature, FIXED_SIZE_LIST_WILDCARD, TIMEZONE_WILDCARD, }; -use crate::type_coercion::binary::type_resolution; +use crate::type_coercion::binary::type_union_resolution; use crate::{Signature, TypeSignature}; use arrow::{ compute::can_cast_types, @@ -197,7 +197,7 @@ fn get_valid_types( .map(|valid_type| (0..*number).map(|_| valid_type.clone()).collect()) .collect(), TypeSignature::VariadicEqualOrNull => { - if let Some(common_type) = type_resolution(current_types) { + if let Some(common_type) = type_union_resolution(current_types) { vec![vec![common_type; current_types.len()]] } else { vec![] From 5aaeb5b327dd3e8b164ae5139c53643fda18312e Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Tue, 30 Apr 2024 07:55:02 +0800 Subject: [PATCH 24/37] add comment Signed-off-by: jayzhan211 --- datafusion/expr/src/type_coercion/binary.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/datafusion/expr/src/type_coercion/binary.rs b/datafusion/expr/src/type_coercion/binary.rs index dbbb0fa72973..e6f3301250ce 100644 --- a/datafusion/expr/src/type_coercion/binary.rs +++ b/datafusion/expr/src/type_coercion/binary.rs @@ -880,6 +880,7 @@ fn string_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option return Some(t); } + // TODO: Move out to a separate function or remove it. Review the need for this. match (lhs_type, rhs_type) { // TODO: cast between array elements (#6558) (List(_), List(_)) => Some(lhs_type.clone()), From 15471ab3a88f370130d219f4e241b4862e0dc2ea Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Thu, 2 May 2024 09:27:33 +0800 Subject: [PATCH 25/37] cleanup Signed-off-by: jayzhan211 --- datafusion/expr/src/signature.rs | 23 ++--- datafusion/expr/src/type_coercion/binary.rs | 84 ++++++++++--------- .../expr/src/type_coercion/functions.rs | 27 ++++-- datafusion/functions/src/core/coalesce.rs | 2 +- 4 files changed, 75 insertions(+), 61 deletions(-) diff --git a/datafusion/expr/src/signature.rs b/datafusion/expr/src/signature.rs index 3b48656b2c9c..615538ee56a7 100644 --- a/datafusion/expr/src/signature.rs +++ b/datafusion/expr/src/signature.rs @@ -92,7 +92,9 @@ pub enum TypeSignature { /// A function such as `concat` is `Variadic(vec![DataType::Utf8, DataType::LargeUtf8])` Variadic(Vec), /// One or more arguments of an arbitrary but equal type. - /// DataFusion attempts to coerce all argument types to match to the common type with comparision coercion. + /// DataFusion attempts to coerce all argument types to match to the common type with [comparision coercion]. + /// + /// [comparision coercion]: https://docs.rs/datafusion/latest/datafusion/logical_expr/type_coercion/binary/fn.comparison_coercion.html /// /// # Examples /// Given types in signature should be coercible to the same final type. @@ -101,13 +103,12 @@ pub enum TypeSignature { /// `make_array(i32, i64) -> make_array(i64, i64)` VariadicEqual, /// One or more arguments of an arbitrary but equal type or Null. - /// Non-comparison coercion is attempted to match the signatures. + /// [Type union coercion] is attempted to match the signatures. + /// + /// Functions like `coalesce` is `Union(VariadicAny)`. /// - /// Functions like `coalesce` is `VariadicEqual`. - // TODO: Temporary Signature, to differentiate existing VariadicEqual. - // After we swtich `make_array` to VariadicEqualOrNull, - // we can reuse VariadicEqual. - VariadicEqualOrNull, + /// [Type union coercion]: https://docs.rs/datafusion/latest/datafusion/logical_expr/type_coercion/binary/fn.type_union_resolution.html + Union(Box), /// One or more arguments with arbitrary types VariadicAny, /// Fixed number of arguments of an arbitrary but equal type out of a list of valid types. @@ -201,8 +202,8 @@ impl TypeSignature { TypeSignature::VariadicEqual => { vec!["CoercibleT, .., CoercibleT".to_string()] } - TypeSignature::VariadicEqualOrNull => { - vec!["CoercibleT or NULL, .., CoercibleT or NULL".to_string()] + TypeSignature::Union(sig) => { + vec![format!("Union({:?})", sig.to_string_repr())] } TypeSignature::VariadicAny => vec!["Any, .., Any".to_string()], TypeSignature::OneOf(sigs) => { @@ -274,9 +275,9 @@ impl Signature { } } /// One or more number of arguments of the same type. - pub fn variadic_equal_or_null(volatility: Volatility) -> Self { + pub fn union_variadic(volatility: Volatility) -> Self { Self { - type_signature: TypeSignature::VariadicEqualOrNull, + type_signature: TypeSignature::Union(Box::new(TypeSignature::VariadicAny)), volatility, } } diff --git a/datafusion/expr/src/type_coercion/binary.rs b/datafusion/expr/src/type_coercion/binary.rs index e6f3301250ce..9f5b988f83ae 100644 --- a/datafusion/expr/src/type_coercion/binary.rs +++ b/datafusion/expr/src/type_coercion/binary.rs @@ -302,52 +302,54 @@ enum TypeCategory { NotSupported, } -fn data_type_category(data_type: &DataType) -> TypeCategory { - if data_type.is_numeric() { - return TypeCategory::Numeric; - } +impl From<&DataType> for TypeCategory { + fn from(data_type: &DataType) -> Self { + if data_type.is_numeric() { + return TypeCategory::Numeric; + } - if matches!(data_type, DataType::Boolean) { - return TypeCategory::Boolean; - } + if matches!(data_type, DataType::Boolean) { + return TypeCategory::Boolean; + } - if matches!( - data_type, - DataType::List(_) | DataType::FixedSizeList(_, _) | DataType::LargeList(_) - ) { - return TypeCategory::Array; - } + if matches!( + data_type, + DataType::List(_) | DataType::FixedSizeList(_, _) | DataType::LargeList(_) + ) { + return TypeCategory::Array; + } - // String literal is possible to cast to many other types like numeric or datetime, - // therefore, it is categorized as a unknown type - if matches!( - data_type, - DataType::Utf8 | DataType::LargeUtf8 | DataType::Null - ) { - return TypeCategory::Unknown; - } + // String literal is possible to cast to many other types like numeric or datetime, + // therefore, it is categorized as a unknown type + if matches!( + data_type, + DataType::Utf8 | DataType::LargeUtf8 | DataType::Null + ) { + return TypeCategory::Unknown; + } - if matches!( - data_type, - DataType::Date32 - | DataType::Date64 - | DataType::Time32(_) - | DataType::Time64(_) - | DataType::Timestamp(_, _) - | DataType::Interval(_) - | DataType::Duration(_) - ) { - return TypeCategory::DateTime; - } + if matches!( + data_type, + DataType::Date32 + | DataType::Date64 + | DataType::Time32(_) + | DataType::Time64(_) + | DataType::Timestamp(_, _) + | DataType::Interval(_) + | DataType::Duration(_) + ) { + return TypeCategory::DateTime; + } - if matches!( - data_type, - DataType::Dictionary(_, _) | DataType::Struct(_) | DataType::Union(_, _) - ) { - return TypeCategory::Composite; - } + if matches!( + data_type, + DataType::Dictionary(_, _) | DataType::Struct(_) | DataType::Union(_, _) + ) { + return TypeCategory::Composite; + } - TypeCategory::NotSupported + TypeCategory::NotSupported + } } /// Coerce dissimilar data types to a single data type. @@ -377,7 +379,7 @@ pub(super) fn type_union_resolution(data_types: &[DataType]) -> Option let data_types_category: Vec = data_types .iter() .filter(|&t| t != &DataType::Null) - .map(data_type_category) + .map(|t| t.into()) .collect(); if data_types_category diff --git a/datafusion/expr/src/type_coercion/functions.rs b/datafusion/expr/src/type_coercion/functions.rs index 9d2a303d3f55..c4c0c2e75c86 100644 --- a/datafusion/expr/src/type_coercion/functions.rs +++ b/datafusion/expr/src/type_coercion/functions.rs @@ -27,7 +27,9 @@ use arrow::{ datatypes::{DataType, TimeUnit}, }; use datafusion_common::utils::{coerced_fixed_size_list_to_list, list_ndims}; -use datafusion_common::{internal_datafusion_err, internal_err, plan_err, Result}; +use datafusion_common::{ + internal_datafusion_err, internal_err, not_impl_err, plan_err, Result, +}; use super::binary::comparison_coercion; @@ -64,7 +66,7 @@ pub fn data_types( // Well-supported signature that returns exact valid types. if !valid_types.is_empty() - && matches!(signature.type_signature, TypeSignature::VariadicEqualOrNull) + && matches!(signature.type_signature, TypeSignature::Union(_)) { // exact valid types assert_eq!(valid_types.len(), 1); @@ -196,13 +198,21 @@ fn get_valid_types( .iter() .map(|valid_type| (0..*number).map(|_| valid_type.clone()).collect()) .collect(), - TypeSignature::VariadicEqualOrNull => { - if let Some(common_type) = type_union_resolution(current_types) { - vec![vec![common_type; current_types.len()]] - } else { - vec![] + TypeSignature::Union(sig) => match sig.as_ref() { + TypeSignature::VariadicAny => { + if let Some(common_type) = type_union_resolution(current_types) { + vec![vec![common_type; current_types.len()]] + } else { + vec![] + } } - } + _ => { + return not_impl_err!( + "Union type with signature {:?} not supported.", + sig + ) + } + }, TypeSignature::VariadicEqual => { let new_type = current_types.iter().skip(1).try_fold( current_types.first().unwrap().clone(), @@ -332,6 +342,7 @@ fn maybe_data_types_without_coercion( if current_type == valid_type { new_type.push(current_type.clone()) } else if can_cast_types(current_type, valid_type) { + // validate the valid type is castable from the current type new_type.push(valid_type.clone()) } else { return None; diff --git a/datafusion/functions/src/core/coalesce.rs b/datafusion/functions/src/core/coalesce.rs index 06a79b87ad15..d54c615ac1de 100644 --- a/datafusion/functions/src/core/coalesce.rs +++ b/datafusion/functions/src/core/coalesce.rs @@ -40,7 +40,7 @@ impl Default for CoalesceFunc { impl CoalesceFunc { pub fn new() -> Self { Self { - signature: Signature::variadic_equal_or_null(Volatility::Immutable), + signature: Signature::union_variadic(Volatility::Immutable), } } } From e5cc46baa879ae7e8898130888d285fbb8aa87ee Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Thu, 2 May 2024 10:22:46 +0800 Subject: [PATCH 26/37] fix test Signed-off-by: jayzhan211 --- .../sqllogictest/test_files/coalesce.slt | 36 ++++++------------- 1 file changed, 11 insertions(+), 25 deletions(-) diff --git a/datafusion/sqllogictest/test_files/coalesce.slt b/datafusion/sqllogictest/test_files/coalesce.slt index 527d4fe9c41e..704e69f88b06 100644 --- a/datafusion/sqllogictest/test_files/coalesce.slt +++ b/datafusion/sqllogictest/test_files/coalesce.slt @@ -23,7 +23,7 @@ select coalesce(1, 2, 3); 1 # test with first null -query IT +query ?T select coalesce(null, 3, 2, 1), arrow_typeof(coalesce(null, 3, 2, 1)); ---- 3 Int64 @@ -35,7 +35,7 @@ select coalesce(null, null); NULL # cast to float -query RT +query IT select coalesce(1, 2.0), arrow_typeof(coalesce(1, 2.0)) @@ -51,7 +51,7 @@ select ---- 2 Float64 -query RT +query IT select coalesce(1, arrow_cast(2.0, 'Float32')), arrow_typeof(coalesce(1, arrow_cast(2.0, 'Float32'))) @@ -177,7 +177,7 @@ select 2 Decimal256(22, 2) # coalesce string -query TT +query T? select coalesce('', 'test'), coalesce(null, 'test'); @@ -209,28 +209,20 @@ select [3, 4] List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) # TODO: after switch signature of array to the same with coalesce, this query should be fixed -query ?T +query error select coalesce(array[1, 2], array[arrow_cast(3, 'Int32'), arrow_cast(4, 'Int32')]), arrow_typeof(coalesce(array[1, 2], array[arrow_cast(3, 'Int32'), arrow_cast(4, 'Int32')])); ----- -[1, 2] List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) statement ok create table test1 as values (arrow_cast('foo', 'Dictionary(Int32, Utf8)')), (null); # Dictionary and String are not coercible -query ? +query error select coalesce(column1, 'none_set') from test1; ----- -foo -none_set -query T +query error select coalesce(null, column1, 'none_set') from test1; ----- -foo -none_set # explicitly cast to Utf8 query T @@ -243,20 +235,14 @@ statement ok drop table test1 # Numeric and Dictionary are not coercible -query I +query error select coalesce(34, arrow_cast(123, 'Dictionary(Int32, Int8)')); ----- -34 -query I +query error select coalesce(arrow_cast(123, 'Dictionary(Int32, Int8)'), 34); ----- -123 -query I +query error select coalesce(null, 34, arrow_cast(123, 'Dictionary(Int32, Int8)')); ----- -34 # explicitly cast to Int8, and it will implicitly cast to Int64 query IT @@ -288,7 +274,7 @@ SELECT COALESCE(c1, c2) FROM test NULL # numeric string is coerced to numeric in both Postgres and DuckDB -query T +query I SELECT COALESCE(c1, c2, '-1') FROM test; ---- 0 From a810e8536c05b624cf8d30e9c8925955944c531e Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Thu, 2 May 2024 11:55:31 +0800 Subject: [PATCH 27/37] add comment Signed-off-by: jayzhan211 --- datafusion/expr/src/signature.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/datafusion/expr/src/signature.rs b/datafusion/expr/src/signature.rs index 615538ee56a7..e480d717835c 100644 --- a/datafusion/expr/src/signature.rs +++ b/datafusion/expr/src/signature.rs @@ -267,14 +267,16 @@ impl Signature { volatility, } } - /// One or more number of arguments to the same type. + /// One or more number of arguments that is coercible to the same type, with `comparison_coercion` + /// See [TypeSignature::VariadicEqual] for more details. pub fn variadic_equal(volatility: Volatility) -> Self { Self { type_signature: TypeSignature::VariadicEqual, volatility, } } - /// One or more number of arguments of the same type. + /// One or more number of arguments that is coercible to the same type, with `type_union_resolution` + /// See [TypeSignature::Union] for more details. pub fn union_variadic(volatility: Volatility) -> Self { Self { type_signature: TypeSignature::Union(Box::new(TypeSignature::VariadicAny)), From cb16cdad8f4de0b15279f13305ea2c51b196085c Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Fri, 3 May 2024 11:32:09 +0800 Subject: [PATCH 28/37] rm test Signed-off-by: jayzhan211 --- .../sqllogictest/test_files/functions.slt | 346 +----------------- 1 file changed, 1 insertion(+), 345 deletions(-) diff --git a/datafusion/sqllogictest/test_files/functions.slt b/datafusion/sqllogictest/test_files/functions.slt index 8af5563b8689..2227eb172ddc 100644 --- a/datafusion/sqllogictest/test_files/functions.slt +++ b/datafusion/sqllogictest/test_files/functions.slt @@ -1157,348 +1157,4 @@ statement ok drop table uuid_table statement ok -drop table t - - -# Test coalesce function -query I -select coalesce(1, 2, 3); ----- -1 - -# test with first null -query ?T -select coalesce(null, 3, 2, 1), arrow_typeof(coalesce(null, 3, 2, 1)); ----- -3 Int64 - -# test with null values -query ? -select coalesce(null, null); ----- -NULL - -# cast to float -query IT -select - coalesce(1, 2.0), - arrow_typeof(coalesce(1, 2.0)) -; ----- -1 Float64 - -query RT -select - coalesce(2.0, 1), - arrow_typeof(coalesce(2.0, 1)) -; ----- -2 Float64 - -query IT -select - coalesce(1, arrow_cast(2.0, 'Float32')), - arrow_typeof(coalesce(1, arrow_cast(2.0, 'Float32'))) -; ----- -1 Float32 - -# test with empty args -query error -select coalesce(); - -# test with different types -query I -select coalesce(arrow_cast(1, 'Int32'), arrow_cast(1, 'Int64')); ----- -1 - -# test with nulls -query ?T -select coalesce(null, null, null), arrow_typeof(coalesce(null, null)); ----- -NULL Null - -# i32 and u32, cast to wider type i64 -query IT -select - coalesce(arrow_cast(2, 'Int32'), arrow_cast(3, 'UInt32')), - arrow_typeof(coalesce(arrow_cast(2, 'Int32'), arrow_cast(3, 'UInt32'))); ----- -2 Int64 - -query IT -select - coalesce(arrow_cast(2, 'Int16'), arrow_cast(3, 'UInt16')), - arrow_typeof(coalesce(arrow_cast(2, 'Int16'), arrow_cast(3, 'UInt16'))); ----- -2 Int32 - -query IT -select - coalesce(arrow_cast(2, 'Int8'), arrow_cast(3, 'UInt8')), - arrow_typeof(coalesce(arrow_cast(2, 'Int8'), arrow_cast(3, 'UInt8'))); ----- -2 Int16 - -# i64 and u32, cast to wider type i64 -query IT -select - coalesce(2, arrow_cast(3, 'UInt32')), - arrow_typeof(coalesce(2, arrow_cast(3, 'UInt32'))); ----- -2 Int64 - -# TODO: Got types (i64, u64), casting to decimal or double or even i128 if supported -query IT -select - coalesce(2, arrow_cast(3, 'UInt64')), - arrow_typeof(coalesce(2, arrow_cast(3, 'UInt64'))); ----- -2 Int64 - -statement ok -create table t1 (a bigint, b int, c int) as values (null, null, 1), (null, 2, null); - -# Follow Postgres and DuckDB behavior, since a is bigint, although the value is null, all args are coerced to bigint -query IT -select - coalesce(a, b, c), - arrow_typeof(coalesce(a, b, c)) -from t1; ----- -1 Int64 -2 Int64 - -# b, c has the same type int, so the result is int -query IT -select - coalesce(b, c), - arrow_typeof(coalesce(b, c)) -from t1; ----- -1 Int32 -2 Int32 - -statement ok -drop table t1; - -# test multi rows -statement ok -CREATE TABLE t1( - c1 int, - c2 int -) as VALUES -(1, 2), -(NULL, 2), -(1, NULL), -(NULL, NULL); - -query I -SELECT COALESCE(c1, c2) FROM t1 ----- -1 -2 -1 -NULL - -statement ok -drop table t1; - -# Decimal128(7, 2) and int64 are coerced to common wider type Decimal128(22, 2) -query RT -select - coalesce(arrow_cast(2, 'Decimal128(7, 2)'), 0), - arrow_typeof(coalesce(arrow_cast(2, 'Decimal128(7, 2)'), 0)) ----- -2 Decimal128(22, 2) - -query RT -select - coalesce(arrow_cast(2, 'Decimal256(7, 2)'), 0), - arrow_typeof(coalesce(arrow_cast(2, 'Decimal256(7, 2)'), 0)); ----- -2 Decimal256(22, 2) - -# coalesce string -query T? -select - coalesce('', 'test'), - coalesce(null, 'test'); ----- -(empty) test - -# coalesce utf8 and large utf8 -query TT -select - coalesce('a', arrow_cast('b', 'LargeUtf8')), - arrow_typeof(coalesce('a', arrow_cast('b', 'LargeUtf8'))) -; ----- -a LargeUtf8 - -# coalesce array -query ?T -select - coalesce(array[1, 2], array[3, 4]), - arrow_typeof(coalesce(array[1, 2], array[3, 4])); ----- -[1, 2] List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) - -query ?T -select - coalesce(null, array[3, 4]), - arrow_typeof(coalesce(array[1, 2], array[3, 4])); ----- -[3, 4] List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) - -# TODO: after switch signature of array to the same with coalesce, this query should be fixed -query error -select - coalesce(array[1, 2], array[arrow_cast(3, 'Int32'), arrow_cast(4, 'Int32')]), - arrow_typeof(coalesce(array[1, 2], array[arrow_cast(3, 'Int32'), arrow_cast(4, 'Int32')])); - -statement ok -create table test1 as values (arrow_cast('foo', 'Dictionary(Int32, Utf8)')), (null); - -# Dictionary and String are not coercible -query error -select coalesce(column1, 'none_set') from test1; - -query error -select coalesce(null, column1, 'none_set') from test1; - -# explicitly cast to Utf8 -query T -select coalesce(arrow_cast(column1, 'Utf8'), 'none_set') from test1; ----- -foo -none_set - -statement ok -drop table test1 - -# Numeric and Dictionary are not coercible -query error -select coalesce(34, arrow_cast(123, 'Dictionary(Int32, Int8)')); - -query error -select coalesce(arrow_cast(123, 'Dictionary(Int32, Int8)'), 34); - -query error -select coalesce(null, 34, arrow_cast(123, 'Dictionary(Int32, Int8)')); - -# explicitly cast to Int8, and it will implicitly cast to Int64 -query IT -select - coalesce(arrow_cast(123, 'Int8'), 34), - arrow_typeof(coalesce(arrow_cast(123, 'Int8'), 34)); ----- -123 Int64 - -statement ok -CREATE TABLE test( - c1 INT, - c2 INT -) as VALUES -(0, 1), -(NULL, 1), -(1, 0), -(NULL, 1), -(NULL, NULL); - -# coalesce result -query I rowsort -SELECT COALESCE(c1, c2) FROM test ----- -0 -1 -1 -1 -NULL - -# numeric string is coerced to numeric in both Postgres and DuckDB -query I -SELECT COALESCE(c1, c2, '-1') FROM test; ----- -0 -1 -1 -1 --1 - -statement ok -drop table test - -statement ok -CREATE TABLE test( - c1 BIGINT, - c2 BIGINT, -) as VALUES -(1, 2), -(NULL, 2), -(1, NULL), -(NULL, NULL); - -# coalesce sum with default value -query I -SELECT SUM(COALESCE(c1, c2, 0)) FROM test ----- -4 - -# coalesce mul with default value -query I -SELECT COALESCE(c1 * c2, 0) FROM test ----- -2 -0 -0 -0 - -statement ok -drop table test - -# coalesce date32 - -statement ok -CREATE TABLE test( - d1_date DATE, - d2_date DATE, - d3_date DATE -) as VALUES - ('2022-12-12','2022-12-12','2022-12-12'), - (NULL,'2022-12-11','2022-12-12'), - ('2022-12-12','2022-12-10','2022-12-12'), - ('2022-12-12',NULL,'2022-12-12'), - ('2022-12-12','2022-12-8','2022-12-12'), - ('2022-12-12','2022-12-7',NULL), - ('2022-12-12',NULL,'2022-12-12'), - (NULL,'2022-12-5','2022-12-12') -; - -query D -SELECT COALESCE(d1_date, d2_date, d3_date) FROM test ----- -2022-12-12 -2022-12-11 -2022-12-12 -2022-12-12 -2022-12-12 -2022-12-12 -2022-12-12 -2022-12-05 - -query T -SELECT arrow_typeof(COALESCE(d1_date, d2_date, d3_date)) FROM test ----- -Date32 -Date32 -Date32 -Date32 -Date32 -Date32 -Date32 -Date32 - -statement ok -drop table test +drop table t \ No newline at end of file From a37da2dcbb9a3301bb6ab41780dffc48e0af9ffe Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Sun, 12 May 2024 15:12:25 +0800 Subject: [PATCH 29/37] cleanup since rebase Signed-off-by: jayzhan211 --- datafusion/expr/src/signature.rs | 3 - datafusion/expr/src/type_coercion/binary.rs | 136 ++++++++++++------ .../expr/src/type_coercion/functions.rs | 50 +++---- datafusion/functions/src/core/coalesce.rs | 24 +--- .../sqllogictest/test_files/coalesce.slt | 20 ++- .../sqllogictest/test_files/functions.slt | 2 +- 6 files changed, 132 insertions(+), 103 deletions(-) diff --git a/datafusion/expr/src/signature.rs b/datafusion/expr/src/signature.rs index 6982818b77a0..5d925c8605ee 100644 --- a/datafusion/expr/src/signature.rs +++ b/datafusion/expr/src/signature.rs @@ -190,9 +190,6 @@ impl TypeSignature { TypeSignature::UserDefined => { vec!["UserDefined".to_string()] } - TypeSignature::Union(sig) => { - vec![format!("Union({:?})", sig.to_string_repr())] - } TypeSignature::VariadicAny => vec!["Any, .., Any".to_string()], TypeSignature::OneOf(sigs) => { sigs.iter().flat_map(|s| s.to_string_repr()).collect() diff --git a/datafusion/expr/src/type_coercion/binary.rs b/datafusion/expr/src/type_coercion/binary.rs index 9f5b988f83ae..4e3f9927ed33 100644 --- a/datafusion/expr/src/type_coercion/binary.rs +++ b/datafusion/expr/src/type_coercion/binary.rs @@ -304,51 +304,62 @@ enum TypeCategory { impl From<&DataType> for TypeCategory { fn from(data_type: &DataType) -> Self { - if data_type.is_numeric() { - return TypeCategory::Numeric; - } + match data_type { + // Dict is a special type in arrow, we check the value type + DataType::Dictionary(_, v) => { + let v = v.as_ref(); + TypeCategory::from(v) + } + _ => { + if data_type.is_numeric() { + return TypeCategory::Numeric; + } - if matches!(data_type, DataType::Boolean) { - return TypeCategory::Boolean; - } + if matches!(data_type, DataType::Boolean) { + return TypeCategory::Boolean; + } - if matches!( - data_type, - DataType::List(_) | DataType::FixedSizeList(_, _) | DataType::LargeList(_) - ) { - return TypeCategory::Array; - } + if matches!( + data_type, + DataType::List(_) + | DataType::FixedSizeList(_, _) + | DataType::LargeList(_) + ) { + return TypeCategory::Array; + } - // String literal is possible to cast to many other types like numeric or datetime, - // therefore, it is categorized as a unknown type - if matches!( - data_type, - DataType::Utf8 | DataType::LargeUtf8 | DataType::Null - ) { - return TypeCategory::Unknown; - } + // String literal is possible to cast to many other types like numeric or datetime, + // therefore, it is categorized as a unknown type + if matches!( + data_type, + DataType::Utf8 | DataType::LargeUtf8 | DataType::Null + ) { + return TypeCategory::Unknown; + } - if matches!( - data_type, - DataType::Date32 - | DataType::Date64 - | DataType::Time32(_) - | DataType::Time64(_) - | DataType::Timestamp(_, _) - | DataType::Interval(_) - | DataType::Duration(_) - ) { - return TypeCategory::DateTime; - } + if matches!( + data_type, + DataType::Date32 + | DataType::Date64 + | DataType::Time32(_) + | DataType::Time64(_) + | DataType::Timestamp(_, _) + | DataType::Interval(_) + | DataType::Duration(_) + ) { + return TypeCategory::DateTime; + } - if matches!( - data_type, - DataType::Dictionary(_, _) | DataType::Struct(_) | DataType::Union(_, _) - ) { - return TypeCategory::Composite; - } + if matches!( + data_type, + DataType::Map(_, _) | DataType::Struct(_) | DataType::Union(_, _) + ) { + return TypeCategory::Composite; + } - TypeCategory::NotSupported + TypeCategory::NotSupported + } + } } } @@ -360,7 +371,7 @@ impl From<&DataType> for TypeCategory { /// align with the behavior of Postgres. Therefore, we've made slight adjustments to the rules /// to better match the behavior of both Postgres and DuckDB. For example, we expect adjusted /// decimal percision and scale when coercing decimal types. -pub(super) fn type_union_resolution(data_types: &[DataType]) -> Option { +pub fn type_union_resolution(data_types: &[DataType]) -> Option { if data_types.is_empty() { return None; } @@ -436,11 +447,46 @@ fn type_union_resolution_coercion( return Some(lhs_type.clone()); } - // numeric coercion is the same as comparison coercion, both find the narrowest type - // that can accommodate both types - binary_numeric_coercion(lhs_type, rhs_type) - .or_else(|| pure_string_coercion(lhs_type, rhs_type)) - .or_else(|| numeric_string_coercion(lhs_type, rhs_type)) + match (lhs_type, rhs_type) { + ( + DataType::Dictionary(lhs_index_type, lhs_value_type), + DataType::Dictionary(rhs_index_type, rhs_value_type), + ) => { + + let new_index_type = type_union_resolution_coercion(lhs_index_type, rhs_index_type); + let new_value_type = type_union_resolution_coercion(lhs_value_type, rhs_value_type); + if let (Some(new_index_type), Some(new_value_type)) = (new_index_type, new_value_type) { + Some(DataType::Dictionary( + Box::new(new_index_type), + Box::new(new_value_type), + )) + } else { + None + } + } + (DataType::Dictionary(index_type, value_type), other_type) + | (other_type, DataType::Dictionary(index_type, value_type)) + => { + let new_value_type = type_union_resolution_coercion(value_type, other_type); + if let Some(new_value_type) = new_value_type { + Some(DataType::Dictionary( + index_type.clone(), + Box::new(new_value_type), + )) + } else { + None + } + } + _ => { + + // numeric coercion is the same as comparison coercion, both find the narrowest type + // that can accommodate both types + binary_numeric_coercion(lhs_type, rhs_type) + .or_else(|| pure_string_coercion(lhs_type, rhs_type)) + .or_else(|| numeric_string_coercion(lhs_type, rhs_type)) + } + } + } /// Coerce `lhs_type` and `rhs_type` to a common type for the purposes of a comparison operation diff --git a/datafusion/expr/src/type_coercion/functions.rs b/datafusion/expr/src/type_coercion/functions.rs index a4fb6c60329c..e9e6ad1ef3f8 100644 --- a/datafusion/expr/src/type_coercion/functions.rs +++ b/datafusion/expr/src/type_coercion/functions.rs @@ -66,20 +66,7 @@ pub fn data_types_with_scalar_udf( return Ok(current_types.to_vec()); } - // Try and coerce the argument types to match the signature, returning the - // coerced types from the first matching signature. - for valid_types in valid_types { - if let Some(types) = maybe_data_types(&valid_types, current_types) { - return Ok(types); - } - } - - // none possible -> Error - plan_err!( - "[data_types_with_scalar_udf] Coercion from {:?} to the signature {:?} failed.", - current_types, - &signature.type_signature - ) + try_coerce_types(valid_types, current_types, &signature.type_signature) } pub fn data_types_with_aggregate_udf( @@ -112,20 +99,7 @@ pub fn data_types_with_aggregate_udf( return Ok(current_types.to_vec()); } - // Try and coerce the argument types to match the signature, returning the - // coerced types from the first matching signature. - for valid_types in valid_types { - if let Some(types) = maybe_data_types(&valid_types, current_types) { - return Ok(types); - } - } - - // none possible -> Error - plan_err!( - "[data_types_with_aggregate_udf] Coercion from {:?} to the signature {:?} failed.", - current_types, - &signature.type_signature - ) + try_coerce_types(valid_types, current_types, &signature.type_signature) } /// Performs type coercion for function arguments. @@ -151,7 +125,7 @@ pub fn data_types( } } - let mut valid_types = get_valid_types(&signature.type_signature, current_types)?; + let valid_types = get_valid_types(&signature.type_signature, current_types)?; if valid_types .iter() .any(|data_type| data_type == current_types) @@ -159,10 +133,18 @@ pub fn data_types( return Ok(current_types.to_vec()); } + try_coerce_types(valid_types, current_types, &signature.type_signature) +} + +fn try_coerce_types( + valid_types: Vec>, + current_types: &[DataType], + type_signature: &TypeSignature, +) -> Result> { + let mut valid_types = valid_types; + // Well-supported signature that returns exact valid types. - if !valid_types.is_empty() - && matches!(signature.type_signature, TypeSignature::Union(_)) - { + if !valid_types.is_empty() && matches!(type_signature, TypeSignature::UserDefined) { // exact valid types assert_eq!(valid_types.len(), 1); let valid_types = valid_types.swap_remove(0); @@ -181,9 +163,9 @@ pub fn data_types( // none possible -> Error plan_err!( - "[data_types] Coercion from {:?} to the signature {:?} failed.", + "Coercion from {:?} to the signature {:?} failed.", current_types, - &signature.type_signature + type_signature ) } diff --git a/datafusion/functions/src/core/coalesce.rs b/datafusion/functions/src/core/coalesce.rs index 63778eb7738a..15a3ddd9d6e9 100644 --- a/datafusion/functions/src/core/coalesce.rs +++ b/datafusion/functions/src/core/coalesce.rs @@ -22,8 +22,8 @@ use arrow::compute::kernels::zip::zip; use arrow::compute::{and, is_not_null, is_null}; use arrow::datatypes::DataType; -use datafusion_common::{exec_err, internal_err, Result}; -use datafusion_expr::type_coercion::binary::comparison_coercion; +use datafusion_common::{exec_err, Result}; +use datafusion_expr::type_coercion::binary::type_union_resolution; use datafusion_expr::ColumnarValue; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; @@ -124,21 +124,11 @@ impl ScalarUDFImpl for CoalesceFunc { } fn coerce_types(&self, arg_types: &[DataType]) -> Result> { - let new_type = arg_types.iter().skip(1).try_fold( - arg_types.first().unwrap().clone(), - |acc, x| { - // The coerced types found by `comparison_coercion` are not guaranteed to be - // coercible for the arguments. `comparison_coercion` returns more loose - // types that can be coerced to both `acc` and `x` for comparison purpose. - // See `maybe_data_types` for the actual coercion. - let coerced_type = comparison_coercion(&acc, x); - if let Some(coerced_type) = coerced_type { - Ok(coerced_type) - } else { - internal_err!("Coercion from {acc:?} to {x:?} failed.") - } - }, - )?; + if arg_types.is_empty() { + return exec_err!("coalesce must have at least one argument"); + } + let new_type = type_union_resolution(arg_types) + .unwrap_or(arg_types.first().unwrap().clone()); Ok(vec![new_type; arg_types.len()]) } } diff --git a/datafusion/sqllogictest/test_files/coalesce.slt b/datafusion/sqllogictest/test_files/coalesce.slt index 1767fd825b19..a0317ac4a5f4 100644 --- a/datafusion/sqllogictest/test_files/coalesce.slt +++ b/datafusion/sqllogictest/test_files/coalesce.slt @@ -209,20 +209,28 @@ select [3, 4] List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) # TODO: after switch signature of array to the same with coalesce, this query should be fixed -query error +query ?T select coalesce(array[1, 2], array[arrow_cast(3, 'Int32'), arrow_cast(4, 'Int32')]), arrow_typeof(coalesce(array[1, 2], array[arrow_cast(3, 'Int32'), arrow_cast(4, 'Int32')])); +---- +[1, 2] List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) statement ok create table test1 as values (arrow_cast('foo', 'Dictionary(Int32, Utf8)')), (null); # Dictionary and String are not coercible -query error +query ? select coalesce(column1, 'none_set') from test1; +---- +foo +none_set query ? select coalesce(null, column1, 'none_set') from test1; +---- +foo +none_set # explicitly cast to Utf8 query T @@ -235,14 +243,20 @@ statement ok drop table test1 # Numeric and Dictionary are not coercible -query error +query I select coalesce(34, arrow_cast(123, 'Dictionary(Int32, Int8)')); +---- +34 query ? select coalesce(arrow_cast(123, 'Dictionary(Int32, Int8)'), 34); +---- +123 query ? select coalesce(null, 34, arrow_cast(123, 'Dictionary(Int32, Int8)')); +---- +34 # explicitly cast to Int8, and it will implicitly cast to Int64 query IT diff --git a/datafusion/sqllogictest/test_files/functions.slt b/datafusion/sqllogictest/test_files/functions.slt index ee1de97d955b..d03b33d0c8e5 100644 --- a/datafusion/sqllogictest/test_files/functions.slt +++ b/datafusion/sqllogictest/test_files/functions.slt @@ -1157,4 +1157,4 @@ statement ok drop table uuid_table statement ok -drop table t \ No newline at end of file +drop table t From 70239e0f9211ad8cb68c2d92d1cfdcded8ff8adc Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Sun, 12 May 2024 15:25:35 +0800 Subject: [PATCH 30/37] add more test Signed-off-by: jayzhan211 --- datafusion/expr/src/type_coercion/binary.rs | 32 +++++--------- .../sqllogictest/test_files/coalesce.slt | 42 +++++++++++++++++-- 2 files changed, 50 insertions(+), 24 deletions(-) diff --git a/datafusion/expr/src/type_coercion/binary.rs b/datafusion/expr/src/type_coercion/binary.rs index 4e3f9927ed33..194637616835 100644 --- a/datafusion/expr/src/type_coercion/binary.rs +++ b/datafusion/expr/src/type_coercion/binary.rs @@ -452,10 +452,13 @@ fn type_union_resolution_coercion( DataType::Dictionary(lhs_index_type, lhs_value_type), DataType::Dictionary(rhs_index_type, rhs_value_type), ) => { - - let new_index_type = type_union_resolution_coercion(lhs_index_type, rhs_index_type); - let new_value_type = type_union_resolution_coercion(lhs_value_type, rhs_value_type); - if let (Some(new_index_type), Some(new_value_type)) = (new_index_type, new_value_type) { + let new_index_type = + type_union_resolution_coercion(lhs_index_type, rhs_index_type); + let new_value_type = + type_union_resolution_coercion(lhs_value_type, rhs_value_type); + if let (Some(new_index_type), Some(new_value_type)) = + (new_index_type, new_value_type) + { Some(DataType::Dictionary( Box::new(new_index_type), Box::new(new_value_type), @@ -465,20 +468,11 @@ fn type_union_resolution_coercion( } } (DataType::Dictionary(index_type, value_type), other_type) - | (other_type, DataType::Dictionary(index_type, value_type)) - => { + | (other_type, DataType::Dictionary(index_type, value_type)) => { let new_value_type = type_union_resolution_coercion(value_type, other_type); - if let Some(new_value_type) = new_value_type { - Some(DataType::Dictionary( - index_type.clone(), - Box::new(new_value_type), - )) - } else { - None - } - } + new_value_type.map(|t| DataType::Dictionary(index_type.clone(), Box::new(t))) + } _ => { - // numeric coercion is the same as comparison coercion, both find the narrowest type // that can accommodate both types binary_numeric_coercion(lhs_type, rhs_type) @@ -486,7 +480,6 @@ fn type_union_resolution_coercion( .or_else(|| numeric_string_coercion(lhs_type, rhs_type)) } } - } /// Coerce `lhs_type` and `rhs_type` to a common type for the purposes of a comparison operation @@ -627,10 +620,7 @@ pub(crate) fn binary_numeric_coercion( } /// Decimal coercion rules. -pub(crate) fn decimal_coercion( - lhs_type: &DataType, - rhs_type: &DataType, -) -> Option { +pub fn decimal_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { use arrow::datatypes::DataType::*; match (lhs_type, rhs_type) { diff --git a/datafusion/sqllogictest/test_files/coalesce.slt b/datafusion/sqllogictest/test_files/coalesce.slt index a0317ac4a5f4..f43f6ef3364b 100644 --- a/datafusion/sqllogictest/test_files/coalesce.slt +++ b/datafusion/sqllogictest/test_files/coalesce.slt @@ -208,7 +208,7 @@ select ---- [3, 4] List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) -# TODO: after switch signature of array to the same with coalesce, this query should be fixed +# coalesce with array query ?T select coalesce(array[1, 2], array[arrow_cast(3, 'Int32'), arrow_cast(4, 'Int32')]), @@ -216,10 +216,10 @@ select ---- [1, 2] List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) +# test dict(int32, utf8) statement ok create table test1 as values (arrow_cast('foo', 'Dictionary(Int32, Utf8)')), (null); -# Dictionary and String are not coercible query ? select coalesce(column1, 'none_set') from test1; ---- @@ -242,7 +242,43 @@ none_set statement ok drop table test1 -# Numeric and Dictionary are not coercible +# test dictionary coercion +statement ok +create table t as values + (arrow_cast('foo', 'Dictionary(Int32, Utf8)')), + (null); + +query ?T +select + coalesce(column1, arrow_cast('bar', 'Dictionary(Int64, LargeUtf8)')), + arrow_typeof(coalesce(column1, arrow_cast('bar', 'Dictionary(Int64, LargeUtf8)'))) +from t; +---- +foo Dictionary(Int64, LargeUtf8) +bar Dictionary(Int64, LargeUtf8) + +query ?T +select + coalesce(column1, arrow_cast('bar', 'Dictionary(Int32, LargeUtf8)')), + arrow_typeof(coalesce(column1, arrow_cast('bar', 'Dictionary(Int32, LargeUtf8)'))) +from t; +---- +foo Dictionary(Int32, LargeUtf8) +bar Dictionary(Int32, LargeUtf8) + +query ?T +select + coalesce(column1, arrow_cast('bar', 'Dictionary(Int64, Utf8)')), + arrow_typeof(coalesce(column1, arrow_cast('bar', 'Dictionary(Int64, Utf8)'))) +from t; +---- +foo Dictionary(Int64, Utf8) +bar Dictionary(Int64, Utf8) + +statement ok +drop table t; + +# test dict(int32, int8) query I select coalesce(34, arrow_cast(123, 'Dictionary(Int32, Int8)')); ---- From be116f84eb776a2d984727414d6d079ead660851 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Sun, 12 May 2024 15:30:13 +0800 Subject: [PATCH 31/37] add more test Signed-off-by: jayzhan211 --- .../sqllogictest/test_files/coalesce.slt | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/datafusion/sqllogictest/test_files/coalesce.slt b/datafusion/sqllogictest/test_files/coalesce.slt index f43f6ef3364b..92eb293d4a9b 100644 --- a/datafusion/sqllogictest/test_files/coalesce.slt +++ b/datafusion/sqllogictest/test_files/coalesce.slt @@ -242,7 +242,23 @@ none_set statement ok drop table test1 -# test dictionary coercion +# test dict coercion with value +statement ok +create table t(c varchar) as values ('a'), (null); + +query TT +select + coalesce(c, arrow_cast('b', 'Dictionary(Int32, Utf8)')), + arrow_typeof(coalesce(c, arrow_cast('b', 'Dictionary(Int32, Utf8)'))) +from t; +---- +a Dictionary(Int32, Utf8) +b Dictionary(Int32, Utf8) + +statement ok +drop table t; + +# test dict coercion with dict statement ok create table t as values (arrow_cast('foo', 'Dictionary(Int32, Utf8)')), From 8f4e991118456970d4a071898422be42cf5b0414 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Sun, 12 May 2024 15:59:45 +0800 Subject: [PATCH 32/37] fix msg Signed-off-by: jayzhan211 --- datafusion/optimizer/src/analyzer/type_coercion.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/optimizer/src/analyzer/type_coercion.rs b/datafusion/optimizer/src/analyzer/type_coercion.rs index 994adf732785..1e5d3f1f4e31 100644 --- a/datafusion/optimizer/src/analyzer/type_coercion.rs +++ b/datafusion/optimizer/src/analyzer/type_coercion.rs @@ -940,7 +940,7 @@ mod test { .err() .unwrap(); assert_eq!( - "type_coercion\ncaused by\nError during planning: [data_types_with_aggregate_udf] Coercion from [Utf8] to the signature Uniform(1, [Float64]) failed.", + "type_coercion\ncaused by\nError during planning: Coercion from [Utf8] to the signature Uniform(1, [Float64]) failed.", err.strip_backtrace() ); Ok(()) From 41535933424295c8dafb9d6eb5fc3fcc1918957e Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Tue, 14 May 2024 19:39:21 +0800 Subject: [PATCH 33/37] fmt Signed-off-by: jayzhan211 --- datafusion/expr/src/type_coercion/binary.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/expr/src/type_coercion/binary.rs b/datafusion/expr/src/type_coercion/binary.rs index 31074a372979..5c73ec50f501 100644 --- a/datafusion/expr/src/type_coercion/binary.rs +++ b/datafusion/expr/src/type_coercion/binary.rs @@ -962,7 +962,7 @@ fn numeric_string_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option None + _ => None, } } From 030a5196e22fe0e02efc94fbfb47486a6e83724c Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Tue, 14 May 2024 19:46:23 +0800 Subject: [PATCH 34/37] rm pure_string_coercion Signed-off-by: jayzhan211 --- datafusion/expr/src/type_coercion/binary.rs | 21 ++----------------- .../sqllogictest/test_files/coalesce.slt | 6 ++++++ 2 files changed, 8 insertions(+), 19 deletions(-) diff --git a/datafusion/expr/src/type_coercion/binary.rs b/datafusion/expr/src/type_coercion/binary.rs index 5c73ec50f501..744272b3f19e 100644 --- a/datafusion/expr/src/type_coercion/binary.rs +++ b/datafusion/expr/src/type_coercion/binary.rs @@ -476,7 +476,7 @@ fn type_union_resolution_coercion( // numeric coercion is the same as comparison coercion, both find the narrowest type // that can accommodate both types binary_numeric_coercion(lhs_type, rhs_type) - .or_else(|| pure_string_coercion(lhs_type, rhs_type)) + .or_else(|| string_coercion(lhs_type, rhs_type)) .or_else(|| numeric_string_coercion(lhs_type, rhs_type)) } } @@ -493,7 +493,7 @@ pub fn comparison_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option Option { use arrow::datatypes::DataType::*; - - if let Some(t) = pure_string_coercion(lhs_type, rhs_type) { - return Some(t); - } - - // TODO: Move out to a separate function or remove it. Review the need for this. - match (lhs_type, rhs_type) { - // TODO: cast between array elements (#6558) - (List(_), List(_)) => Some(lhs_type.clone()), - (List(_), _) => Some(lhs_type.clone()), - (_, List(_)) => Some(rhs_type.clone()), - _ => None, - } -} - -fn pure_string_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { - use arrow::datatypes::DataType::*; match (lhs_type, rhs_type) { (Utf8, Utf8) => Some(Utf8), (LargeUtf8, Utf8) => Some(LargeUtf8), diff --git a/datafusion/sqllogictest/test_files/coalesce.slt b/datafusion/sqllogictest/test_files/coalesce.slt index 92eb293d4a9b..17b0e774d9cb 100644 --- a/datafusion/sqllogictest/test_files/coalesce.slt +++ b/datafusion/sqllogictest/test_files/coalesce.slt @@ -310,6 +310,12 @@ select coalesce(null, 34, arrow_cast(123, 'Dictionary(Int32, Int8)')); ---- 34 +# numeric string coercion +query RT +select coalesce(2.0, 1, '3'), arrow_typeof(coalesce(2.0, 1, '3')); +---- +2 Float64 + # explicitly cast to Int8, and it will implicitly cast to Int64 query IT select From 5b797d58f0d9f6f94130d5dc04c9187a50617cd0 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Tue, 14 May 2024 19:47:31 +0800 Subject: [PATCH 35/37] rm duplicate Signed-off-by: jayzhan211 --- datafusion/expr/src/type_coercion/binary.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/datafusion/expr/src/type_coercion/binary.rs b/datafusion/expr/src/type_coercion/binary.rs index 744272b3f19e..615bb3ac568c 100644 --- a/datafusion/expr/src/type_coercion/binary.rs +++ b/datafusion/expr/src/type_coercion/binary.rs @@ -494,7 +494,6 @@ pub fn comparison_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option Date: Sat, 25 May 2024 21:00:28 +0800 Subject: [PATCH 36/37] change type in select.slt Signed-off-by: jayzhan211 --- datafusion/sqllogictest/test_files/select.slt | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/datafusion/sqllogictest/test_files/select.slt b/datafusion/sqllogictest/test_files/select.slt index 25f8984d316b..aae35c1ce7bf 100644 --- a/datafusion/sqllogictest/test_files/select.slt +++ b/datafusion/sqllogictest/test_files/select.slt @@ -1473,7 +1473,7 @@ DROP TABLE t; # related to https://github.com/apache/datafusion/issues/8814 statement ok -create table t(x bigint, y bigint) as values (1,1), (2,2), (3,3), (0,0), (4,0); +create table t(x int, y int) as values (1,1), (2,2), (3,3), (0,0), (4,0); query II SELECT @@ -1494,30 +1494,30 @@ query TT explain select coalesce(1, y/x), coalesce(2, y/x) from t; ---- logical_plan -01)Projection: coalesce(Int64(1), t.y / t.x), coalesce(Int64(2), t.y / t.x) +01)Projection: coalesce(Int64(1), CAST(t.y / t.x AS Int64)), coalesce(Int64(2), CAST(t.y / t.x AS Int64)) 02)--TableScan: t projection=[x, y] physical_plan -01)ProjectionExec: expr=[coalesce(1, y@1 / x@0) as coalesce(Int64(1),t.y / t.x), coalesce(2, y@1 / x@0) as coalesce(Int64(2),t.y / t.x)] +01)ProjectionExec: expr=[coalesce(1, CAST(y@1 / x@0 AS Int64)) as coalesce(Int64(1),t.y / t.x), coalesce(2, CAST(y@1 / x@0 AS Int64)) as coalesce(Int64(2),t.y / t.x)] 02)--MemoryExec: partitions=1, partition_sizes=[1] query TT EXPLAIN SELECT y > 0 and 1 / y < 1, x > 0 and y > 0 and 1 / y < 1 / x from t; ---- logical_plan -01)Projection: t.y > Int64(0) AND Int64(1) / t.y < Int64(1), t.x > Int64(0) AND t.y > Int64(0) AND Int64(1) / t.y < Int64(1) / t.x +01)Projection: t.y > Int32(0) AND Int64(1) / CAST(t.y AS Int64) < Int64(1) AS t.y > Int64(0) AND Int64(1) / t.y < Int64(1), t.x > Int32(0) AND t.y > Int32(0) AND Int64(1) / CAST(t.y AS Int64) < Int64(1) / CAST(t.x AS Int64) AS t.x > Int64(0) AND t.y > Int64(0) AND Int64(1) / t.y < Int64(1) / t.x 02)--TableScan: t projection=[x, y] physical_plan -01)ProjectionExec: expr=[y@1 > 0 AND 1 / y@1 < 1 as t.y > Int64(0) AND Int64(1) / t.y < Int64(1), x@0 > 0 AND y@1 > 0 AND 1 / y@1 < 1 / x@0 as t.x > Int64(0) AND t.y > Int64(0) AND Int64(1) / t.y < Int64(1) / t.x] +01)ProjectionExec: expr=[y@1 > 0 AND 1 / CAST(y@1 AS Int64) < 1 as t.y > Int64(0) AND Int64(1) / t.y < Int64(1), x@0 > 0 AND y@1 > 0 AND 1 / CAST(y@1 AS Int64) < 1 / CAST(x@0 AS Int64) as t.x > Int64(0) AND t.y > Int64(0) AND Int64(1) / t.y < Int64(1) / t.x] 02)--MemoryExec: partitions=1, partition_sizes=[1] query TT EXPLAIN SELECT y = 0 or 1 / y < 1, x = 0 or y = 0 or 1 / y < 1 / x from t; ---- logical_plan -01)Projection: t.y = Int64(0) OR Int64(1) / t.y < Int64(1), t.x = Int64(0) OR t.y = Int64(0) OR Int64(1) / t.y < Int64(1) / t.x +01)Projection: t.y = Int32(0) OR Int64(1) / CAST(t.y AS Int64) < Int64(1) AS t.y = Int64(0) OR Int64(1) / t.y < Int64(1), t.x = Int32(0) OR t.y = Int32(0) OR Int64(1) / CAST(t.y AS Int64) < Int64(1) / CAST(t.x AS Int64) AS t.x = Int64(0) OR t.y = Int64(0) OR Int64(1) / t.y < Int64(1) / t.x 02)--TableScan: t projection=[x, y] physical_plan -01)ProjectionExec: expr=[y@1 = 0 OR 1 / y@1 < 1 as t.y = Int64(0) OR Int64(1) / t.y < Int64(1), x@0 = 0 OR y@1 = 0 OR 1 / y@1 < 1 / x@0 as t.x = Int64(0) OR t.y = Int64(0) OR Int64(1) / t.y < Int64(1) / t.x] +01)ProjectionExec: expr=[y@1 = 0 OR 1 / CAST(y@1 AS Int64) < 1 as t.y = Int64(0) OR Int64(1) / t.y < Int64(1), x@0 = 0 OR y@1 = 0 OR 1 / CAST(y@1 AS Int64) < 1 / CAST(x@0 AS Int64) as t.x = Int64(0) OR t.y = Int64(0) OR Int64(1) / t.y < Int64(1) / t.x] 02)--MemoryExec: partitions=1, partition_sizes=[1] # due to the reason describe in https://github.com/apache/datafusion/issues/8927, From 829b5a2250428fc1ff63d4f40267966f6aaeedbe Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Sat, 25 May 2024 21:17:31 +0800 Subject: [PATCH 37/37] fix slt Signed-off-by: jayzhan211 --- datafusion/sqllogictest/test_files/joins.slt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/sqllogictest/test_files/joins.slt b/datafusion/sqllogictest/test_files/joins.slt index 869b6b544b53..0c45e3ffbf69 100644 --- a/datafusion/sqllogictest/test_files/joins.slt +++ b/datafusion/sqllogictest/test_files/joins.slt @@ -1778,7 +1778,7 @@ AS VALUES ('BB', 6, 1); query TII -select col1, col2, coalesce(sum_col3, arrow_cast(0, 'UInt64')) as sum_col3 +select col1, col2, coalesce(sum_col3, 0) as sum_col3 from (select distinct col2 from tbl) AS q1 cross join (select distinct col1 from tbl) AS q2 left outer join (SELECT col1, col2, sum(col3) as sum_col3 FROM tbl GROUP BY col1, col2) AS q3