-
Notifications
You must be signed in to change notification settings - Fork 1.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Introduce TypeSignature::Comparable and update NullIf
signature
#13356
Changes from 4 commits
c6ef8a5
ffd02c4
784018f
ca87f2c
95f09c4
60496e5
8d0b73f
e4316a2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -180,6 +180,7 @@ fn is_well_supported_signature(type_signature: &TypeSignature) -> bool { | |||||
| TypeSignature::Numeric(_) | ||||||
| TypeSignature::String(_) | ||||||
| TypeSignature::Coercible(_) | ||||||
| TypeSignature::Boolean(_) | ||||||
| TypeSignature::Any(_) | ||||||
| TypeSignature::NullAry | ||||||
) | ||||||
|
@@ -194,13 +195,18 @@ fn try_coerce_types( | |||||
|
||||||
// Well-supported signature that returns exact valid types. | ||||||
if !valid_types.is_empty() && is_well_supported_signature(type_signature) { | ||||||
// exact valid types | ||||||
assert_eq!(valid_types.len(), 1); | ||||||
// There may be many valid types if valid signature is OneOf | ||||||
// Otherwise, there should be only one valid type | ||||||
if !type_signature.is_one_of() { | ||||||
assert_eq!(valid_types.len(), 1); | ||||||
} | ||||||
|
||||||
let valid_types = valid_types.swap_remove(0); | ||||||
if let Some(t) = maybe_data_types_without_coercion(&valid_types, current_types) { | ||||||
return Ok(t); | ||||||
} | ||||||
} else { | ||||||
// TODO: Deprecate this branch after all signatures are well-supported (aka coercion are happend already) | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
// Try and coerce the argument types to match the signature, returning the | ||||||
// coerced types from the first matching signature. | ||||||
for valid_types in valid_types { | ||||||
|
@@ -477,6 +483,33 @@ fn get_valid_types( | |||||
|
||||||
vec![vec![base_type_or_default_type(&coerced_type); *number]] | ||||||
} | ||||||
TypeSignature::Boolean(number) => { | ||||||
function_length_check(current_types.len(), *number)?; | ||||||
|
||||||
// Find common boolean type amongs given types | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
let mut valid_type = current_types.first().unwrap().to_owned(); | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Unless I'm mistaken there is only one possible valid type here - Boolean doesn't have multiple types, does it? If so, I don't see the need for this variable nor the code below the for loop. valid_type must be DataType::Boolean, no? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is to ensure the given types are all boolean, if it isn't we should return error here. If we got null, we return boolean |
||||||
for t in current_types.iter().skip(1) { | ||||||
let logical_data_type: NativeType = t.into(); | ||||||
if logical_data_type == NativeType::Null { | ||||||
continue; | ||||||
} | ||||||
|
||||||
if logical_data_type != NativeType::Boolean { | ||||||
return plan_err!( | ||||||
"The signature expected NativeType::Boolean but received {logical_data_type}" | ||||||
); | ||||||
} | ||||||
|
||||||
valid_type = t.to_owned(); | ||||||
} | ||||||
|
||||||
let logical_data_type: NativeType = valid_type.clone().into(); | ||||||
if logical_data_type == NativeType::Null { | ||||||
valid_type = DataType::Boolean; | ||||||
} | ||||||
|
||||||
vec![vec![valid_type; *number]] | ||||||
} | ||||||
TypeSignature::Numeric(number) => { | ||||||
function_length_check(current_types.len(), *number)?; | ||||||
|
||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,7 +17,7 @@ | |
|
||
use arrow::datatypes::DataType; | ||
use datafusion_common::{exec_err, Result}; | ||
use datafusion_expr::{ColumnarValue, Documentation}; | ||
use datafusion_expr::{ColumnarValue, Documentation, TypeSignature}; | ||
|
||
use arrow::compute::kernels::cmp::eq; | ||
use arrow::compute::kernels::nullif::nullif; | ||
|
@@ -32,26 +32,6 @@ pub struct NullIfFunc { | |
signature: Signature, | ||
} | ||
|
||
/// Currently supported types by the nullif function. | ||
/// The order of these types correspond to the order on which coercion applies | ||
/// This should thus be from least informative to most informative | ||
static SUPPORTED_NULLIF_TYPES: &[DataType] = &[ | ||
DataType::Boolean, | ||
DataType::UInt8, | ||
DataType::UInt16, | ||
DataType::UInt32, | ||
DataType::UInt64, | ||
DataType::Int8, | ||
DataType::Int16, | ||
DataType::Int32, | ||
DataType::Int64, | ||
DataType::Float32, | ||
DataType::Float64, | ||
DataType::Utf8View, | ||
DataType::Utf8, | ||
DataType::LargeUtf8, | ||
]; | ||
|
||
impl Default for NullIfFunc { | ||
fn default() -> Self { | ||
Self::new() | ||
|
@@ -61,9 +41,13 @@ impl Default for NullIfFunc { | |
impl NullIfFunc { | ||
pub fn new() -> Self { | ||
Self { | ||
signature: Signature::uniform( | ||
2, | ||
SUPPORTED_NULLIF_TYPES.to_vec(), | ||
signature: Signature::one_of( | ||
// Hack: String is at the beginning so the return type is String if both args are Nulls | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not sure I'd call that a hack tbh |
||
vec![ | ||
TypeSignature::String(2), | ||
TypeSignature::Numeric(2), | ||
TypeSignature::Boolean(2), | ||
], | ||
Volatility::Immutable, | ||
), | ||
} | ||
|
@@ -83,14 +67,7 @@ impl ScalarUDFImpl for NullIfFunc { | |
} | ||
|
||
fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> { | ||
// NULLIF has two args and they might get coerced, get a preview of this | ||
let coerced_types = datafusion_expr::type_coercion::functions::data_types( | ||
arg_types, | ||
&self.signature, | ||
); | ||
coerced_types | ||
.map(|typs| typs[0].clone()) | ||
.map_err(|e| e.context("Failed to coerce arguments for NULLIF")) | ||
Ok(arg_types[0].to_owned()) | ||
} | ||
|
||
fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -97,11 +97,35 @@ SELECT NULLIF(1, 3); | |
---- | ||
1 | ||
|
||
query I | ||
query T | ||
SELECT NULLIF(NULL, NULL); | ||
---- | ||
NULL | ||
|
||
query R | ||
select nullif(1, 1.2); | ||
---- | ||
1 | ||
|
||
query R | ||
select nullif(1.0, 2); | ||
---- | ||
1 | ||
|
||
query error DataFusion error: Error during planning: Internal error: Failed to match any signature, errors: Error during planning: The signature expected NativeType::String but received NativeType::Int64 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This used to work, I just ran this locally against v43. I can't see a reason why this should no longer be supported. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
This query failed in Postgres, I think we should return error for this query |
||
select nullif(2, 'a'); | ||
|
||
|
||
query T | ||
select nullif('2', '3'); | ||
---- | ||
2 | ||
|
||
# TODO: support numeric string | ||
# This query success in Postgres and DuckDB | ||
query error DataFusion error: Error during planning: Internal error: Failed to match any signature, errors: Error during planning: The signature expected NativeType::String but received NativeType::Int64 | ||
select nullif(2, '1'); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This also used to work.
Interesting that the type is text vs number but still, it did work. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is quite a hard issue to fix, since we can't tell the difference between Unknown type and String type currently This query pass in
The change did have regression on There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Similar issue in #13240 (comment) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't have strong opinions against this PR because it fixes There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Tracked in #13285 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Duckdb can run this query
😕 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. To me what is important is not strictly what other systems support - those may/could be a guide to what datafusion will support - but whether the provided arguments to a signature can be losslessly converted to what the signature accepts and whether it logically makes sense to do so. I personally would rather be lenient for what is accepted and do casting/coercion as required than to be strict and push the onus onto the user to do that. That's just me though, I don't know if that is the general consensus of the community. Perhaps we should file a discussion ticket with the options and decide? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I just double checked on this branch and it seems good to me (supports things reasonably) Running `target/debug/datafusion-cli`
DataFusion CLI v43.0.0
> select nullif(2, '1');
+----------------------------+
| nullif(Int64(2),Utf8("1")) |
+----------------------------+
| 2 |
+----------------------------+
1 row(s) fetched.
Elapsed 0.027 seconds.
> select nullif('1'::varchar, 2);
+----------------------------+
| nullif(Utf8("1"),Int64(2)) |
+----------------------------+
| 1 |
+----------------------------+
1 row(s) fetched.
Elapsed 0.007 seconds. |
||
|
||
query T | ||
SELECT NULLIF(arrow_cast('a', 'Utf8View'), 'a'); | ||
---- | ||
|
@@ -130,4 +154,4 @@ NULL | |
query T | ||
SELECT NULLIF(arrow_cast('a', 'Utf8View'), null); | ||
---- | ||
a | ||
a |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
👍