From 3858c95441f7b8f88e13053ba74df1bb2b1a8394 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Tue, 28 Jun 2022 19:13:45 +0100 Subject: [PATCH] Use specialized dictionary kernels (#1178) --- datafusion/expr/src/binary_rule.rs | 35 ++++++++++--------- .../physical-expr/src/expressions/binary.rs | 27 +++++++++++--- 2 files changed, 41 insertions(+), 21 deletions(-) diff --git a/datafusion/expr/src/binary_rule.rs b/datafusion/expr/src/binary_rule.rs index b7b2c57e83e5..2f3266cb02eb 100644 --- a/datafusion/expr/src/binary_rule.rs +++ b/datafusion/expr/src/binary_rule.rs @@ -155,14 +155,12 @@ pub fn comparison_eq_coercion( lhs_type: &DataType, rhs_type: &DataType, ) -> Option { - // can't compare dictionaries directly due to - // https://github.com/apache/arrow-rs/issues/1201 - if lhs_type == rhs_type && !is_dictionary(lhs_type) { + if lhs_type == rhs_type { // same type => equality is possible return Some(lhs_type.clone()); } comparison_binary_numeric_coercion(lhs_type, rhs_type) - .or_else(|| dictionary_coercion(lhs_type, rhs_type)) + .or_else(|| dictionary_coercion(lhs_type, rhs_type, true)) .or_else(|| temporal_coercion(lhs_type, rhs_type)) .or_else(|| string_coercion(lhs_type, rhs_type)) .or_else(|| null_coercion(lhs_type, rhs_type)) @@ -173,15 +171,13 @@ fn comparison_order_coercion( lhs_type: &DataType, rhs_type: &DataType, ) -> Option { - // can't compare dictionaries directly due to - // https://github.com/apache/arrow-rs/issues/1201 - if lhs_type == rhs_type && !is_dictionary(lhs_type) { + if lhs_type == rhs_type { // same type => all good return Some(lhs_type.clone()); } comparison_binary_numeric_coercion(lhs_type, rhs_type) .or_else(|| string_coercion(lhs_type, rhs_type)) - .or_else(|| dictionary_coercion(lhs_type, rhs_type)) + .or_else(|| dictionary_coercion(lhs_type, rhs_type, true)) .or_else(|| temporal_coercion(lhs_type, rhs_type)) .or_else(|| null_coercion(lhs_type, rhs_type)) } @@ -448,17 +444,24 @@ fn dictionary_value_coercion( /// Coercion rules for Dictionaries: the type that both lhs and rhs /// can be casted to for the purpose of a computation. /// -/// It would likely be preferable to cast primitive values to -/// dictionaries, and thus avoid unpacking dictionary as well as doing -/// faster comparisons. However, the arrow compute kernels (e.g. eq) -/// don't have DictionaryArray support yet, so fall back to unpacking -/// the dictionaries -fn dictionary_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { +/// Not all operators support dictionaries, if `preserve_dictionaries` is true +/// dictionaries will be preserved if possible +fn dictionary_coercion( + lhs_type: &DataType, + rhs_type: &DataType, + preserve_dictionaries: bool, +) -> Option { match (lhs_type, rhs_type) { ( DataType::Dictionary(_lhs_index_type, lhs_value_type), DataType::Dictionary(_rhs_index_type, rhs_value_type), ) => dictionary_value_coercion(lhs_value_type, rhs_value_type), + (d @ DataType::Dictionary(_, value_type), other_type) + | (other_type, d @ DataType::Dictionary(_, value_type)) + if preserve_dictionaries && value_type.as_ref() == other_type => + { + Some(d.clone()) + } (DataType::Dictionary(_index_type, value_type), _) => { dictionary_value_coercion(value_type, rhs_type) } @@ -514,7 +517,7 @@ fn string_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option /// This is a union of string coercion rules and dictionary coercion rules fn like_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { string_coercion(lhs_type, rhs_type) - .or_else(|| dictionary_coercion(lhs_type, rhs_type)) + .or_else(|| dictionary_coercion(lhs_type, rhs_type, false)) .or_else(|| null_coercion(lhs_type, rhs_type)) } @@ -616,7 +619,7 @@ fn eq_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { return Some(lhs_type.clone()); } numerical_coercion(lhs_type, rhs_type) - .or_else(|| dictionary_coercion(lhs_type, rhs_type)) + .or_else(|| dictionary_coercion(lhs_type, rhs_type, true)) .or_else(|| temporal_coercion(lhs_type, rhs_type)) .or_else(|| null_coercion(lhs_type, rhs_type)) } diff --git a/datafusion/physical-expr/src/expressions/binary.rs b/datafusion/physical-expr/src/expressions/binary.rs index 87de0acbb46e..fbc3f233c49e 100644 --- a/datafusion/physical-expr/src/expressions/binary.rs +++ b/datafusion/physical-expr/src/expressions/binary.rs @@ -1009,11 +1009,28 @@ impl PhysicalExpr for BinaryExpr { let left_data_type = left_value.data_type(); let right_data_type = right_value.data_type(); - if left_data_type != right_data_type { - return Err(DataFusionError::Internal(format!( - "Cannot evaluate binary expression {:?} with types {:?} and {:?}", - self.op, left_data_type, right_data_type - ))); + match (&left_value, &left_data_type, &right_value, &right_data_type) { + // Types are equal => valid + (_, l, _, r) if l == r => {} + // Allow comparing a dictionary value with its corresponding scalar value + ( + ColumnarValue::Array(_), + DataType::Dictionary(_, dict_t), + ColumnarValue::Scalar(_), + scalar_t, + ) + | ( + ColumnarValue::Scalar(_), + scalar_t, + ColumnarValue::Array(_), + DataType::Dictionary(_, dict_t), + ) if dict_t.as_ref() == scalar_t => {} + _ => { + return Err(DataFusionError::Internal(format!( + "Cannot evaluate binary expression {:?} with types {:?} and {:?}", + self.op, left_data_type, right_data_type + ))); + } } // Attempt to use special kernels if one input is scalar and the other is an array