Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Officially maintained Arrow2 branch #1556

Merged
merged 46 commits into from
Jan 20, 2022
Merged
Changes from 1 commit
Commits
Show all changes
46 commits
Select commit Hold shift + click to select a range
099398e
Wip.
jorgecarleitao Jun 8, 2021
a5b2557
resolve merge conflicts and bump to latest arrow2
houqp Sep 4, 2021
a0c9669
use lexicographical_partition_ranges from arrow2
houqp Sep 4, 2021
3218759
Merge remote-tracking branch 'upstream/master' into arrow22
houqp Sep 4, 2021
a035200
Fix build errors
houqp Sep 6, 2021
843fbe6
Fix DataFusion test and try to make ballista compile (#4)
yjshen Sep 18, 2021
fccbddb
pin arrow-flight to 0.1 in arrow2 repo
houqp Sep 18, 2021
77c69cf
turn on io_parquet_compression feature for arrow2
houqp Sep 18, 2021
2d2e379
estimate array memory usage with estimated_bytes_size
houqp Sep 18, 2021
cb187a6
Merge remote-tracking branch 'upstream/master' into arrow2-merge
houqp Sep 18, 2021
25363d2
fix compile and tests
houqp Sep 19, 2021
7a5294b
Make ballista compile (#6)
yjshen Sep 24, 2021
4030615
Make `cargo test` compile (#7)
yjshen Sep 25, 2021
fde82cf
fix str to timestamp scalarvalue casting
houqp Sep 25, 2021
b585f3b
fixing datafusion tests (#8)
yjshen Sep 25, 2021
99907fd
fix crypto expression tests
houqp Sep 26, 2021
b2f709d
fix floating point precision
houqp Sep 26, 2021
ed5281c
fix list scalar to_arry method for timestamps
houqp Sep 26, 2021
f9504e7
Fix tests (#9)
yjshen Sep 26, 2021
33b6931
Ignore last test, fix `cargo clippy`, format and pass integration tes…
yjshen Sep 28, 2021
ca53b64
bump to latest arrow2, remove ord for interval type
houqp Sep 29, 2021
8702e12
add back case insenstive regex support
houqp Sep 30, 2021
41153dc
support type cast failure message
houqp Oct 2, 2021
ba57aa8
bump to arrow2 and parquet2 0.7, replace arrow-flight with arrow-format
houqp Nov 23, 2021
387fdf6
chore: arrow2 to 0.8, parquet to 0.8, prost to 0.9, tonic to 0.6
yjshen Nov 30, 2021
0d504e6
Merge remote-tracking branch 'upstream/master' into arrow22
houqp Dec 19, 2021
ea6d7fa
Fix build and tests
houqp Dec 20, 2021
44db376
Merge remote-tracking branch 'origin/master' into arrow2_merge
Igosuki Jan 11, 2022
ca9b485
merge latest datafusion
Igosuki Jan 11, 2022
b9125bc
start migrating avro to arrow2
Igosuki Jan 11, 2022
99fdac3
lints
Igosuki Jan 11, 2022
1b916aa
merge latest datafusion
Igosuki Jan 12, 2022
d611d4d
Fix hash utils
Igosuki Jan 12, 2022
171332f
missing import in hash_utils test with no_collision
Igosuki Jan 12, 2022
4344454
address clippies in root workspace
Igosuki Jan 12, 2022
257a7c5
fix tests #1
Igosuki Jan 12, 2022
b5cb938
fix decimal tests
houqp Jan 13, 2022
e53d165
Arrow2 test fixes (#18)
Igosuki Jan 14, 2022
2293921
Fix tests and parquet read performance (#19)
Igosuki Jan 16, 2022
505084c
address review feedback and add back parquet reexport
houqp Jan 16, 2022
a27de10
fix sql tests
houqp Jan 17, 2022
7e8b8d9
fix parquet row group filter test
houqp Jan 17, 2022
8a6fb2c
remove empty python/src/dataframe.rs file
houqp Jan 17, 2022
60e869e
implement bit_length function
houqp Jan 18, 2022
1e352c3
fix binary array print formatting
houqp Jan 17, 2022
2698383
fix cli json print and avro example
houqp Jan 20, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
support type cast failure message
houqp committed Oct 2, 2021
commit 41153dce51585a09a238d6beb95fc69db5fa3ea4
4 changes: 2 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -31,5 +31,5 @@ members = [
exclude = ["python"]

[patch.crates-io]
arrow2 = { git = "https://github.com/jorgecarleitao/arrow2.git", rev = "2db7f57345192c6b9ae83bd5d1f99b2c57032648" }
arrow-flight = { git = "https://github.com/jorgecarleitao/arrow2.git", rev = "2db7f57345192c6b9ae83bd5d1f99b2c57032648" }
arrow2 = { git = "https://github.com/jorgecarleitao/arrow2.git", rev = "b7e991366104d1647b955a828e0551256ef2e7c9" }
arrow-flight = { git = "https://github.com/jorgecarleitao/arrow2.git", rev = "b7e991366104d1647b955a828e0551256ef2e7c9" }
76 changes: 40 additions & 36 deletions datafusion/src/physical_plan/array_expressions.rs
Original file line number Diff line number Diff line change
@@ -38,18 +38,19 @@ fn array_array(arrays: &[&dyn Array]) -> Result<ArrayRef> {
$DATA_TYPE,
);
let mut array = MutableFixedSizeListArray::new(array, size);
// for each entry in the array
for index in 0..first.len() {
let values = array.mut_values();
for arg in arrays {
let arg = arg.as_any().downcast_ref::<$ARRAY>().unwrap();
if arg.is_null(index) {
values.push(None);
} else {
values.push(Some(arg.value(index)));
}
}
}
array.try_extend(
// for each entry in the array
(0..first.len()).map(|idx| {
Some(arrays.iter().map(move |arg| {
let arg = arg.as_any().downcast_ref::<$ARRAY>().unwrap();
if arg.is_null(idx) {
None
} else {
Some(arg.value(idx))
}
}))
}),
)?;
Ok(array.as_arc())
}};
}
@@ -58,18 +59,20 @@ fn array_array(arrays: &[&dyn Array]) -> Result<ArrayRef> {
($OFFSET: ty) => {{
let array = MutableUtf8Array::<$OFFSET>::with_capacity(first.len() * size);
let mut array = MutableFixedSizeListArray::new(array, size);
// for each entry in the array
for index in 0..first.len() {
let values = array.mut_values();
for arg in arrays {
let arg = arg.as_any().downcast_ref::<Utf8Array<$OFFSET>>().unwrap();
if arg.is_null(index) {
values.push::<&str>(None);
} else {
values.push(Some(arg.value(index)));
}
}
}
array.try_extend(
// for each entry in the array
(0..first.len()).map(|idx| {
Some(arrays.iter().map(move |arg| {
let arg =
arg.as_any().downcast_ref::<Utf8Array<$OFFSET>>().unwrap();
if arg.is_null(idx) {
None
} else {
Some(arg.value(idx))
}
}))
}),
)?;
Ok(array.as_arc())
}};
}
@@ -78,18 +81,19 @@ fn array_array(arrays: &[&dyn Array]) -> Result<ArrayRef> {
DataType::Boolean => {
let array = MutableBooleanArray::with_capacity(first.len() * size);
let mut array = MutableFixedSizeListArray::new(array, size);
// for each entry in the array
for index in 0..first.len() {
let values = array.mut_values();
for arg in arrays {
let arg = arg.as_any().downcast_ref::<BooleanArray>().unwrap();
if arg.is_null(index) {
values.push(None);
} else {
values.push(Some(arg.value(index)));
}
}
}
array.try_extend(
// for each entry in the array
(0..first.len()).map(|idx| {
Some(arrays.iter().map(move |arg| {
let arg = arg.as_any().downcast_ref::<BooleanArray>().unwrap();
if arg.is_null(idx) {
None
} else {
Some(arg.value(idx))
}
}))
}),
)?;
Ok(array.as_arc())
}
DataType::UInt8 => array!(u8, PrimitiveArray<u8>, DataType::UInt8),
39 changes: 37 additions & 2 deletions datafusion/src/physical_plan/expressions/cast.rs
Original file line number Diff line number Diff line change
@@ -23,7 +23,9 @@ use super::ColumnarValue;
use crate::error::{DataFusionError, Result};
use crate::physical_plan::PhysicalExpr;
use crate::scalar::ScalarValue;
use arrow::array::{Array, Int32Array};
use arrow::compute::cast;
use arrow::compute::take;
use arrow::datatypes::{DataType, Schema};
use arrow::record_batch::RecordBatch;

@@ -79,15 +81,38 @@ impl PhysicalExpr for CastExpr {
}
}

fn cast_with_error(array: &dyn Array, cast_type: &DataType) -> Result<Box<dyn Array>> {
let result = cast::cast(array, cast_type)?;
if result.null_count() != array.null_count() {
let casted_valids = result.validity().unwrap();
let failed_casts = match array.validity() {
Some(valids) => valids ^ casted_valids,
None => !casted_valids,
};
let invalid_indices = failed_casts
.iter()
.enumerate()
.filter(|(_, failed)| *failed)
.map(|(idx, _)| Some(idx as i32))
.collect::<Vec<Option<i32>>>();
let invalid_values = take::take(array, &Int32Array::from(&invalid_indices))?;
return Err(DataFusionError::Execution(format!(
"Could not cast {} to value of type {}",
invalid_values, cast_type
)));
}
Ok(result)
}

/// Internal cast function for casting ColumnarValue -> ColumnarValue for cast_type
pub fn cast_column(value: &ColumnarValue, cast_type: &DataType) -> Result<ColumnarValue> {
match value {
ColumnarValue::Array(array) => Ok(ColumnarValue::Array(
cast::cast(array.as_ref(), cast_type)?.into(),
cast_with_error(array.as_ref(), cast_type)?.into(),
)),
ColumnarValue::Scalar(scalar) => {
let scalar_array = scalar.to_array();
let cast_array = cast::cast(scalar_array.as_ref(), cast_type)?.into();
let cast_array = cast_with_error(scalar_array.as_ref(), cast_type)?.into();
let cast_scalar = ScalarValue::try_from_array(&cast_array, 0)?;
Ok(ColumnarValue::Scalar(cast_scalar))
}
@@ -243,4 +268,14 @@ mod tests {
let result = cast(col("a", &schema).unwrap(), &schema, DataType::LargeBinary);
result.expect_err("expected Invalid CAST");
}

#[test]
fn invalid_str_cast() {
let arr = Utf8Array::<i32>::from_slice(&["a", "b", "123", "!", "456"]);
let err = cast_with_error(&arr, &DataType::Int64).unwrap_err();
assert_eq!(
err.to_string(),
"Execution error: Could not cast Utf8[a, b, !] to value of type Int64"
);
}
}
8 changes: 1 addition & 7 deletions datafusion/src/physical_plan/regex_expressions.rs
Original file line number Diff line number Diff line change
@@ -241,13 +241,7 @@ pub fn regexp_matches<O: Offset>(
});
let mut array = MutableListArray::<O, MutableUtf8Array<O>>::new();
for items in iter {
if let Some(items) = items? {
let values = array.mut_values();
values.try_extend(items)?;
array.try_push_valid()?;
} else {
array.push_null();
}
array.try_push(items?)?;
}

Ok(array.into())
17 changes: 14 additions & 3 deletions datafusion/tests/sql.rs
Original file line number Diff line number Diff line change
@@ -4326,9 +4326,20 @@ async fn test_cast_expressions_error() -> Result<()> {
let mut ctx = create_ctx()?;
register_aggregate_csv(&mut ctx)?;
let sql = "SELECT CAST(c1 AS INT) FROM aggregate_test_100";
let actual = execute(&mut ctx, sql).await;
let expected = vec![vec![""]; 100];
assert_eq!(expected, actual);
let plan = ctx.create_logical_plan(sql).unwrap();
let plan = ctx.optimize(&plan).unwrap();
let plan = ctx.create_physical_plan(&plan).unwrap();
let result = collect(plan).await;

match result {
Ok(_) => panic!("expected cast error"),
Err(e) => {
assert_contains!(
e.to_string(),
"Execution error: Could not cast Utf8[c, d, b, a, b, b, e, a, d, a, d, a, e, d, b, c, e, d, d, e, e, d, a, e, c, a, c, a, a, b, e, c, e, b, a, c, d, c, c, c, b, d, d, a, e, b, b, c, a, d, b, c, d, d, b, d, e, b, a, b, c, b, c, e, e, d, e, c, d, e, e, a, a, e, a, b, e, c, e, c, a, c, b, a, a, c, a, c, c, c, b, a, a, b, d, e, e, d, b, e] to value of type Int32"
);
}
}

Ok(())
}