From 5477e8058b1960c701c00902f2c88e84728a6db3 Mon Sep 17 00:00:00 2001 From: Xin Li Date: Fri, 2 Aug 2024 10:13:27 +0800 Subject: [PATCH 1/4] support cast between binaryview and string --- arrow-cast/src/cast/mod.rs | 89 +++++++++++++++++++++++++++++++++----- 1 file changed, 78 insertions(+), 11 deletions(-) diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs index f6103cb84136..60934171572d 100644 --- a/arrow-cast/src/cast/mod.rs +++ b/arrow-cast/src/cast/mod.rs @@ -227,8 +227,8 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { | Timestamp(Nanosecond, _) | Interval(_), ) => true, - (Utf8 | LargeUtf8, Utf8View) => true, - (BinaryView, Binary | LargeBinary) => true, + (Utf8 | LargeUtf8, Utf8View | BinaryView) => true, + (BinaryView, Binary | LargeBinary | Utf8 | LargeUtf8 ) => true, (Utf8 | LargeUtf8, _) => to_type.is_numeric() && to_type != &Float16, (_, Utf8 | LargeUtf8) => from_type.is_primitive(), @@ -1229,6 +1229,13 @@ pub fn cast_with_options( cast_byte_container::(&binary) } Utf8View => Ok(Arc::new(StringViewArray::from(array.as_string::()))), + BinaryView => Ok(Arc::new(BinaryViewArray::from( + array + .as_string::() + .into_iter() + .map(|x| x.map(|x| x.as_bytes())) + .collect::>(), + ))), LargeUtf8 => cast_byte_container::(array), Time32(TimeUnit::Second) => parse_string::(array, cast_options), Time32(TimeUnit::Millisecond) => { @@ -1339,6 +1346,13 @@ pub fn cast_with_options( array.as_string::().clone(), ))), Utf8View => Ok(Arc::new(StringViewArray::from(array.as_string::()))), + BinaryView => Ok(Arc::new(BinaryViewArray::from( + array + .as_string::() + .into_iter() + .map(|x| x.map(|x| x.as_bytes())) + .collect::>(), + ))), Time32(TimeUnit::Second) => parse_string::(array, cast_options), Time32(TimeUnit::Millisecond) => { parse_string::(array, cast_options) @@ -1413,10 +1427,15 @@ pub fn cast_with_options( "Casting from {from_type:?} to {to_type:?} not supported", ))), }, - (BinaryView, Binary) => 
cast_view_to_byte::>(array), - (BinaryView, LargeBinary) => { - cast_view_to_byte::>(array) - } + (BinaryView, _) => match to_type { + Binary => cast_view_to_byte::>(array), + LargeBinary => cast_view_to_byte::>(array), + Utf8 => cast_slice_view_to_byte::>(array), + LargeUtf8 => cast_slice_view_to_byte::>(array), + _ => Err(ArrowError::CastError(format!( + "Casting from {from_type:?} to {to_type:?} not supported", + ))), + }, (from_type, LargeUtf8) if from_type.is_primitive() => { value_to_string::(array, cast_options) } @@ -2417,6 +2436,37 @@ where Ok(Arc::new(byte_array_builder.finish())) } +/// Specialized function to cast from one `ByteViewType` array to `ByteArrayType` array. +/// Equvilent to [`cast_view_to_byte`] but with additional constraint on the `FROM::Native` type. +fn cast_slice_view_to_byte(array: &dyn Array) -> Result +where + FROM: ByteViewType, + TO: ByteArrayType, + FROM::Native: AsRef<[u8]>, + str: AsRef, +{ + let data = array.to_data(); + let view_array = GenericByteViewArray::::from(data); + + let len = view_array.len(); + let bytes = view_array + .views() + .iter() + .map(|v| ByteView::from(*v).length as usize) + .sum::(); + + let mut byte_array_builder = GenericByteBuilder::::with_capacity(len, bytes); + + for val in view_array.iter() { + let val = val + .map(|val| std::str::from_utf8(val.as_ref())) + .transpose()?; + byte_array_builder.append_option(val); + } + + Ok(Arc::new(byte_array_builder.finish())) +} + #[cfg(test)] mod tests { use arrow_buffer::{Buffer, IntervalDayTime, NullBuffer}; @@ -5281,11 +5331,22 @@ mod tests { &DataType::Utf8View )); + assert!(can_cast_types( + string_array.data_type(), + &DataType::BinaryView + )); + let string_view_array = cast(&string_array, &DataType::Utf8View).unwrap(); assert_eq!(string_view_array.data_type(), &DataType::Utf8View); + let binary_view_array = cast(&string_array, &DataType::BinaryView).unwrap(); + assert_eq!(binary_view_array.data_type(), &DataType::BinaryView); + let 
expect_string_view_array = StringViewArray::from_iter(VIEW_TEST_DATA); assert_eq!(string_view_array.as_ref(), &expect_string_view_array); + + let expect_binary_view_array = BinaryViewArray::from_iter(VIEW_TEST_DATA); + assert_eq!(binary_view_array.as_ref(), &expect_binary_view_array); } #[test] @@ -5380,7 +5441,7 @@ mod tests { where O: OffsetSizeTrait, { - let view_array = { + let string_view_array = { let mut builder = StringViewBuilder::new().with_fixed_block_size(8); // multiple buffers. for s in VIEW_TEST_DATA.iter() { builder.append_option(*s); @@ -5388,15 +5449,21 @@ mod tests { builder.finish() }; + let binary_view_array = BinaryViewArray::from_iter(VIEW_TEST_DATA); + let expected_string_array = GenericStringArray::::from_iter(VIEW_TEST_DATA); let expected_type = expected_string_array.data_type(); - assert!(can_cast_types(view_array.data_type(), expected_type)); + assert!(can_cast_types(string_view_array.data_type(), expected_type)); + assert!(can_cast_types(binary_view_array.data_type(), expected_type)); - let string_array = cast(&view_array, expected_type).unwrap(); - assert_eq!(string_array.data_type(), expected_type); + let string_view_casted_array = cast(&string_view_array, expected_type).unwrap(); + assert_eq!(string_view_casted_array.data_type(), expected_type); + assert_eq!(string_view_casted_array.as_ref(), &expected_string_array); - assert_eq!(string_array.as_ref(), &expected_string_array); + let binary_view_casted_array = cast(&binary_view_array, expected_type).unwrap(); + assert_eq!(binary_view_casted_array.data_type(), expected_type); + assert_eq!(binary_view_casted_array.as_ref(), &expected_string_array); } #[test] From 31b1fe4ba12b69e46c175fc00624601b9aaf8db5 Mon Sep 17 00:00:00 2001 From: Xin Li Date: Sun, 4 Aug 2024 18:14:24 +0800 Subject: [PATCH 2/4] update impl. 
and add bench mark --- arrow-cast/src/cast/mod.rs | 71 +++++++++++++++++------------------ arrow/benches/cast_kernels.rs | 39 +++++++++++++++++++ 2 files changed, 73 insertions(+), 37 deletions(-) diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs index 60934171572d..40bb9e2cda72 100644 --- a/arrow-cast/src/cast/mod.rs +++ b/arrow-cast/src/cast/mod.rs @@ -225,10 +225,11 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { | Timestamp(Millisecond, _) | Timestamp(Microsecond, _) | Timestamp(Nanosecond, _) - | Interval(_), + | Interval(_) + | BinaryView, ) => true, - (Utf8 | LargeUtf8, Utf8View | BinaryView) => true, - (BinaryView, Binary | LargeBinary | Utf8 | LargeUtf8 ) => true, + (Utf8 | LargeUtf8, Utf8View) => true, + (BinaryView, Binary | LargeBinary | Utf8 | LargeUtf8 | Utf8View ) => true, (Utf8 | LargeUtf8, _) => to_type.is_numeric() && to_type != &Float16, (_, Utf8 | LargeUtf8) => from_type.is_primitive(), @@ -1289,6 +1290,15 @@ pub fn cast_with_options( Date64 => parse_string_view::(array, cast_options), Binary => cast_view_to_byte::>(array), LargeBinary => cast_view_to_byte::>(array), + BinaryView => { + if let Some(arr) = array.as_any().downcast_ref::() { + Ok(Arc::new(arr.clone().to_binary_view())) + } else { + Err(ArrowError::CastError( + "Cannot cast StringView to BinaryView".to_string(), + )) + } + } Utf8 => cast_view_to_byte::>(array), LargeUtf8 => cast_view_to_byte::>(array), Time32(TimeUnit::Second) => parse_string_view::(array, cast_options), @@ -1430,8 +1440,27 @@ pub fn cast_with_options( (BinaryView, _) => match to_type { Binary => cast_view_to_byte::>(array), LargeBinary => cast_view_to_byte::>(array), - Utf8 => cast_slice_view_to_byte::>(array), - LargeUtf8 => cast_slice_view_to_byte::>(array), + Utf8 => { + let binary_arr = + cast_view_to_byte::>(array)?; + cast_binary_to_string::(&binary_arr, cast_options) + } + LargeUtf8 => { + let binary_arr = + cast_view_to_byte::>(array)?; + 
cast_binary_to_string::(&binary_arr, cast_options) + } + Utf8View => { + if let Some(arr) = array.as_any().downcast_ref::() { + arr.clone() + .to_string_view() + .map(|x| Arc::new(x) as ArrayRef) + } else { + Err(ArrowError::CastError( + "Cannot cast BinaryView to StringView".to_string(), + )) + } + } _ => Err(ArrowError::CastError(format!( "Casting from {from_type:?} to {to_type:?} not supported", ))), @@ -2027,7 +2056,6 @@ pub fn cast_with_options( })?, )) } - (Date64, Timestamp(TimeUnit::Second, None)) => Ok(Arc::new( array .as_primitive::() @@ -2436,37 +2464,6 @@ where Ok(Arc::new(byte_array_builder.finish())) } -/// Specialized function to cast from one `ByteViewType` array to `ByteArrayType` array. -/// Equvilent to [`cast_view_to_byte`] but with additional constraint on the `FROM::Native` type. -fn cast_slice_view_to_byte(array: &dyn Array) -> Result -where - FROM: ByteViewType, - TO: ByteArrayType, - FROM::Native: AsRef<[u8]>, - str: AsRef, -{ - let data = array.to_data(); - let view_array = GenericByteViewArray::::from(data); - - let len = view_array.len(); - let bytes = view_array - .views() - .iter() - .map(|v| ByteView::from(*v).length as usize) - .sum::(); - - let mut byte_array_builder = GenericByteBuilder::::with_capacity(len, bytes); - - for val in view_array.iter() { - let val = val - .map(|val| std::str::from_utf8(val.as_ref())) - .transpose()?; - byte_array_builder.append_option(val); - } - - Ok(Arc::new(byte_array_builder.finish())) -} - #[cfg(test)] mod tests { use arrow_buffer::{Buffer, IntervalDayTime, NullBuffer}; diff --git a/arrow/benches/cast_kernels.rs b/arrow/benches/cast_kernels.rs index 8803e8eea878..ec7990d3d764 100644 --- a/arrow/benches/cast_kernels.rs +++ b/arrow/benches/cast_kernels.rs @@ -114,6 +114,18 @@ fn build_decimal256_array(size: usize, precision: u8, scale: i8) -> ArrayRef { ) } +fn build_string_array(size: usize) -> ArrayRef { + let mut builder = StringBuilder::new(); + for v in 0..size { + match v % 3 { + 0 => 
builder.append_value("small"), + 1 => builder.append_value("larger string more than 12 bytes"), + _ => builder.append_null(), + } + } + Arc::new(builder.finish()) +} + fn build_dict_array(size: usize) -> ArrayRef { let values = StringArray::from_iter([ Some("small"), @@ -148,9 +160,12 @@ fn add_benchmark(c: &mut Criterion) { let decimal128_array = build_decimal128_array(512, 10, 3); let decimal256_array = build_decimal256_array(512, 50, 3); + let string_array = build_string_array(512); + let wide_string_array = cast(&string_array, &DataType::LargeUtf8).unwrap(); let dict_array = build_dict_array(10_000); let string_view_array = cast(&dict_array, &DataType::Utf8View).unwrap(); + let binary_view_array = cast(&string_view_array, &DataType::BinaryView).unwrap(); c.bench_function("cast int32 to int32 512", |b| { b.iter(|| cast_array(&i32_array, DataType::Int32)) @@ -262,6 +277,30 @@ fn add_benchmark(c: &mut Criterion) { ) }) }); + c.bench_function("cast string view to string", |b| { + b.iter(|| cast_array(&string_view_array, DataType::Utf8)) + }); + c.bench_function("cast string view to wide string", |b| { + b.iter(|| cast_array(&string_view_array, DataType::LargeUtf8)) + }); + c.bench_function("cast binary view to string", |b| { + b.iter(|| cast_array(&binary_view_array, DataType::Utf8)) + }); + c.bench_function("cast binary view to wide string", |b| { + b.iter(|| cast_array(&binary_view_array, DataType::LargeUtf8)) + }); + c.bench_function("cast string to binary view 512", |b| { + b.iter(|| cast_array(&string_array, DataType::BinaryView)) + }); + c.bench_function("cast wide string to binary view 512", |b| { + b.iter(|| cast_array(&wide_string_array, DataType::BinaryView)) + }); + c.bench_function("cast string view to binary view", |b| { + b.iter(|| cast_array(&string_view_array, DataType::BinaryView)) + }); + c.bench_function("cast binary view to string view", |b| { + b.iter(|| cast_array(&binary_view_array, DataType::Utf8View)) + }); } criterion_group!(benches, 
add_benchmark); From 9b3f00fad731a3bbd688e27df47236016e507c60 Mon Sep 17 00:00:00 2001 From: Xin Li Date: Mon, 5 Aug 2024 21:38:28 +0800 Subject: [PATCH 3/4] Add ut for views --- arrow-cast/src/cast/mod.rs | 48 +++++++++++++++++++++++++++++++++----- 1 file changed, 42 insertions(+), 6 deletions(-) diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs index 40bb9e2cda72..d237d9059b89 100644 --- a/arrow-cast/src/cast/mod.rs +++ b/arrow-cast/src/cast/mod.rs @@ -5303,12 +5303,6 @@ mod tests { } } - #[test] - fn test_string_to_view() { - _test_string_to_view::(); - _test_string_to_view::(); - } - const VIEW_TEST_DATA: [Option<&str>; 5] = [ Some("hello"), Some("repeated"), @@ -5317,6 +5311,48 @@ mod tests { Some("repeated"), ]; + #[test] + fn test_between_views() { + _test_string_view_to_binary_view(); + _test_binary_view_to_string_view(); + } + + fn _test_string_view_to_binary_view() { + let string_view_array = StringViewArray::from_iter(VIEW_TEST_DATA); + + assert!(can_cast_types( + string_view_array.data_type(), + &DataType::BinaryView + )); + + let binary_view_array = cast(&string_view_array, &DataType::BinaryView).unwrap(); + assert_eq!(binary_view_array.data_type(), &DataType::BinaryView); + + let expect_binary_view_array = BinaryViewArray::from_iter(VIEW_TEST_DATA); + assert_eq!(binary_view_array.as_ref(), &expect_binary_view_array); + } + + fn _test_binary_view_to_string_view() { + let binary_view_array = BinaryViewArray::from_iter(VIEW_TEST_DATA); + + assert!(can_cast_types( + binary_view_array.data_type(), + &DataType::Utf8View + )); + + let string_view_array = cast(&binary_view_array, &DataType::Utf8View).unwrap(); + assert_eq!(string_view_array.data_type(), &DataType::Utf8View); + + let expect_string_view_array = StringViewArray::from_iter(VIEW_TEST_DATA); + assert_eq!(string_view_array.as_ref(), &expect_string_view_array); + } + + #[test] + fn test_string_to_view() { + _test_string_to_view::(); + _test_string_to_view::(); + } + fn 
_test_string_to_view() where O: OffsetSizeTrait, From 79c47ace67e42ba89ef85329f528ce3ef0dd94fa Mon Sep 17 00:00:00 2001 From: Xin Li Date: Thu, 8 Aug 2024 16:24:51 +0800 Subject: [PATCH 4/4] Apply comments --- arrow-cast/src/cast/mod.rs | 76 +++++++++++++------------------------- 1 file changed, 25 insertions(+), 51 deletions(-) diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs index d237d9059b89..9f552ec72502 100644 --- a/arrow-cast/src/cast/mod.rs +++ b/arrow-cast/src/cast/mod.rs @@ -1230,13 +1230,9 @@ pub fn cast_with_options( cast_byte_container::(&binary) } Utf8View => Ok(Arc::new(StringViewArray::from(array.as_string::()))), - BinaryView => Ok(Arc::new(BinaryViewArray::from( - array - .as_string::() - .into_iter() - .map(|x| x.map(|x| x.as_bytes())) - .collect::>(), - ))), + BinaryView => Ok(Arc::new( + StringViewArray::from(array.as_string::()).to_binary_view(), + )), LargeUtf8 => cast_byte_container::(array), Time32(TimeUnit::Second) => parse_string::(array, cast_options), Time32(TimeUnit::Millisecond) => { @@ -1290,15 +1286,7 @@ pub fn cast_with_options( Date64 => parse_string_view::(array, cast_options), Binary => cast_view_to_byte::>(array), LargeBinary => cast_view_to_byte::>(array), - BinaryView => { - if let Some(arr) = array.as_any().downcast_ref::() { - Ok(Arc::new(arr.clone().to_binary_view())) - } else { - Err(ArrowError::CastError( - "Cannot cast StringView to BinaryView".to_string(), - )) - } - } + BinaryView => Ok(Arc::new(array.as_string_view().clone().to_binary_view())), Utf8 => cast_view_to_byte::>(array), LargeUtf8 => cast_view_to_byte::>(array), Time32(TimeUnit::Second) => parse_string_view::(array, cast_options), @@ -1437,34 +1425,24 @@ pub fn cast_with_options( "Casting from {from_type:?} to {to_type:?} not supported", ))), }, - (BinaryView, _) => match to_type { - Binary => cast_view_to_byte::>(array), - LargeBinary => cast_view_to_byte::>(array), - Utf8 => { - let binary_arr = - cast_view_to_byte::>(array)?; - 
cast_binary_to_string::(&binary_arr, cast_options) - } - LargeUtf8 => { - let binary_arr = - cast_view_to_byte::>(array)?; - cast_binary_to_string::(&binary_arr, cast_options) - } - Utf8View => { - if let Some(arr) = array.as_any().downcast_ref::() { - arr.clone() - .to_string_view() - .map(|x| Arc::new(x) as ArrayRef) - } else { - Err(ArrowError::CastError( - "Cannot cast BinaryView to StringView".to_string(), - )) - } - } - _ => Err(ArrowError::CastError(format!( - "Casting from {from_type:?} to {to_type:?} not supported", - ))), - }, + (BinaryView, Binary) => cast_view_to_byte::>(array), + (BinaryView, LargeBinary) => { + cast_view_to_byte::>(array) + } + (BinaryView, Utf8) => { + let binary_arr = cast_view_to_byte::>(array)?; + cast_binary_to_string::(&binary_arr, cast_options) + } + (BinaryView, LargeUtf8) => { + let binary_arr = cast_view_to_byte::>(array)?; + cast_binary_to_string::(&binary_arr, cast_options) + } + (BinaryView, Utf8View) => { + Ok(Arc::new(array.as_binary_view().clone().to_string_view()?) as ArrayRef) + } + (BinaryView, _) => Err(ArrowError::CastError(format!( + "Casting from {from_type:?} to {to_type:?} not supported", + ))), (from_type, LargeUtf8) if from_type.is_primitive() => { value_to_string::(array, cast_options) } @@ -5312,12 +5290,7 @@ mod tests { ]; #[test] - fn test_between_views() { - _test_string_view_to_binary_view(); - _test_binary_view_to_string_view(); - } - - fn _test_string_view_to_binary_view() { + fn test_string_view_to_binary_view() { let string_view_array = StringViewArray::from_iter(VIEW_TEST_DATA); assert!(can_cast_types( @@ -5332,7 +5305,8 @@ mod tests { assert_eq!(binary_view_array.as_ref(), &expect_binary_view_array); } - fn _test_binary_view_to_string_view() { + #[test] + fn test_binary_view_to_string_view() { let binary_view_array = BinaryViewArray::from_iter(VIEW_TEST_DATA); assert!(can_cast_types(