diff --git a/datafusion/core/tests/parquet/arrow_statistics.rs b/datafusion/core/tests/parquet/arrow_statistics.rs index 2f8fbab64777..b378b2a6c3df 100644 --- a/datafusion/core/tests/parquet/arrow_statistics.rs +++ b/datafusion/core/tests/parquet/arrow_statistics.rs @@ -30,7 +30,8 @@ use arrow::datatypes::{ use arrow_array::{ make_array, Array, ArrayRef, BinaryArray, BooleanArray, Date32Array, Date64Array, Decimal128Array, Decimal256Array, FixedSizeBinaryArray, Float16Array, Float32Array, - Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, LargeBinaryArray, + Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, IntervalDayTimeArray, + IntervalMonthDayNanoArray, IntervalYearMonthArray, LargeBinaryArray, LargeStringArray, RecordBatch, StringArray, Time32MillisecondArray, Time32SecondArray, Time64MicrosecondArray, Time64NanosecondArray, TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray, @@ -1072,6 +1073,84 @@ async fn test_dates_64_diff_rg_sizes() { .run(); } +#[tokio::test] +#[should_panic] +// Currently this test `should_panic` since statistics for `Intervals` +// are not supported and `IntervalMonthDayNano` cannot be written +// to parquet yet. +// Refer to issue: https://github.com/apache/arrow-rs/issues/5847 +// and https://github.com/apache/arrow-rs/blob/master/parquet/src/arrow/arrow_writer/mod.rs#L747 +async fn test_interval_diff_rg_sizes() { + // This creates a parquet files of 3 columns: + // "year_month" --> IntervalYearMonthArray + // "day_time" --> IntervalDayTimeArray + // "month_day_nano" --> IntervalMonthDayNanoArray + // + // The file is created by 4 record batches (each has a null row) + // each has 5 rows but then will be split into 2 row groups with size 13, 7 + let reader = TestReader { + scenario: Scenario::Interval, + row_per_group: 13, + } + .build() + .await; + + // TODO: expected values need to be changed once issue is resolved + // expected_min: Arc::new(IntervalYearMonthArray::from(vec![ + // IntervalYearMonthType::make_value(1, 10), + // IntervalYearMonthType::make_value(4, 13), + // ])), + // expected_max: Arc::new(IntervalYearMonthArray::from(vec![ + // IntervalYearMonthType::make_value(6, 51), + // IntervalYearMonthType::make_value(8, 53), + // ])), + Test { + reader: &reader, + expected_min: Arc::new(IntervalYearMonthArray::from(vec![None, None])), + expected_max: Arc::new(IntervalYearMonthArray::from(vec![None, None])), + expected_null_counts: UInt64Array::from(vec![2, 2]), + expected_row_counts: UInt64Array::from(vec![13, 7]), + column_name: "year_month", + } + .run(); + + // expected_min: Arc::new(IntervalDayTimeArray::from(vec![ + // IntervalDayTimeType::make_value(1, 10), + // IntervalDayTimeType::make_value(4, 13), + // ])), + // expected_max: Arc::new(IntervalDayTimeArray::from(vec![ + // IntervalDayTimeType::make_value(6, 51), + // IntervalDayTimeType::make_value(8, 53), + // ])), + Test { + reader: &reader, + expected_min: Arc::new(IntervalDayTimeArray::from(vec![None, None])), + expected_max: Arc::new(IntervalDayTimeArray::from(vec![None, None])), + expected_null_counts: UInt64Array::from(vec![2, 2]), + expected_row_counts: UInt64Array::from(vec![13, 7]), + column_name: "day_time", + } + .run(); + + // expected_min: Arc::new(IntervalMonthDayNanoArray::from(vec![ + // IntervalMonthDayNanoType::make_value(1, 10, 100), + // IntervalMonthDayNanoType::make_value(4, 13, 103), + // ])), + // expected_max: Arc::new(IntervalMonthDayNanoArray::from(vec![ + // IntervalMonthDayNanoType::make_value(6, 51, 501), + // IntervalMonthDayNanoType::make_value(8, 53, 503), + // ])), + Test { + reader: &reader, + expected_min: Arc::new(IntervalMonthDayNanoArray::from(vec![None, None])), + expected_max: Arc::new(IntervalMonthDayNanoArray::from(vec![None, None])), + expected_null_counts: UInt64Array::from(vec![2, 2]), + expected_row_counts: UInt64Array::from(vec![13, 7]), + column_name: "month_day_nano", + } + .run(); +} + #[tokio::test] async fn test_uint() { // This creates a parquet files of 4 columns named "u8", "u16", "u32", "u64" diff --git a/datafusion/core/tests/parquet/mod.rs b/datafusion/core/tests/parquet/mod.rs index f36a9a194a8f..99769a336722 100644 --- a/datafusion/core/tests/parquet/mod.rs +++ b/datafusion/core/tests/parquet/mod.rs @@ -17,7 +17,9 @@ //! Parquet integration tests use arrow::array::Decimal128Array; -use arrow::datatypes::i256; +use arrow::datatypes::{ + i256, IntervalDayTimeType, IntervalMonthDayNanoType, IntervalYearMonthType, +}; use arrow::{ array::{ make_array, Array, ArrayRef, BinaryArray, BooleanArray, Date32Array, Date64Array, @@ -33,6 +35,10 @@ use arrow::{ record_batch::RecordBatch, util::pretty::pretty_format_batches, }; +use arrow_array::{ + IntervalDayTimeArray, IntervalMonthDayNanoArray, IntervalYearMonthArray, +}; +use arrow_schema::IntervalUnit; use chrono::{Datelike, Duration, TimeDelta}; use datafusion::{ datasource::{physical_plan::ParquetExec, provider_as_source, TableProvider}, @@ -80,6 +86,7 @@ enum Scenario { Time32Millisecond, Time64Nanosecond, Time64Microsecond, + Interval, /// 7 Rows, for each i8, i16, i32, i64, u8, u16, u32, u64, f32, f64 /// -MIN, -100, -1, 0, 1, 100, MAX NumericLimits, @@ -925,6 +932,71 @@ fn make_dict_batch() -> RecordBatch { .unwrap() } +fn make_interval_batch(offset: i32) -> RecordBatch { + let schema = Schema::new(vec![ + Field::new( + "year_month", + DataType::Interval(IntervalUnit::YearMonth), + true, + ), + Field::new("day_time", DataType::Interval(IntervalUnit::DayTime), true), + Field::new( + "month_day_nano", + DataType::Interval(IntervalUnit::MonthDayNano), + true, + ), + ]); + let schema = Arc::new(schema); + + let ym_arr = IntervalYearMonthArray::from(vec![ + Some(IntervalYearMonthType::make_value(1 + offset, 10 + offset)), + Some(IntervalYearMonthType::make_value(2 + offset, 20 + offset)), + Some(IntervalYearMonthType::make_value(3 + offset, 30 + offset)), + None, + Some(IntervalYearMonthType::make_value(5 + offset, 50 + offset)), + ]); + + let dt_arr = IntervalDayTimeArray::from(vec![ + Some(IntervalDayTimeType::make_value(1 + offset, 10 + offset)), + Some(IntervalDayTimeType::make_value(2 + offset, 20 + offset)), + Some(IntervalDayTimeType::make_value(3 + offset, 30 + offset)), + None, + Some(IntervalDayTimeType::make_value(5 + offset, 50 + offset)), + ]); + + // Not yet implemented, refer to: + // https://github.com/apache/arrow-rs/blob/master/parquet/src/arrow/arrow_writer/mod.rs#L747 + let mdn_arr = IntervalMonthDayNanoArray::from(vec![ + Some(IntervalMonthDayNanoType::make_value( + 1 + offset, + 10 + offset, + 100 + (offset as i64), + )), + Some(IntervalMonthDayNanoType::make_value( + 2 + offset, + 20 + offset, + 200 + (offset as i64), + )), + Some(IntervalMonthDayNanoType::make_value( + 3 + offset, + 30 + offset, + 300 + (offset as i64), + )), + None, + Some(IntervalMonthDayNanoType::make_value( + 5 + offset, + 50 + offset, + 500 + (offset as i64), + )), + ]); + + RecordBatch::try_new( + schema, + vec![Arc::new(ym_arr), Arc::new(dt_arr), Arc::new(mdn_arr)], + ) + .unwrap() +} + fn create_data_batch(scenario: Scenario) -> Vec { match scenario { Scenario::Boolean => { @@ -1346,6 +1418,12 @@ fn create_data_batch(scenario: Scenario) -> Vec { ]), ] } + Scenario::Interval => vec![ + make_interval_batch(0), + make_interval_batch(1), + make_interval_batch(2), + make_interval_batch(3), + ], } }