From e5aaa67b9540e13cd68ee0902c14d31c69f01362 Mon Sep 17 00:00:00 2001
From: Duong Cong Toai <35887761+duongcongtoai@users.noreply.github.com>
Date: Sun, 2 Feb 2025 19:45:07 +0700
Subject: [PATCH] test: add regression test for unnesting dictionary encoded columns (#14395)

* chore: add regression test for unnest dict encoded cols

* chore: use dataframe api for testing

* chore: rm unused dep
---
 datafusion/core/tests/dataframe/mod.rs | 74 ++++++++++++++++++++++++++
 1 file changed, 74 insertions(+)

diff --git a/datafusion/core/tests/dataframe/mod.rs b/datafusion/core/tests/dataframe/mod.rs
index c43fcf93f1e50..e570ec75c691a 100644
--- a/datafusion/core/tests/dataframe/mod.rs
+++ b/datafusion/core/tests/dataframe/mod.rs
@@ -39,6 +39,7 @@ use datafusion_functions_aggregate::count::count_udaf;
 use datafusion_functions_aggregate::expr_fn::{
     array_agg, avg, count, count_distinct, max, median, min, sum,
 };
+use datafusion_functions_nested::make_array::make_array_udf;
 use datafusion_functions_window::expr_fn::{first_value, row_number};
 use object_store::local::LocalFileSystem;
 use sqlparser::ast::NullTreatment;
@@ -3358,6 +3359,79 @@ async fn unnest_columns() -> Result<()> {
     Ok(())
 }
 
+/// Regression test: unnesting a `make_array` list built from a dictionary-encoded
+/// string column, both on its own and combined with a plain string literal.
+#[tokio::test]
+async fn unnest_dict_encoded_columns() -> Result<()> {
+    let strings = vec!["x", "y", "z"];
+    // Keys 0..len map one-to-one onto the values, so the column reads x, y, z.
+    let keys = Int32Array::from_iter(0..strings.len() as i32);
+
+    let utf8_values = StringArray::from(strings);
+    let utf8_dict = DictionaryArray::new(keys, Arc::new(utf8_values));
+
+    // make_array(dict_encoded_string)
+    let make_array_udf_expr1 = make_array_udf().call(vec![col("column1")]);
+    let batch =
+        RecordBatch::try_from_iter(vec![("column1", Arc::new(utf8_dict) as ArrayRef)])?;
+
+    let ctx = SessionContext::new();
+    ctx.register_batch("test", batch)?;
+    let df = ctx
+        .table("test")
+        .await?
+        .select(vec![
+            make_array_udf_expr1.alias("make_array_expr"),
+            col("column1"),
+        ])?
+        .unnest_columns(&["make_array_expr"])?;
+
+    let results = df.collect().await?;
+    // Cells are padded to the widths fixed by the border rows, because
+    // `assert_batches_eq!` compares the rendered table text line by line.
+    let expected = [
+        "+-----------------+---------+",
+        "| make_array_expr | column1 |",
+        "+-----------------+---------+",
+        "| x               | x       |",
+        "| y               | y       |",
+        "| z               | z       |",
+        "+-----------------+---------+",
+    ];
+    assert_batches_eq!(expected, &results);
+
+    // make_array(dict_encoded_string,literal string)
+    let make_array_udf_expr2 = make_array_udf().call(vec![
+        col("column1"),
+        lit(ScalarValue::new_utf8("fixed_string")),
+    ]);
+    let df = ctx
+        .table("test")
+        .await?
+        .select(vec![
+            make_array_udf_expr2.alias("make_array_expr"),
+            col("column1"),
+        ])?
+        .unnest_columns(&["make_array_expr"])?;
+
+    let results = df.collect().await?;
+    // Every input row yields two rows: its own value, then the literal.
+    let expected = [
+        "+-----------------+---------+",
+        "| make_array_expr | column1 |",
+        "+-----------------+---------+",
+        "| x               | x       |",
+        "| fixed_string    | x       |",
+        "| y               | y       |",
+        "| fixed_string    | y       |",
+        "| z               | z       |",
+        "| fixed_string    | z       |",
+        "+-----------------+---------+",
+    ];
+    assert_batches_eq!(expected, &results);
+    Ok(())
+}
+
 #[tokio::test]
 async fn unnest_column_nulls() -> Result<()> {
     let df = table_with_lists_and_nulls().await?;