diff --git a/datafusion/sqllogictest/Cargo.toml b/datafusion/sqllogictest/Cargo.toml
index c1dc41196b36..1077778756af 100644
--- a/datafusion/sqllogictest/Cargo.toml
+++ b/datafusion/sqllogictest/Cargo.toml
@@ -82,3 +82,7 @@ tokio = { workspace = true, features = ["rt-multi-thread"] }
 harness = false
 name = "sqllogictests"
 path = "bin/sqllogictests.rs"
+
+[[bin]]
+name = "diff"
+path = "bin/diff.rs"
diff --git a/datafusion/sqllogictest/test_files/aggregate.slt b/datafusion/sqllogictest/archive/complete_aggregate.slt
similarity index 100%
rename from datafusion/sqllogictest/test_files/aggregate.slt
rename to datafusion/sqllogictest/archive/complete_aggregate.slt
diff --git a/datafusion/sqllogictest/bin/diff.rs b/datafusion/sqllogictest/bin/diff.rs
new file mode 100644
index 000000000000..434cd08fe3a1
--- /dev/null
+++ b/datafusion/sqllogictest/bin/diff.rs
@@ -0,0 +1,191 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use sqllogictest::parser::parse_file;
+use sqllogictest::DefaultColumnType;
+use sqllogictest::Record;
+use sqllogictest::Record::{Query, Statement};
+use std::error::Error;
+use std::fs;
+use std::path::Path;
+
+// Run inside `datafusion/sqllogictest`, or from the workspace root with `-p datafusion-sqllogictest`:
+// cargo run --bin diff -- path1.slt path2(.slt or folder)
+pub fn main() -> Result<(), Box<dyn Error>> {
+    let args: Vec<String> = std::env::args().collect();
+
+    // Require two path arguments: the "needles" and the "haystack"
+    if args.len() < 3 {
+        return Err("Please provide two arguments: a file path or folder, and a second file or folder.".into());
+    }
+
+    let path1 = Path::new(&args[1]);
+    let path2 = Path::new(&args[2]);
+
+    let result = diff(path1, path2);
+    println!("{:?}", result);
+    Ok(())
+}
+
+pub fn diff(path1: &Path, path2: &Path) -> Result<String, Box<dyn Error>> {
+    println!("Needles: {:?}", path1);
+    println!("Haystack: {:?}", path2);
+
+    // Parse the records from the first file
+    let records1 = parse_file(path1)?;
+
+    let records2 = if path2.is_dir() {
+        // If the second path is a directory, parse all files in that directory
+        parse_files_in_directory(path2)?
+    } else {
+        // If the second path is a file, just parse that file
+        parse_file(path2)?
+    };
+    let mut diffs = Vec::new();
+
+    // Check that each record in file 1 is contained in some record in file 2
+    for record1 in &records1 {
+        let found = if !check_type(record1) {
+            false
+        } else {
+            records2
+                .iter()
+                .any(|record2| check_equality(record1, record2))
+        };
+        if check_type(record1) && !found {
+            diffs.push(format!(
+                "Record from Needles not found in Haystack: {:?}",
+                get_sql(record1)
+            ));
+        }
+    }
+    // If we have collected any differences, return them all at once
+    if !diffs.is_empty() {
+        Ok(diffs.join("\n"))
+    } else {
+        Ok("All records from Needles are present in Haystack.".to_string())
+    }
+}
+
+fn get_sql(record: &Record<DefaultColumnType>) -> String {
+    match record {
+        Query { sql, .. } => sql.clone(),
+        Statement { sql, .. } => sql.clone(),
+        _ => String::new(),
+    }
+}
+
+pub fn check_type(record1: &Record<DefaultColumnType>) -> bool {
+    // True only for the record types that check_equality can compare (Query and Statement)
+    check_equality(record1, record1)
+}
+
+pub fn check_equality(
+    record1: &Record<DefaultColumnType>,
+    record2: &Record<DefaultColumnType>,
+) -> bool {
+    match (record1, record2) {
+        (
+            Query {
+                loc: _,
+                conditions: _,
+                connection: _,
+                sql: sql1,
+                expected: expected1,
+                retry: _,
+            },
+            Query {
+                loc: _,
+                conditions: _,
+                connection: _,
+                sql: sql2,
+                expected: expected2,
+                retry: _,
+            },
+        ) => sql1 == sql2 && expected1 == expected2,
+        (
+            Statement {
+                loc: _,
+                conditions: _,
+                connection: _,
+                sql: sql1,
+                expected: expected1,
+                retry: _,
+            },
+            Statement {
+                loc: _,
+                conditions: _,
+                connection: _,
+                sql: sql2,
+                expected: expected2,
+                retry: _,
+            },
+        ) => sql1 == sql2 && expected1 == expected2,
+        _ => false,
+    }
+}
+
+// Warning: This is not recursive; it can be made recursive in the future if needed.
+fn parse_files_in_directory(
+    directory: &Path,
+) -> Result<Vec<Record<DefaultColumnType>>, Box<dyn Error>> {
+    let mut all_records = Vec::new();
+
+    // Read all files in the directory
+    for entry in fs::read_dir(directory)? {
+        let entry = entry?;
+        let path = entry.path();
+
+        // Only process `.slt` files (not directories)
+        if path.is_file() && path.extension().map(|ext| ext == "slt").unwrap_or(false) {
+            let records = parse_file(&path)?;
+            all_records.extend(records); // Add the records from this file
+        }
+    }
+
+    Ok(all_records)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_files_aggregate_diff_archived() {
+        // Every record in the archived file must be present in the aggregate folder
+        let path1 = Path::new("./archive/complete_aggregate.slt");
+        let path2 = Path::new("./test_files/aggregate");
+        let result = diff(path1, path2);
+        assert_eq!(
+            result.unwrap(),
+            "All records from Needles are present in Haystack.".to_string()
+        )
+    }
+
+    #[test]
+    fn test_files_aggregate_diff_base() {
+        // Every record in base_aggregate.slt must be present in the archived file
+        let path1 = Path::new("./test_files/aggregate/base_aggregate.slt");
+        let path2 = Path::new("./archive/complete_aggregate.slt");
+        let result = diff(path1, path2);
+        assert_eq!(
+            result.unwrap(),
+            "All records from Needles are present in Haystack.".to_string()
+        )
+    }
+}
diff --git a/datafusion/sqllogictest/test_files/aggregate/README.md b/datafusion/sqllogictest/test_files/aggregate/README.md
new file mode 100644
index 000000000000..8a1a321bcbc3
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/aggregate/README.md
@@ -0,0 +1,14 @@
+# Aggregate Tests
+
+##### History and Context:
+
+Aggregate used to be (and perhaps still is, depending on the progress of issue [#13723](https://github.com/apache/datafusion/issues/13723)) one of the largest `.slt` files, and hard to navigate.
+We have decided to refactor it for better navigation, using an `extract and subtract` approach.
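+
+The `diff` binary added alongside this refactor can check mechanically that no test case is lost; a sketch of the invocation (the command form comes from the comment at the top of `bin/diff.rs`, and the paths are the ones used by its tests; run from the `datafusion/sqllogictest` directory):
+
+```shell
+# "Needles" = the archived original, "Haystack" = the new aggregate folder;
+# every Needles record missing from the Haystack is reported.
+cargo run --bin diff -- archive/complete_aggregate.slt test_files/aggregate
+```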
+
+Formally, `base_aggregate.slt` starts with all the test cases of the original `aggregate.slt`, which currently lives in `datafusion/sqllogictest/archive` as `complete_aggregate.slt`.
+
+Gradually, as we move out (`extract`) different portions of the tests, we remove (`subtract`) only that portion from `base_aggregate.slt`.
+
+This should be done in such a manner that, at all times, the set of all tests in the `aggregate` folder is a superset of all tests covered in `datafusion/sqllogictest/archive/complete_aggregate.slt`.
+
+Refer to [#14301](https://github.com/apache/datafusion/pull/14301) and [#14306](https://github.com/apache/datafusion/pull/14306) for more context and details.
diff --git a/datafusion/sqllogictest/test_files/aggregate/base_aggregate.slt b/datafusion/sqllogictest/test_files/aggregate/base_aggregate.slt
new file mode 100644
index 000000000000..187b7f5ffdd1
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/aggregate/base_aggregate.slt
@@ -0,0 +1,6158 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#######
+# Setup test data table
+#######
+include ./init.slt.part
+
+#######
+# Error tests
+#######
+
+# https://github.com/apache/datafusion/issues/3353
+statement error DataFusion error: Schema error: Schema contains duplicate unqualified field name "approx_distinct\(aggregate_test_100\.c9\)"
+SELECT approx_distinct(c9) count_c9, approx_distinct(cast(c9 as varchar)) count_c9_str FROM aggregate_test_100
+
+# csv_query_approx_percentile_cont_with_weight
+statement error DataFusion error: Error during planning: Failed to coerce arguments to satisfy a call to approx_percentile_cont_with_weight function: coercion from \[Utf8, Int8, Float64\] to the signature OneOf(.*) failed(.|\n)*
+SELECT approx_percentile_cont_with_weight(c1, c2, 0.95) FROM aggregate_test_100
+
+statement error DataFusion error: Error during planning: Failed to coerce arguments to satisfy a call to approx_percentile_cont_with_weight function: coercion from \[Int16, Utf8, Float64\] to the signature OneOf(.*) failed(.|\n)*
+SELECT approx_percentile_cont_with_weight(c3, c1, 0.95) FROM aggregate_test_100
+
+statement error DataFusion error: Error during planning: Failed to coerce arguments to satisfy a call to approx_percentile_cont_with_weight function: coercion from \[Int16, Int8, Utf8\] to the signature OneOf(.*) failed(.|\n)*
+SELECT approx_percentile_cont_with_weight(c3, c2, c1) FROM aggregate_test_100
+
+# csv_query_approx_percentile_cont_with_histogram_bins
+statement error DataFusion error: External error: This feature is not implemented: Tdigest max_size value for 'APPROX_PERCENTILE_CONT' must be UInt > 0 literal \(got data type Int64\)\.
+SELECT c1, approx_percentile_cont(c3, 0.95, -1000) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1 + +statement error DataFusion error: Error during planning: Failed to coerce arguments to satisfy a call to approx_percentile_cont function: coercion from \[Int16, Float64, Utf8\] to the signature OneOf(.*) failed(.|\n)* +SELECT approx_percentile_cont(c3, 0.95, c1) FROM aggregate_test_100 + +statement error DataFusion error: Error during planning: Failed to coerce arguments to satisfy a call to approx_percentile_cont function: coercion from \[Int16, Float64, Float64\] to the signature OneOf(.*) failed(.|\n)* +SELECT approx_percentile_cont(c3, 0.95, 111.1) FROM aggregate_test_100 + +statement error DataFusion error: Error during planning: Failed to coerce arguments to satisfy a call to approx_percentile_cont function: coercion from \[Float64, Float64, Float64\] to the signature OneOf(.*) failed(.|\n)* +SELECT approx_percentile_cont(c12, 0.95, 111.1) FROM aggregate_test_100 + +statement error DataFusion error: This feature is not implemented: Percentile value for 'APPROX_PERCENTILE_CONT' must be a literal +SELECT approx_percentile_cont(c12, c12) FROM aggregate_test_100 + +statement error DataFusion error: This feature is not implemented: Tdigest max_size value for 'APPROX_PERCENTILE_CONT' must be a literal +SELECT approx_percentile_cont(c12, 0.95, c5) FROM aggregate_test_100 + +# Not supported over sliding windows +query error This feature is not implemented: Aggregate can not be used as a sliding accumulator because `retract_batch` is not implemented +SELECT approx_percentile_cont(c3, 0.5) OVER (ROWS BETWEEN 4 PRECEDING AND CURRENT ROW) +FROM aggregate_test_100 + +# array agg can use order by +query ? +SELECT array_agg(c13 ORDER BY c13) +FROM + (SELECT * + FROM aggregate_test_100 + ORDER BY c13 + LIMIT 5) as t1 +---- +[0VVIHzxWtNOFLtnhjHEKjXaJOSLJfm, 0keZ5G8BffGwgF2RwQD59TFzMStxCB, 0og6hSkhbX8AC1ktFS4kounvTzy8Vo, 1aOcrEGd0cOqZe2I5XBOm0nDcwtBZO, 2T3wSlHdEmASmO0xcXHnndkKEt6bz8] + +statement ok +CREATE EXTERNAL TABLE agg_order ( +c1 INT NOT NULL, +c2 INT NOT NULL, +c3 INT NOT NULL +) +STORED AS CSV +LOCATION '../core/tests/data/aggregate_agg_multi_order.csv' +OPTIONS ('format.has_header' 'true'); + +# test array_agg with order by multiple columns +query ? 
+select array_agg(c1 order by c2 desc, c3) from agg_order; +---- +[5, 6, 7, 8, 9, 1, 2, 3, 4, 10] + +query TT +explain select array_agg(c1 order by c2 desc, c3) from agg_order; +---- +logical_plan +01)Aggregate: groupBy=[[]], aggr=[[array_agg(agg_order.c1) ORDER BY [agg_order.c2 DESC NULLS FIRST, agg_order.c3 ASC NULLS LAST]]] +02)--TableScan: agg_order projection=[c1, c2, c3] +physical_plan +01)AggregateExec: mode=Final, gby=[], aggr=[array_agg(agg_order.c1) ORDER BY [agg_order.c2 DESC NULLS FIRST, agg_order.c3 ASC NULLS LAST]] +02)--CoalescePartitionsExec +03)----AggregateExec: mode=Partial, gby=[], aggr=[array_agg(agg_order.c1) ORDER BY [agg_order.c2 DESC NULLS FIRST, agg_order.c3 ASC NULLS LAST]] +04)------SortExec: expr=[c2@1 DESC, c3@2 ASC NULLS LAST], preserve_partitioning=[true] +05)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +06)----------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/aggregate_agg_multi_order.csv]]}, projection=[c1, c2, c3], has_header=true + +# test array_agg_order with list data type +statement ok +CREATE TABLE array_agg_order_list_table AS VALUES + ('w', 2, [1,2,3], 10), + ('w', 1, [9,5,2], 20), + ('w', 1, [3,2,5], 30), + ('b', 2, [4,5,6], 20), + ('b', 1, [7,8,9], 30) +; + +query T? rowsort +select column1, array_agg(column3 order by column2, column4 desc) from array_agg_order_list_table group by column1; +---- +b [[7, 8, 9], [4, 5, 6]] +w [[3, 2, 5], [9, 5, 2], [1, 2, 3]] + +query T?? rowsort +select column1, first_value(column3 order by column2, column4 desc), last_value(column3 order by column2, column4 desc) from array_agg_order_list_table group by column1; +---- +b [7, 8, 9] [4, 5, 6] +w [3, 2, 5] [1, 2, 3] + +query T? rowsort +select column1, nth_value(column3, 2 order by column2, column4 desc) from array_agg_order_list_table group by column1; +---- +b [4, 5, 6] +w [9, 5, 2] + +statement ok +drop table array_agg_order_list_table; + +# test array_agg_distinct with list data type +statement ok +CREATE TABLE array_agg_distinct_list_table AS VALUES + ('w', [0,1]), + ('w', [0,1]), + ('w', [1,0]), + ('b', [1,0]), + ('b', [1,0]), + ('b', [1,0]), + ('b', [0,1]) +; + +# Apply array_sort to have deterministic result, higher dimension nested array also works but not for array sort, +# so they are covered in `datafusion/functions-aggregate/src/array_agg.rs` +query ?? 
+select array_sort(c1), array_sort(c2) from ( + select array_agg(distinct column1) as c1, array_agg(distinct column2) as c2 from array_agg_distinct_list_table +); +---- +[b, w] [[0, 1], [1, 0]] + +statement ok +drop table array_agg_distinct_list_table; + +statement error This feature is not implemented: Calling array_agg: LIMIT not supported in function arguments: 1 +SELECT array_agg(c13 LIMIT 1) FROM aggregate_test_100 + + +# Test distinct aggregate function with merge batch +query II +with A as ( + select 1 as id, 2 as foo + UNION ALL + select 1, null + UNION ALL + select 1, null + UNION ALL + select 1, 3 + UNION ALL + select 1, 2 + ---- The order is non-deterministic, verify with length +) select array_length(array_agg(distinct a.foo)), sum(distinct 1) from A a group by a.id; +---- +3 1 + +# It has only AggregateExec with FinalPartitioned mode, so `merge_batch` is used +# If the plan is changed, whether the `merge_batch` is used should be verified to ensure the test coverage +query TT +explain with A as ( + select 1 as id, 2 as foo + UNION ALL + select 1, null + UNION ALL + select 1, null + UNION ALL + select 1, 3 + UNION ALL + select 1, 2 +) select array_length(array_agg(distinct a.foo)), sum(distinct 1) from A a group by a.id; +---- +logical_plan +01)Projection: array_length(array_agg(DISTINCT a.foo)), sum(DISTINCT Int64(1)) +02)--Aggregate: groupBy=[[a.id]], aggr=[[array_agg(DISTINCT a.foo), sum(DISTINCT Int64(1))]] +03)----SubqueryAlias: a +04)------SubqueryAlias: a +05)--------Union +06)----------Projection: Int64(1) AS id, Int64(2) AS foo +07)------------EmptyRelation +08)----------Projection: Int64(1) AS id, Int64(NULL) AS foo +09)------------EmptyRelation +10)----------Projection: Int64(1) AS id, Int64(NULL) AS foo +11)------------EmptyRelation +12)----------Projection: Int64(1) AS id, Int64(3) AS foo +13)------------EmptyRelation +14)----------Projection: Int64(1) AS id, Int64(2) AS foo +15)------------EmptyRelation +physical_plan +01)ProjectionExec: expr=[array_length(array_agg(DISTINCT a.foo)@1) as array_length(array_agg(DISTINCT a.foo)), sum(DISTINCT Int64(1))@2 as sum(DISTINCT Int64(1))] +02)--AggregateExec: mode=FinalPartitioned, gby=[id@0 as id], aggr=[array_agg(DISTINCT a.foo), sum(DISTINCT Int64(1))], ordering_mode=Sorted +03)----CoalesceBatchesExec: target_batch_size=8192 +04)------RepartitionExec: partitioning=Hash([id@0], 4), input_partitions=5 +05)--------AggregateExec: mode=Partial, gby=[id@0 as id], aggr=[array_agg(DISTINCT a.foo), sum(DISTINCT Int64(1))], ordering_mode=Sorted +06)----------UnionExec +07)------------ProjectionExec: expr=[1 as id, 2 as foo] +08)--------------PlaceholderRowExec +09)------------ProjectionExec: expr=[1 as id, NULL as foo] +10)--------------PlaceholderRowExec +11)------------ProjectionExec: expr=[1 as id, NULL as foo] +12)--------------PlaceholderRowExec +13)------------ProjectionExec: expr=[1 as id, 3 as foo] +14)--------------PlaceholderRowExec +15)------------ProjectionExec: expr=[1 as id, 2 as foo] +16)--------------PlaceholderRowExec + + +# FIX: custom absolute values +# csv_query_avg_multi_batch + +# csv_query_avg +query R +SELECT avg(c12) FROM aggregate_test_100 +---- +0.508972509913 + +# csv_query_bit_and +query IIIII +SELECT bit_and(c5), bit_and(c6), bit_and(c7), bit_and(c8), bit_and(c9) FROM aggregate_test_100 +---- +0 0 0 0 0 + +# csv_query_bit_and_distinct +query IIIII +SELECT bit_and(distinct c5), bit_and(distinct c6), bit_and(distinct c7), bit_and(distinct c8), bit_and(distinct c9) FROM aggregate_test_100 +---- +0 0 0 0 0 
+ +# csv_query_bit_or +query IIIII +SELECT bit_or(c5), bit_or(c6), bit_or(c7), bit_or(c8), bit_or(c9) FROM aggregate_test_100 +---- +-1 -1 255 65535 4294967295 + +# csv_query_bit_or_distinct +query IIIII +SELECT bit_or(distinct c5), bit_or(distinct c6), bit_or(distinct c7), bit_or(distinct c8), bit_or(distinct c9) FROM aggregate_test_100 +---- +-1 -1 255 65535 4294967295 + +# csv_query_bit_xor +query IIIII +SELECT bit_xor(c5), bit_xor(c6), bit_xor(c7), bit_xor(c8), bit_xor(c9) FROM aggregate_test_100 +---- +1632751011 5960911605712039654 148 54789 169634700 + +# csv_query_bit_xor_distinct (should be different than above) +query IIIII +SELECT bit_xor(distinct c5), bit_xor(distinct c6), bit_xor(distinct c7), bit_xor(distinct c8), bit_xor(distinct c9) FROM aggregate_test_100 +---- +1632751011 5960911605712039654 196 54789 169634700 + +# csv_query_bit_xor_distinct_expr +query I +SELECT bit_xor(distinct c5 % 2) FROM aggregate_test_100 +---- +-2 + +# csv_query_covariance_1 +query R +SELECT covar_pop(c2, c12) FROM aggregate_test_100 +---- +-0.079169322354 + +# csv_query_covariance_2 +query R +SELECT covar(c2, c12) FROM aggregate_test_100 +---- +-0.079969012479 + +# single_row_query_covar_1 +query R +select covar_samp(sq.column1, sq.column2) from (values (1.1, 2.2)) as sq +---- +NULL + +# single_row_query_covar_2 +query R +select covar_pop(sq.column1, sq.column2) from (values (1.1, 2.2)) as sq +---- +0 + +# all_nulls_query_covar +query RR +with data as ( + select null::int as f, null::int as b + union all + select null::int as f, null::int as b +) +select covar_samp(f, b), covar_pop(f, b) +from data +---- +NULL NULL + +# covar_query_with_nulls +query RR +with data as ( + select 1 as f, 4 as b + union all + select null as f, 99 as b + union all + select 2 as f, 5 as b + union all + select 98 as f, null as b + union all + select 3 as f, 6 as b + union all + select null as f, null as b +) +select covar_samp(f, b), covar_pop(f, b) +from data +---- +1 0.666666666667 + +# csv_query_correlation +query R +SELECT corr(c2, c12) FROM aggregate_test_100 +---- +-0.190645441906 + +# single_row_query_correlation +query R +select corr(sq.column1, sq.column2) from (values (1.1, 2.2)) as sq +---- +0 + +# all_nulls_query_correlation +query R +with data as ( + select null::int as f, null::int as b + union all + select null::int as f, null::int as b +) +select corr(f, b) +from data +---- +NULL + +# correlation_query_with_nulls +query R +with data as ( + select 1 as f, 4 as b + union all + select null as f, 99 as b + union all + select 2 as f, 5 as b + union all + select 98 as f, null as b + union all + select 3 as f, 6 as b + union all + select null as f, null as b +) +select corr(f, b) +from data +---- +1 + +# csv_query_variance_1 +query R +SELECT var_pop(c2) FROM aggregate_test_100 +---- +1.8675 + +# csv_query_variance_2 +query R +SELECT var_pop(c6) FROM aggregate_test_100 +---- +26156334342021890000000000000000000000 + +# csv_query_variance_3 +query R +SELECT var_pop(c12) FROM aggregate_test_100 +---- +0.092342237216 + +# csv_query_variance_4 +query R +SELECT var(c2) FROM aggregate_test_100 +---- +1.886363636364 + +# csv_query_distinct_variance +query R +SELECT var(distinct c2) FROM aggregate_test_100 +---- +2.5 + +statement error DataFusion error: This feature is not implemented: VAR\(DISTINCT\) aggregations are not available +SELECT var(c2), var(distinct c2) FROM aggregate_test_100 + +# csv_query_distinct_variance_population +query R +SELECT var_pop(distinct c2) FROM aggregate_test_100 +---- +2 + +statement error 
DataFusion error: This feature is not implemented: VAR_POP\(DISTINCT\) aggregations are not available
+SELECT var_pop(c2), var_pop(distinct c2) FROM aggregate_test_100
+
+# csv_query_variance_5
+query R
+SELECT var_samp(c2) FROM aggregate_test_100
+----
+1.886363636364
+
+# csv_query_stddev_1
+query R
+SELECT stddev_pop(c2) FROM aggregate_test_100
+----
+1.366565036872
+
+# csv_query_stddev_2
+query R
+SELECT stddev_pop(c6) FROM aggregate_test_100
+----
+5114326382039172000
+
+# csv_query_stddev_3
+query R
+SELECT stddev_pop(c12) FROM aggregate_test_100
+----
+0.303878655413
+
+# csv_query_stddev_4
+query R
+SELECT stddev(c12) FROM aggregate_test_100
+----
+0.305409539941
+
+# csv_query_stddev_5
+query R
+SELECT stddev_samp(c12) FROM aggregate_test_100
+----
+0.305409539941
+
+# csv_query_stddev_6
+query R
+select stddev(sq.column1) from (values (1.1), (2.0), (3.0)) as sq
+----
+0.950438495292
+
+# csv_query_stddev_7
+query IR
+SELECT c2, stddev_samp(c12) FROM aggregate_test_100 GROUP BY c2 ORDER BY c2
+----
+1 0.303641032262
+2 0.284581967411
+3 0.296002660506
+4 0.284324609109
+5 0.331034486752
+
+# csv_query_stddev_8
+query IR
+SELECT c2, stddev_pop(c12) FROM aggregate_test_100 GROUP BY c2 ORDER BY c2
+----
+1 0.296659845456
+2 0.278038978602
+3 0.288107833475
+4 0.278074953424
+5 0.318992813225
+
+# csv_query_stddev_9
+query IR
+SELECT c2, var_pop(c12) FROM aggregate_test_100 GROUP BY c2 ORDER BY c2
+----
+1 0.088007063906
+2 0.077305673622
+3 0.083006123709
+4 0.077325679722
+5 0.101756414889
+
+# csv_query_stddev_10
+query IR
+SELECT c2, var_samp(c12) FROM aggregate_test_100 GROUP BY c2 ORDER BY c2
+----
+1 0.092197876473
+2 0.080986896176
+3 0.087617575027
+4 0.080840483345
+5 0.109583831419
+
+# csv_query_stddev_11
+query IR
+SELECT c2, var_samp(c12) FROM aggregate_test_100 WHERE c12 > 0.90 GROUP BY c2 ORDER BY c2
+----
+1 0.000889240174
+2 0.000785878272
+3 NULL
+4 NULL
+5 0.000269544643
+
+# Use the PostgreSQL dialect
+statement ok
+set datafusion.sql_parser.dialect = 'Postgres';
+
+# csv_query_stddev_12
+query IR
+SELECT c2, var_samp(c12) FILTER (WHERE c12 > 0.95) FROM aggregate_test_100 GROUP BY c2 ORDER BY c2
+----
+1 0.000791243479
+2 0.000061521903
+3 NULL
+4 NULL
+5 NULL
+
+# Restore the default dialect
+statement ok
+set datafusion.sql_parser.dialect = 'Generic';
+
+# csv_query_stddev_13
+query IR
+SELECT c2, var_samp(CASE WHEN c12 > 0.90 THEN c12 ELSE null END) FROM aggregate_test_100 GROUP BY c2 ORDER BY c2
+----
+1 0.000889240174
+2 0.000785878272
+3 NULL
+4 NULL
+5 0.000269544643
+
+
+# csv_query_approx_median_1
+query I
+SELECT approx_median(c2) FROM aggregate_test_100
+----
+3
+
+# csv_query_approx_median_2
+query I
+SELECT approx_median(c6) FROM aggregate_test_100
+----
+1146409980542786560
+
+# csv_query_approx_median_3
+query R
+SELECT approx_median(c12) FROM aggregate_test_100
+----
+0.555006541052
+
+# csv_query_approx_median_4
+# test with string, approx median only supports numeric
+statement error
+SELECT approx_median(c1) FROM aggregate_test_100
+
+# csv_query_median_1
+query I
+SELECT median(c2) FROM aggregate_test_100
+----
+3
+
+# csv_query_median_2
+query I
+SELECT median(c6) FROM aggregate_test_100
+----
+1125553990140691277
+
+# csv_query_median_3
+query R
+SELECT median(c12) FROM aggregate_test_100
+----
+0.551390054439
+
+# median_i8
+query I
+SELECT median(col_i8) FROM median_table
+----
+-14
+
+# distinct_median_i8
+query I
+SELECT median(distinct col_i8) FROM median_table
+----
+100
+
+query II
+SELECT median(col_i8), median(distinct col_i8) FROM
median_table +---- +-14 100 + +# approx_distinct_median_i8 +query I +SELECT approx_median(distinct col_i8) FROM median_table +---- +100 + +statement error DataFusion error: This feature is not implemented: APPROX_MEDIAN\(DISTINCT\) aggregations are not available +SELECT approx_median(col_i8), approx_median(distinct col_i8) FROM median_table + +# median_i16 +query I +SELECT median(col_i16) FROM median_table +---- +-16334 + +# median_i32 +query I +SELECT median(col_i32) FROM median_table +---- +-1073741774 + +# median_i64 +query I +SELECT median(col_i64) FROM median_table +---- +-4611686018427387854 + +# median_u8 +query I +SELECT median(col_u8) FROM median_table +---- +50 + +# median_u16 +query I +SELECT median(col_u16) FROM median_table +---- +50 + +# median_u32 +query I +SELECT median(col_u32) FROM median_table +---- +50 + +# median_u64 +query I +SELECT median(col_u64) FROM median_table +---- +50 + +# median_f32 +query R +SELECT median(col_f32) FROM median_table +---- +2.75 + +# median_f64 +query R +SELECT median(col_f64) FROM median_table +---- +2.75 + +# median_f64_nan +query R +SELECT median(col_f64_nan) FROM median_table +---- +NaN + +# approx_median_f64_nan +query R +SELECT approx_median(col_f64_nan) FROM median_table +---- +NaN + +# median decimal +statement ok +create table t(c decimal(10, 4)) as values (0.0001), (0.0002), (0.0003), (0.0004), (0.0005), (0.0006); + +query RT +select median(c), arrow_typeof(median(c)) from t; +---- +0.0003 Decimal128(10, 4) + +query RT +select approx_median(c), arrow_typeof(approx_median(c)) from t; +---- +0.00035 Float64 + +statement ok +drop table t; + +# median decimal with nulls +statement ok +create table t(c decimal(10, 4)) as values (0.0001), (null), (0.0003), (0.0004), (0.0005); + +query RT +select median(c), arrow_typeof(median(c)) from t; +---- +0.0003 Decimal128(10, 4) + +statement ok +drop table t; + +# median decimal with all nulls +statement ok +create table t(c decimal(10, 4)) as values (null), (null), (null); + +query RT +select median(c), arrow_typeof(median(c)) from t; +---- +NULL Decimal128(10, 4) + +statement ok +drop table t; + +# median odd +statement ok +create table t(c int) as values (1), (2), (3), (4), (5); + +query I +select median(c) from t; +---- +3 + +statement ok +drop table t; + +# median even +statement ok +create table t(c int) as values (1), (2), (3), (4), (5), (6); + +query I +select median(c) from t; +---- +3 + +statement ok +drop table t; + +# median with nulls +statement ok +create table t(c int) as values (1), (null), (3), (4), (5); + +query I +select median(c) from t; +---- +3 + +statement ok +drop table t; + +# median with all nulls +statement ok +create table t(c int) as values (null), (null), (null); + +query I +select median(c) from t; +---- +NULL + +statement ok +drop table t; + +# median u32 +statement ok +create table t(c int unsigned) as values (1), (2), (3), (4), (5); + +query I +select median(c) from t; +---- +3 + +statement ok +drop table t; + +# median f32 +statement ok +create table t(c float) as values (1.1), (2.2), (3.3), (4.4), (5.5); + +query R +select median(c) from t; +---- +3.3 + +statement ok +drop table t; + +# median distinct decimal +statement ok +create table t(c decimal(10, 4)) as values (0.0001), (0.0001), (0.0001), (0.0001), (0.0002), (0.0002), (0.0003), (0.0003); + +query R +select median(distinct c) from t; +---- +0.0002 + +statement ok +drop table t; + +# median distinct decimal with nulls +statement ok +create table t(c decimal(10, 4)) as values (0.0001), (0.0001), (0.0001), 
(null), (null), (0.0002), (0.0003), (0.0003); + +query R +select median(distinct c) from t; +---- +0.0002 + +statement ok +drop table t; + +# distinct median i32 odd +statement ok +create table t(c int) as values (2), (1), (1), (2), (1), (3); + +query I +select median(distinct c) from t; +---- +2 + +statement ok +drop table t; + +# distinct median i32 even +statement ok +create table t(c int) as values (1), (1), (3), (1), (1); + +query I +select median(distinct c) from t; +---- +2 + +statement ok +drop table t; + +# distinct median i32 with nulls +statement ok +create table t(c int) as values (1), (null), (1), (1), (3); + +query I +select median(distinct c) from t; +---- +2 + +statement ok +drop table t; + +# distinct median u32 odd +statement ok +create table t(c int unsigned) as values (1), (1), (2), (1), (3); + +query I +select median(distinct c) from t; +---- +2 + +statement ok +drop table t; + +# distinct median u32 even +statement ok +create table t(c int unsigned) as values (1), (1), (1), (1), (3), (3); + +query I +select median(distinct c) from t; +---- +2 + +statement ok +drop table t; + +# distinct median f32 odd +statement ok +create table t(c float) as values (3), (2), (1), (1), (1); + +query R +select median(distinct c) from t; +---- +2 + +statement ok +drop table t; + +# distinct median f32 even +statement ok +create table t(c float) as values (1), (1), (1), (1), (2); + +query R +select median(distinct c) from t; +---- +1.5 + +statement ok +drop table t; + +# distinct median f64 odd +statement ok +create table t(c double) as values (1), (1), (1), (2), (3); + +query R +select median(distinct c) from t; +---- +2 + +statement ok +drop table t; + +# distinct median f64 even +statement ok +create table t(c double) as values (1), (1), (1), (1), (2); + +query R +select median(distinct c) from t; +---- +1.5 + +statement ok +drop table t; + +# distinct median i32 +statement ok +create table t(c int) as values (1), (1), (1), (1), (2), (2), (3), (3); + +query I +select median(distinct c) from t; +---- +2 + +statement ok +drop table t; + +# optimize distinct median to group by +statement ok +create table t(c int) as values (1), (1), (1), (1), (2), (2), (3), (3); + +query TT +explain select median(distinct c) from t; +---- +logical_plan +01)Projection: median(alias1) AS median(DISTINCT t.c) +02)--Aggregate: groupBy=[[]], aggr=[[median(alias1)]] +03)----Aggregate: groupBy=[[t.c AS alias1]], aggr=[[]] +04)------TableScan: t projection=[c] +physical_plan +01)ProjectionExec: expr=[median(alias1)@0 as median(DISTINCT t.c)] +02)--AggregateExec: mode=Final, gby=[], aggr=[median(alias1)] +03)----CoalescePartitionsExec +04)------AggregateExec: mode=Partial, gby=[], aggr=[median(alias1)] +05)--------AggregateExec: mode=FinalPartitioned, gby=[alias1@0 as alias1], aggr=[] +06)----------CoalesceBatchesExec: target_batch_size=8192 +07)------------RepartitionExec: partitioning=Hash([alias1@0], 4), input_partitions=4 +08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +09)----------------AggregateExec: mode=Partial, gby=[c@0 as alias1], aggr=[] +10)------------------MemoryExec: partitions=1, partition_sizes=[1] + +statement ok +drop table t; + +# median_multi +# test case for https://github.com/apache/datafusion/issues/3105 +# has an intermediate grouping +statement ok +create table cpu (host string, usage float) as select * from (values +('host0', 90.1), +('host1', 90.2), +('host1', 90.4) +); + +query TR rowsort +select host, median(usage) from cpu group by host; +---- 
+host0 90.1
+host1 90.3
+
+statement ok
+drop table cpu;
+
+# this test is to show that CREATE TABLE AS and SELECT INTO work in the same way
+statement ok
+SELECT * INTO cpu
+FROM (VALUES
+  ('host0', 90.1),
+  ('host1', 90.2),
+  ('host1', 90.4)
+  ) AS cpu (host, usage);
+
+query TR rowsort
+select host, median(usage) from cpu group by host;
+----
+host0 90.1
+host1 90.3
+
+query R
+select median(usage) from cpu;
+----
+90.2
+
+statement ok
+drop table cpu;
+
+# median_multi_odd
+
+# data is not sorted and has an odd number of values per group
+statement ok
+create table cpu (host string, usage float) as select * from (values
+ ('host0', 90.2),
+ ('host1', 90.1),
+ ('host1', 90.5),
+ ('host0', 90.5),
+ ('host1', 90.0),
+ ('host1', 90.3),
+ ('host0', 87.9),
+ ('host1', 89.3)
+);
+
+query TR rowsort
+select host, median(usage) from cpu group by host;
+----
+host0 90.2
+host1 90.1
+
+
+statement ok
+drop table cpu;
+
+# median_multi_even
+# data is not sorted and has an even number of values per group
+statement ok
+create table cpu (host string, usage float) as select * from (values ('host0', 90.2), ('host1', 90.1), ('host1', 90.5), ('host0', 90.5), ('host1', 90.0), ('host1', 90.3), ('host1', 90.2), ('host1', 90.3));
+
+query TR rowsort
+select host, median(usage) from cpu group by host;
+----
+host0 90.35
+host1 90.25
+
+statement ok
+drop table cpu
+
+# csv_query_external_table_count
+query I
+SELECT COUNT(c12) FROM aggregate_test_100
+----
+100
+
+# csv_query_external_table_sum
+query II
+SELECT SUM(CAST(c7 AS BIGINT)), SUM(CAST(c8 AS BIGINT)) FROM aggregate_test_100
+----
+13060 3017641
+
+# csv_query_count
+query I
+SELECT count(c12) FROM aggregate_test_100
+----
+100
+
+# csv_query_count_distinct
+query I
+SELECT count(distinct c2) FROM aggregate_test_100
+----
+5
+
+# csv_query_count_distinct_expr
+query I
+SELECT count(distinct c2 % 2) FROM aggregate_test_100
+----
+2
+
+# csv_query_count_star
+query I
+SELECT COUNT(*) FROM aggregate_test_100
+----
+100
+
+query I
+SELECT COUNT(aggregate_test_100.*) FROM aggregate_test_100
+----
+100
+
+query error Error during planning: Invalid qualifier foo
+SELECT COUNT(foo.*) FROM aggregate_test_100
+
+# csv_query_count_literal
+query I
+SELECT COUNT(2) FROM aggregate_test_100
+----
+100
+
+# csv_query_approx_count
+# FIX: https://github.com/apache/datafusion/issues/3353
+# query II
+# SELECT approx_distinct(c9) AS count_c9, approx_distinct(cast(c9 as varchar)) count_c9_str FROM aggregate_test_100
+# ----
+# 100 99
+
+# csv_query_approx_count_dupe_expr_aliased
+query II
+SELECT approx_distinct(c9) AS a, approx_distinct(c9) AS b FROM aggregate_test_100
+----
+100 100
+
+## This test executes the APPROX_PERCENTILE_CONT aggregation against the test
+## data, asserting the estimated quantiles are within ±5% of their actual values.
+##
+## Actual quantiles calculated with:
+##
+## ```r
+## read_csv("./testing/data/csv/aggregate_test_100.csv") |>
+##   select_if(is.numeric) |>
+##   summarise_all(~ quantile(., c(0.1, 0.5, 0.9)))
+## ```
+##
+## Giving:
+##
+## ```text
+##   c2    c3      c4           c5        c6       c7     c8      c9          c10      c11    c12
+##
+## 1  1  -95.3  -22925. -1882606710  -7.25e18   18.9   2671.   472608672.  1.83e18  0.109  0.0714
+## 2  3   15.5    4599    377164262   1.13e18  134.   30634   2365817608.  9.30e18  0.491  0.551
+## 3  5  102.    25334.  1991374996.  7.37e18  231    57518.  3776538487.  1.61e19  0.834  0.946
+## ```
+##
+## Column `c12` is omitted due to its large relative error (~10%), caused by the small
+## float values.
+ +#csv_query_approx_percentile_cont (c2) +query B +SELECT (ABS(1 - CAST(approx_percentile_cont(c2, 0.1) AS DOUBLE) / 1.0) < 0.05) AS q FROM aggregate_test_100 +---- +true + +query B +SELECT (ABS(1 - CAST(approx_percentile_cont(c2, 0.5) AS DOUBLE) / 3.0) < 0.05) AS q FROM aggregate_test_100 +---- +true + +query B +SELECT (ABS(1 - CAST(approx_percentile_cont(c2, 0.9) AS DOUBLE) / 5.0) < 0.05) AS q FROM aggregate_test_100 +---- +true + +# csv_query_approx_percentile_cont (c3) +query B +SELECT (ABS(1 - CAST(approx_percentile_cont(c3, 0.1) AS DOUBLE) / -95.3) < 0.05) AS q FROM aggregate_test_100 +---- +true + +query B +SELECT (ABS(1 - CAST(approx_percentile_cont(c3, 0.5) AS DOUBLE) / 15.5) < 0.05) AS q FROM aggregate_test_100 +---- +true + +query B +SELECT (ABS(1 - CAST(approx_percentile_cont(c3, 0.9) AS DOUBLE) / 102.0) < 0.05) AS q FROM aggregate_test_100 +---- +true + +# csv_query_approx_percentile_cont (c4) +query B +SELECT (ABS(1 - CAST(approx_percentile_cont(c4, 0.1) AS DOUBLE) / -22925.0) < 0.05) AS q FROM aggregate_test_100 +---- +true + +query B +SELECT (ABS(1 - CAST(approx_percentile_cont(c4, 0.5) AS DOUBLE) / 4599.0) < 0.05) AS q FROM aggregate_test_100 +---- +true + +query B +SELECT (ABS(1 - CAST(approx_percentile_cont(c4, 0.9) AS DOUBLE) / 25334.0) < 0.05) AS q FROM aggregate_test_100 +---- +true + +# csv_query_approx_percentile_cont (c5) +query B +SELECT (ABS(1 - CAST(approx_percentile_cont(c5, 0.1) AS DOUBLE) / -1882606710.0) < 0.05) AS q FROM aggregate_test_100 +---- +true + +query B +SELECT (ABS(1 - CAST(approx_percentile_cont(c5, 0.5) AS DOUBLE) / 377164262.0) < 0.05) AS q FROM aggregate_test_100 +---- +true + +query B +SELECT (ABS(1 - CAST(approx_percentile_cont(c5, 0.9) AS DOUBLE) / 1991374996.0) < 0.05) AS q FROM aggregate_test_100 +---- +true + +# csv_query_approx_percentile_cont (c6) +query B +SELECT (ABS(1 - CAST(approx_percentile_cont(c6, 0.1) AS DOUBLE) / -7250000000000000000) < 0.05) AS q FROM aggregate_test_100 +---- +true + +query B +SELECT (ABS(1 - CAST(approx_percentile_cont(c6, 0.5) AS DOUBLE) / 1130000000000000000) < 0.05) AS q FROM aggregate_test_100 +---- +true + +query B +SELECT (ABS(1 - CAST(approx_percentile_cont(c6, 0.9) AS DOUBLE) / 7370000000000000000) < 0.05) AS q FROM aggregate_test_100 +---- +true + +# csv_query_approx_percentile_cont (c7) +query B +SELECT (ABS(1 - CAST(approx_percentile_cont(c7, 0.1) AS DOUBLE) / 18.9) < 0.05) AS q FROM aggregate_test_100 +---- +true + +query B +SELECT (ABS(1 - CAST(approx_percentile_cont(c7, 0.5) AS DOUBLE) / 134.0) < 0.05) AS q FROM aggregate_test_100 +---- +true + +query B +SELECT (ABS(1 - CAST(approx_percentile_cont(c7, 0.9) AS DOUBLE) / 231.0) < 0.05) AS q FROM aggregate_test_100 +---- +true + +# csv_query_approx_percentile_cont (c8) +query B +SELECT (ABS(1 - CAST(approx_percentile_cont(c8, 0.1) AS DOUBLE) / 2671.0) < 0.05) AS q FROM aggregate_test_100 +---- +true + +query B +SELECT (ABS(1 - CAST(approx_percentile_cont(c8, 0.5) AS DOUBLE) / 30634.0) < 0.05) AS q FROM aggregate_test_100 +---- +true + +query B +SELECT (ABS(1 - CAST(approx_percentile_cont(c8, 0.9) AS DOUBLE) / 57518.0) < 0.05) AS q FROM aggregate_test_100 +---- +true + +# csv_query_approx_percentile_cont (c9) +query B +SELECT (ABS(1 - CAST(approx_percentile_cont(c9, 0.1) AS DOUBLE) / 472608672.0) < 0.05) AS q FROM aggregate_test_100 +---- +true + +query B +SELECT (ABS(1 - CAST(approx_percentile_cont(c9, 0.5) AS DOUBLE) / 2365817608.0) < 0.05) AS q FROM aggregate_test_100 +---- +true + +query B +SELECT (ABS(1 - CAST(approx_percentile_cont(c9, 0.9) 
AS DOUBLE) / 3776538487.0) < 0.05) AS q FROM aggregate_test_100 +---- +true + +# csv_query_approx_percentile_cont (c10) +query B +SELECT (ABS(1 - CAST(approx_percentile_cont(c10, 0.1) AS DOUBLE) / 1830000000000000000) < 0.05) AS q FROM aggregate_test_100 +---- +true + +query B +SELECT (ABS(1 - CAST(approx_percentile_cont(c10, 0.5) AS DOUBLE) / 9300000000000000000) < 0.05) AS q FROM aggregate_test_100 +---- +true + +query B +SELECT (ABS(1 - CAST(approx_percentile_cont(c10, 0.9) AS DOUBLE) / 16100000000000000000) < 0.05) AS q FROM aggregate_test_100 +---- +true + +# csv_query_approx_percentile_cont (c11) +query B +SELECT (ABS(1 - CAST(approx_percentile_cont(c11, 0.1) AS DOUBLE) / 0.109) < 0.05) AS q FROM aggregate_test_100 +---- +true + +query B +SELECT (ABS(1 - CAST(approx_percentile_cont(c11, 0.5) AS DOUBLE) / 0.491) < 0.05) AS q FROM aggregate_test_100 +---- +true + +query B +SELECT (ABS(1 - CAST(approx_percentile_cont(c11, 0.9) AS DOUBLE) / 0.834) < 0.05) AS q FROM aggregate_test_100 +---- +true + +# percentile_cont_with_nulls +query I +SELECT APPROX_PERCENTILE_CONT(v, 0.5) FROM (VALUES (1), (2), (3), (NULL), (NULL), (NULL)) as t (v); +---- +2 + +# percentile_cont_with_nulls_only +query I +SELECT APPROX_PERCENTILE_CONT(v, 0.5) FROM (VALUES (CAST(NULL as INT))) as t (v); +---- +NULL + +# +# percentile_cont edge cases +# + +statement ok +CREATE TABLE tmp_percentile_cont(v1 INT, v2 DOUBLE); + +statement ok +INSERT INTO tmp_percentile_cont VALUES (1, 'NaN'::Double), (2, 'NaN'::Double), (3, 'NaN'::Double); + +# ISSUE: https://github.com/apache/datafusion/issues/11871 +# Note `approx_median()` is using the same implementation as `approx_percentile_cont()` +query R +select APPROX_MEDIAN(v2) from tmp_percentile_cont WHERE v1 = 1; +---- +NaN + +# ISSUE: https://github.com/apache/datafusion/issues/11870 +query R +select APPROX_PERCENTILE_CONT(v2, 0.8) from tmp_percentile_cont; +---- +NaN + +# ISSUE: https://github.com/apache/datafusion/issues/11869 +# Note: `approx_percentile_cont_with_weight()` uses the same implementation as `approx_percentile_cont()` +query R +SELECT APPROX_PERCENTILE_CONT_WITH_WEIGHT( + v2, + '+Inf'::Double, + 0.9 +) +FROM tmp_percentile_cont; +---- +NaN + +statement ok +DROP TABLE tmp_percentile_cont; + +# Test for issue where approx_percentile_cont_with_weight + +statement ok +CREATE TABLE t1(v1 BOOL); + +statement ok +INSERT INTO t1 VALUES (TRUE); + +# ISSUE: https://github.com/apache/datafusion/issues/12716 +# This test verifies that approx_percentile_cont_with_weight does not panic when given 'NaN' and returns 'inf' +query R +SELECT approx_percentile_cont_with_weight('NaN'::DOUBLE, 0, 0) FROM t1 WHERE t1.v1; +---- +Infinity + +statement ok +DROP TABLE t1; + +# csv_query_cube_avg +query TIR +SELECT c1, c2, AVG(c3) FROM aggregate_test_100 GROUP BY CUBE (c1, c2) ORDER BY c1, c2 +---- +a 1 -17.6 +a 2 -15.333333333333 +a 3 -4.5 +a 4 -32 +a 5 -32 +a NULL -18.333333333333 +b 1 31.666666666667 +b 2 25.5 +b 3 -42 +b 4 -44.6 +b 5 -0.2 +b NULL -5.842105263158 +c 1 47.5 +c 2 -55.571428571429 +c 3 47.5 +c 4 -10.75 +c 5 12 +c NULL -1.333333333333 +d 1 -8.142857142857 +d 2 109.333333333333 +d 3 41.333333333333 +d 4 54 +d 5 -49.5 +d NULL 25.444444444444 +e 1 75.666666666667 +e 2 37.8 +e 3 48 +e 4 37.285714285714 +e 5 -11 +e NULL 40.333333333333 +NULL 1 16.681818181818 +NULL 2 8.363636363636 +NULL 3 20.789473684211 +NULL 4 1.260869565217 +NULL 5 -13.857142857143 +NULL NULL 7.81 + +# csv_query_rollup_avg +query TIIR +SELECT c1, c2, c3, AVG(c4) FROM aggregate_test_100 WHERE c1 IN ('a', 'b', 
NULL) GROUP BY ROLLUP (c1, c2, c3) ORDER BY c1, c2, c3 +---- +a 1 -85 -15154 +a 1 -56 8692 +a 1 -25 15295 +a 1 -5 12636 +a 1 83 -14704 +a 1 NULL 1353 +a 2 -48 -18025 +a 2 -43 13080 +a 2 45 15673 +a 2 NULL 3576 +a 3 -72 -11122 +a 3 -12 -9168 +a 3 13 22338.5 +a 3 14 28162 +a 3 17 -22796 +a 3 NULL 4958.833333333333 +a 4 -101 11640 +a 4 -54 -2376 +a 4 -38 20744 +a 4 65 -28462 +a 4 NULL 386.5 +a 5 -101 -12484 +a 5 -31 -12907 +a 5 36 -16974 +a 5 NULL -14121.666666666666 +a NULL NULL 306.047619047619 +b 1 12 7652 +b 1 29 -18218 +b 1 54 -18410 +b 1 NULL -9658.666666666666 +b 2 -60 -21739 +b 2 31 23127 +b 2 63 21456 +b 2 68 15874 +b 2 NULL 9679.5 +b 3 -101 -13217 +b 3 17 14457 +b 3 NULL 620 +b 4 -117 19316 +b 4 -111 -1967 +b 4 -59 25286 +b 4 17 -28070 +b 4 47 20690 +b 4 NULL 7051 +b 5 -82 22080 +b 5 -44 15788 +b 5 -5 24896 +b 5 62 16337 +b 5 68 21576 +b 5 NULL 20135.4 +b NULL NULL 7732.315789473684 +NULL NULL NULL 3833.525 + +# csv_query_groupingsets_avg +query TIIR +SELECT c1, c2, c3, AVG(c4) +FROM aggregate_test_100 +WHERE c1 IN ('a', 'b', NULL) +GROUP BY GROUPING SETS ((c1), (c1,c2), (c1,c2,c3)) +ORDER BY c1, c2, c3 +---- +a 1 -85 -15154 +a 1 -56 8692 +a 1 -25 15295 +a 1 -5 12636 +a 1 83 -14704 +a 1 NULL 1353 +a 2 -48 -18025 +a 2 -43 13080 +a 2 45 15673 +a 2 NULL 3576 +a 3 -72 -11122 +a 3 -12 -9168 +a 3 13 22338.5 +a 3 14 28162 +a 3 17 -22796 +a 3 NULL 4958.833333333333 +a 4 -101 11640 +a 4 -54 -2376 +a 4 -38 20744 +a 4 65 -28462 +a 4 NULL 386.5 +a 5 -101 -12484 +a 5 -31 -12907 +a 5 36 -16974 +a 5 NULL -14121.666666666666 +a NULL NULL 306.047619047619 +b 1 12 7652 +b 1 29 -18218 +b 1 54 -18410 +b 1 NULL -9658.666666666666 +b 2 -60 -21739 +b 2 31 23127 +b 2 63 21456 +b 2 68 15874 +b 2 NULL 9679.5 +b 3 -101 -13217 +b 3 17 14457 +b 3 NULL 620 +b 4 -117 19316 +b 4 -111 -1967 +b 4 -59 25286 +b 4 17 -28070 +b 4 47 20690 +b 4 NULL 7051 +b 5 -82 22080 +b 5 -44 15788 +b 5 -5 24896 +b 5 62 16337 +b 5 68 21576 +b 5 NULL 20135.4 +b NULL NULL 7732.315789473684 + +# csv_query_singlecol_with_rollup_avg +query TIIR +SELECT c1, c2, c3, AVG(c4) +FROM aggregate_test_100 +WHERE c1 IN ('a', 'b', NULL) +GROUP BY c1, ROLLUP (c2, c3) +ORDER BY c1, c2, c3 +---- +a 1 -85 -15154 +a 1 -56 8692 +a 1 -25 15295 +a 1 -5 12636 +a 1 83 -14704 +a 1 NULL 1353 +a 2 -48 -18025 +a 2 -43 13080 +a 2 45 15673 +a 2 NULL 3576 +a 3 -72 -11122 +a 3 -12 -9168 +a 3 13 22338.5 +a 3 14 28162 +a 3 17 -22796 +a 3 NULL 4958.833333333333 +a 4 -101 11640 +a 4 -54 -2376 +a 4 -38 20744 +a 4 65 -28462 +a 4 NULL 386.5 +a 5 -101 -12484 +a 5 -31 -12907 +a 5 36 -16974 +a 5 NULL -14121.666666666666 +a NULL NULL 306.047619047619 +b 1 12 7652 +b 1 29 -18218 +b 1 54 -18410 +b 1 NULL -9658.666666666666 +b 2 -60 -21739 +b 2 31 23127 +b 2 63 21456 +b 2 68 15874 +b 2 NULL 9679.5 +b 3 -101 -13217 +b 3 17 14457 +b 3 NULL 620 +b 4 -117 19316 +b 4 -111 -1967 +b 4 -59 25286 +b 4 17 -28070 +b 4 47 20690 +b 4 NULL 7051 +b 5 -82 22080 +b 5 -44 15788 +b 5 -5 24896 +b 5 62 16337 +b 5 68 21576 +b 5 NULL 20135.4 +b NULL NULL 7732.315789473684 + +# csv_query_approx_percentile_cont_with_weight +query TI +SELECT c1, approx_percentile_cont(c3, 0.95) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1 +---- +a 73 +b 68 +c 122 +d 124 +e 115 + +# csv_query_approx_percentile_cont_with_weight (2) +query TI +SELECT c1, approx_percentile_cont_with_weight(c3, 1, 0.95) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1 +---- +a 73 +b 68 +c 122 +d 124 +e 115 + +# csv_query_approx_percentile_cont_with_histogram_bins +query TI +SELECT c1, approx_percentile_cont(c3, 0.95, 200) AS 
c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1 +---- +a 73 +b 68 +c 122 +d 124 +e 115 + +query TI +SELECT c1, approx_percentile_cont_with_weight(c3, c2, 0.95) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1 +---- +a 74 +b 68 +c 123 +d 124 +e 115 + +# csv_query_sum_crossjoin +query TTI +SELECT a.c1, b.c1, SUM(a.c2) FROM aggregate_test_100 as a CROSS JOIN aggregate_test_100 as b GROUP BY a.c1, b.c1 ORDER BY a.c1, b.c1 +---- +a a 1260 +a b 1140 +a c 1260 +a d 1080 +a e 1260 +b a 1302 +b b 1178 +b c 1302 +b d 1116 +b e 1302 +c a 1176 +c b 1064 +c c 1176 +c d 1008 +c e 1176 +d a 924 +d b 836 +d c 924 +d d 792 +d e 924 +e a 1323 +e b 1197 +e c 1323 +e d 1134 +e e 1323 + +# csv_query_cube_sum_crossjoin +query TTI +SELECT a.c1, b.c1, SUM(a.c2) FROM aggregate_test_100 as a CROSS JOIN aggregate_test_100 as b GROUP BY CUBE (a.c1, b.c1) ORDER BY a.c1, b.c1 +---- +a a 1260 +a b 1140 +a c 1260 +a d 1080 +a e 1260 +a NULL 6000 +b a 1302 +b b 1178 +b c 1302 +b d 1116 +b e 1302 +b NULL 6200 +c a 1176 +c b 1064 +c c 1176 +c d 1008 +c e 1176 +c NULL 5600 +d a 924 +d b 836 +d c 924 +d d 792 +d e 924 +d NULL 4400 +e a 1323 +e b 1197 +e c 1323 +e d 1134 +e e 1323 +e NULL 6300 +NULL a 5985 +NULL b 5415 +NULL c 5985 +NULL d 5130 +NULL e 5985 +NULL NULL 28500 + +# csv_query_cube_distinct_count +query TII +SELECT c1, c2, COUNT(DISTINCT c3) FROM aggregate_test_100 GROUP BY CUBE (c1,c2) ORDER BY c1,c2 +---- +a 1 5 +a 2 3 +a 3 5 +a 4 4 +a 5 3 +a NULL 19 +b 1 3 +b 2 4 +b 3 2 +b 4 5 +b 5 5 +b NULL 17 +c 1 4 +c 2 7 +c 3 4 +c 4 4 +c 5 2 +c NULL 21 +d 1 7 +d 2 3 +d 3 3 +d 4 3 +d 5 2 +d NULL 18 +e 1 3 +e 2 4 +e 3 4 +e 4 7 +e 5 2 +e NULL 18 +NULL 1 22 +NULL 2 20 +NULL 3 17 +NULL 4 23 +NULL 5 14 +NULL NULL 80 + +# csv_query_rollup_distinct_count +query TII +SELECT c1, c2, COUNT(DISTINCT c3) FROM aggregate_test_100 GROUP BY ROLLUP (c1,c2) ORDER BY c1,c2 +---- +a 1 5 +a 2 3 +a 3 5 +a 4 4 +a 5 3 +a NULL 19 +b 1 3 +b 2 4 +b 3 2 +b 4 5 +b 5 5 +b NULL 17 +c 1 4 +c 2 7 +c 3 4 +c 4 4 +c 5 2 +c NULL 21 +d 1 7 +d 2 3 +d 3 3 +d 4 3 +d 5 2 +d NULL 18 +e 1 3 +e 2 4 +e 3 4 +e 4 7 +e 5 2 +e NULL 18 +NULL NULL 80 + +# csv_query_rollup_sum_crossjoin +query TTI +SELECT a.c1, b.c1, SUM(a.c2) FROM aggregate_test_100 as a CROSS JOIN aggregate_test_100 as b GROUP BY ROLLUP (a.c1, b.c1) ORDER BY a.c1, b.c1 +---- +a a 1260 +a b 1140 +a c 1260 +a d 1080 +a e 1260 +a NULL 6000 +b a 1302 +b b 1178 +b c 1302 +b d 1116 +b e 1302 +b NULL 6200 +c a 1176 +c b 1064 +c c 1176 +c d 1008 +c e 1176 +c NULL 5600 +d a 924 +d b 836 +d c 924 +d d 792 +d e 924 +d NULL 4400 +e a 1323 +e b 1197 +e c 1323 +e d 1134 +e e 1323 +e NULL 6300 +NULL NULL 28500 + +# query_count_without_from +query I +SELECT count(1 + 1) +---- +1 + +# csv_query_array_agg +query ? +SELECT array_agg(c13) FROM (SELECT * FROM aggregate_test_100 ORDER BY c13 LIMIT 2) test +---- +[0VVIHzxWtNOFLtnhjHEKjXaJOSLJfm, 0keZ5G8BffGwgF2RwQD59TFzMStxCB] + +# csv_query_array_agg_empty +query ? +SELECT array_agg(c13) FROM (SELECT * FROM aggregate_test_100 LIMIT 0) test +---- +NULL + +# csv_query_array_agg_one +query ? 
+SELECT array_agg(c13) FROM (SELECT * FROM aggregate_test_100 ORDER BY c13 LIMIT 1) test
+----
+[0VVIHzxWtNOFLtnhjHEKjXaJOSLJfm]
+
+# csv_query_array_agg_with_overflow
+query IIRIII
+select c2, sum(c3) sum_c3, avg(c3) avg_c3, max(c3) max_c3, min(c3) min_c3, count(c3) count_c3 from aggregate_test_100 group by c2 order by c2
+----
+1 367 16.681818181818 125 -99 22
+2 184 8.363636363636 122 -117 22
+3 395 20.789473684211 123 -101 19
+4 29 1.260869565217 123 -117 23
+5 -194 -13.857142857143 118 -101 14
+
+# csv_query_array_cube_agg_with_overflow
+query TIIRIII
+select c1, c2, sum(c3) sum_c3, avg(c3) avg_c3, max(c3) max_c3, min(c3) min_c3, count(c3) count_c3 from aggregate_test_100 group by CUBE (c1,c2) order by c1, c2
+----
+a 1 -88 -17.6 83 -85 5
+a 2 -46 -15.333333333333 45 -48 3
+a 3 -27 -4.5 17 -72 6
+a 4 -128 -32 65 -101 4
+a 5 -96 -32 36 -101 3
+a NULL -385 -18.333333333333 83 -101 21
+b 1 95 31.666666666667 54 12 3
+b 2 102 25.5 68 -60 4
+b 3 -84 -42 17 -101 2
+b 4 -223 -44.6 47 -117 5
+b 5 -1 -0.2 68 -82 5
+b NULL -111 -5.842105263158 68 -117 19
+c 1 190 47.5 103 -24 4
+c 2 -389 -55.571428571429 29 -117 7
+c 3 190 47.5 97 -2 4
+c 4 -43 -10.75 123 -90 4
+c 5 24 12 118 -94 2
+c NULL -28 -1.333333333333 123 -117 21
+d 1 -57 -8.142857142857 125 -99 7
+d 2 328 109.333333333333 122 93 3
+d 3 124 41.333333333333 123 -76 3
+d 4 162 54 102 5 3
+d 5 -99 -49.5 -40 -59 2
+d NULL 458 25.444444444444 125 -99 18
+e 1 227 75.666666666667 120 36 3
+e 2 189 37.8 97 -61 5
+e 3 192 48 112 -95 4
+e 4 261 37.285714285714 97 -56 7
+e 5 -22 -11 64 -86 2
+e NULL 847 40.333333333333 120 -95 21
+NULL 1 367 16.681818181818 125 -99 22
+NULL 2 184 8.363636363636 122 -117 22
+NULL 3 395 20.789473684211 123 -101 19
+NULL 4 29 1.260869565217 123 -117 23
+NULL 5 -194 -13.857142857143 118 -101 14
+NULL NULL 781 7.81 125 -117 100
+
+# select with count to force the array_agg_distinct function, since a single distinct expression is converted to a group by by the optimizer
+# csv_query_array_agg_distinct
+query ?I
+SELECT array_sort(array_agg(distinct c2)), count(1) FROM aggregate_test_100
+----
+[1, 2, 3, 4, 5] 100
+
+# aggregate_time_min_and_max
+query TT
+select min(t), max(t) from (select '00:00:00' as t union select '00:00:01' union select '00:00:02')
+----
+00:00:00 00:00:02
+
+# aggregate Interval(MonthDayNano) min/max
+query T??
+select
+  arrow_typeof(min(column1)), min(column1), max(column1)
+from values
+  (interval '1 month'),
+  (interval '2 months'),
+  (interval '2 month 15 days'),
+  (interval '-2 month')
+----
+Interval(MonthDayNano) -2 mons 2 mons 15 days
+
+# aggregate Interval(DayTime) min/max
+query T??
+select
+  arrow_typeof(min(column1)), min(column1), max(column1)
+from values
+  (arrow_cast('60 minutes', 'Interval(DayTime)')),
+  (arrow_cast('-3 minutes', 'Interval(DayTime)')),
+  (arrow_cast('30 minutes', 'Interval(DayTime)'));
+----
+Interval(DayTime) -3 mins 1 hours
+
+# aggregate Interval(YearMonth) min/max
+query T??
+select + arrow_typeof(min(column1)), min(column1), max(column1) +from values + (arrow_cast('-1 year', 'Interval(YearMonth)')), + (arrow_cast('13 months', 'Interval(YearMonth)')), + (arrow_cast('1 year', 'Interval(YearMonth)')); +---- +Interval(YearMonth) -1 years 0 mons 1 years 1 mons + +# aggregate +query II +SELECT SUM(c1), SUM(c2) FROM test +---- +7 6 + +# aggregate_empty + +query II +SELECT SUM(c1), SUM(c2) FROM test where c1 > 100000 +---- +NULL NULL + +# aggregate_avg +query RR +SELECT AVG(c1), AVG(c2) FROM test +---- +1.75 1.5 + +# aggregate_max +query II +SELECT MAX(c1), MAX(c2) FROM test +---- +3 2 + +# aggregate_min +query II +SELECT MIN(c1), MIN(c2) FROM test +---- +0 1 + +query error min/max was called with 2 arguments. It requires only 1. +SELECT MIN(c1, c2) FROM test + +query error min/max was called with 2 arguments. It requires only 1. +SELECT MAX(c1, c2) FROM test + +# aggregate_grouped +query II +SELECT c1, SUM(c2) FROM test GROUP BY c1 order by c1 +---- +0 NULL +1 1 +3 4 +NULL 1 + +# aggregate_grouped_avg +query IR +SELECT c1, AVG(c2) FROM test GROUP BY c1 order by c1 +---- +0 NULL +1 1 +3 2 +NULL 1 + +# aggregate_grouped_empty +query IR +SELECT c1, AVG(c2) FROM test WHERE c1 = 123 GROUP BY c1 +---- + +# aggregate_grouped_max +query II +SELECT c1, MAX(c2) FROM test GROUP BY c1 order by c1 +---- +0 NULL +1 1 +3 2 +NULL 1 + +# aggregate_grouped_min +query II +SELECT c1, MIN(c2) FROM test GROUP BY c1 order by c1 +---- +0 NULL +1 1 +3 2 +NULL 1 + +# aggregate_min_max_w_custom_window_frames +query RR +SELECT +MIN(c12) OVER (ORDER BY C12 RANGE BETWEEN 0.3 PRECEDING AND 0.2 FOLLOWING) as min1, +MAX(c12) OVER (ORDER BY C11 RANGE BETWEEN 0.1 PRECEDING AND 0.2 FOLLOWING) as max1 +FROM aggregate_test_100 +ORDER BY C9 +LIMIT 5 +---- +0.014793053078 0.996540038759 +0.014793053078 0.980019341044 +0.014793053078 0.970671228336 +0.266717779508 0.996540038759 +0.360076636233 0.970671228336 + +# aggregate_min_max_with_custom_window_frames_unbounded_start +query RR +SELECT +MIN(c12) OVER (ORDER BY C12 RANGE BETWEEN UNBOUNDED PRECEDING AND 0.2 FOLLOWING) as min1, +MAX(c12) OVER (ORDER BY C11 RANGE BETWEEN UNBOUNDED PRECEDING AND 0.2 FOLLOWING) as max1 +FROM aggregate_test_100 +ORDER BY C9 +LIMIT 5 +---- +0.014793053078 0.996540038759 +0.014793053078 0.980019341044 +0.014793053078 0.980019341044 +0.014793053078 0.996540038759 +0.014793053078 0.980019341044 + +# aggregate_avg_add +query RRRR +SELECT AVG(c1), AVG(c1) + 1, AVG(c1) + 2, 1 + AVG(c1) FROM test +---- +1.75 2.75 3.75 2.75 + +# case_sensitive_identifiers_aggregates +query I +SELECT max(c1) FROM test; +---- +3 + + + +# count_basic +statement ok +create table t (c int) as values (1), (2), (null), (3), (null), (4), (5); + +query IT +select count(c), arrow_typeof(count(c)) from t; +---- +5 Int64 + +statement ok +drop table t; + +# test count with all nulls +statement ok +create table t (c int) as values (null), (null), (null), (null), (null); + +query IT +select count(c), arrow_typeof(count(c)) from t; +---- +0 Int64 + +statement ok +drop table t; + +# test with empty +statement ok +create table t (c int); + +query IT +select count(c), arrow_typeof(count(c)) from t; +---- +0 Int64 + +statement ok +drop table t; + +# test count with string +statement ok +create table t (c string) as values ('a'), ('b'), (null), ('c'), (null), ('d'), ('e'); + +query IT +select count(c), arrow_typeof(count(c)) from t; +---- +5 Int64 + +statement ok +drop table t; + +# test count with largeutf8 +statement ok +create table t (c string) as values + 
(arrow_cast('a', 'LargeUtf8')),
+  (arrow_cast('b', 'LargeUtf8')),
+  (arrow_cast(null, 'LargeUtf8')),
+  (arrow_cast('c', 'LargeUtf8'))
+;
+
+query T
+select arrow_typeof(c) from t;
+----
+Utf8
+Utf8
+Utf8
+Utf8
+
+query IT
+select count(c), arrow_typeof(count(c)) from t;
+----
+3 Int64
+
+statement ok
+drop table t;
+
+# test count with multiple columns
+statement ok
+create table t (c1 int, c2 int) as values (1, 1), (2, null), (null, 2), (null, null), (3, 3), (null, 4);
+
+query IT
+select count(c1, c2), arrow_typeof(count(c1, c2)) from t;
+----
+2 Int64
+
+statement ok
+drop table t;
+
+
+query II
+SELECT COUNT(c1), COUNT(c2) FROM test
+----
+4 4
+
+
+statement ok
+CREATE EXTERNAL TABLE partitioned_test (c1 int, c2 bigint, c3 boolean)
+STORED AS CSV LOCATION '../core/tests/data/partitioned_csv'
+OPTIONS('format.has_header' 'false');
+
+# count partitioned
+query II
+SELECT COUNT(c1), COUNT(c2) FROM partitioned_test
+----
+44 44
+
+# count partitioned with multiple columns
+query I
+SELECT COUNT(c1,c2) FROM partitioned_test
+----
+44
+
+statement ok
+DROP TABLE partitioned_test;
+
+
+# count aggregated
+query II
+SELECT c1, count(c2) FROM test WHERE c1 IS NOT NULL group by c1 order by c1
+----
+0 0
+1 1
+3 2
+
+statement ok
+create table table_agg_cube (c1 int, c2 int, c3 int) as values (1, 1, 1), (1, 2, 2), (1, 3, 3), (2, 1, 1), (2, 2, 2), (2, 3, 3), (3, 1, 1), (3, 2, 2), (3, 3, 3);
+
+# count aggregated cube
+query III
+SELECT c1, c2, count(c3) FROM table_agg_cube GROUP BY CUBE (c1, c2) ORDER BY c1, c2
+----
+1 1 1
+1 2 1
+1 3 1
+1 NULL 3
+2 1 1
+2 2 1
+2 3 1
+2 NULL 3
+3 1 1
+3 2 1
+3 3 1
+3 NULL 3
+NULL 1 3
+NULL 2 3
+NULL 3 3
+NULL NULL 9
+
+statement ok
+drop table table_agg_cube;
+
+# count_multi_expr
+query I
+SELECT count(c1, c2) FROM test
+----
+3
+
+# count(distinct) with multiple arguments
+query error DataFusion error: This feature is not implemented: COUNT DISTINCT with multiple arguments
+SELECT count(distinct c1, c2) FROM test
+
+# count_null
+query III
+SELECT count(null), count(null, null), count(distinct null) FROM test
+----
+0 0 0
+
+# count_multi_expr_group_by
+query I
+SELECT count(c1, c2) FROM test group by c1 order by c1
+----
+0
+1
+2
+0
+
+# count_null_group_by
+query III
+SELECT count(null), count(null, null), count(distinct null) FROM test group by c1 order by c1
+----
+0 0 0
+0 0 0
+0 0 0
+0 0 0
+
+# aggregate_with_alias
+query II
+select c1, sum(c2) as `Total Salary` from test group by c1 order by c1
+----
+0 NULL
+1 1
+3 4
+NULL 1
+
+# simple_avg
+
+query R
+select avg(c1) from test
+----
+1.75
+
+# avg_decimal
+statement ok
+create table t (c1 decimal(10, 0)) as values (1), (2), (3), (4), (5), (6);
+
+query RT
+select avg(c1), arrow_typeof(avg(c1)) from t;
+----
+3.5 Decimal128(14, 4)
+
+statement ok
+drop table t;
+
+# avg_decimal_with_nulls
+statement ok
+create table t (c1 decimal(10, 0)) as values (1), (NULL), (3), (4), (5);
+
+query RT
+select avg(c1), arrow_typeof(avg(c1)) from t;
+----
+3.25 Decimal128(14, 4)
+
+statement ok
+drop table t;
+
+# avg_decimal_all_nulls
+statement ok
+create table t (c1 decimal(10, 0)) as values (NULL), (NULL), (NULL), (NULL), (NULL), (NULL);
+
+query RT
+select avg(c1), arrow_typeof(avg(c1)) from t;
+----
+NULL Decimal128(14, 4)
+
+statement ok
+drop table t;
+
+# avg_i32
+statement ok
+create table t (c1 int) as values (1), (2), (3), (4), (5);
+
+query RT
+select avg(c1), arrow_typeof(avg(c1)) from t;
+----
+3 Float64
+
+statement ok
+drop table t;
+
+# avg_i32_with_nulls
+statement ok
+create table t (c1 int) as
values (1), (NULL), (3), (4), (5); + +query RT +select avg(c1), arrow_typeof(avg(c1)) from t; +---- +3.25 Float64 + +statement ok +drop table t; + +# avg_i32_all_nulls +statement ok +create table t (c1 int) as values (NULL), (NULL); + +query RT +select avg(c1), arrow_typeof(avg(c1)) from t; +---- +NULL Float64 + +statement ok +drop table t; + +# avg_u32 +statement ok +create table t (c1 int unsigned) as values (1), (2), (3), (4), (5); + +query RT +select avg(c1), arrow_typeof(avg(c1)) from t; +---- +3 Float64 + +statement ok +drop table t; + +# avg_f32 +statement ok +create table t (c1 float) as values (1), (2), (3), (4), (5); + +query RT +select avg(c1), arrow_typeof(avg(c1)) from t; +---- +3 Float64 + +statement ok +drop table t; + +# avg_f64 +statement ok +create table t (c1 double) as values (1), (2), (3), (4), (5); + +query RT +select avg(c1), arrow_typeof(avg(c1)) from t; +---- +3 Float64 + +statement ok +drop table t; + +# covariance_f64_1 +statement ok +create table t (c1 double, c2 double) as values (1, 4), (2, 5), (3, 6); + +query RT +select covar_pop(c1, c2), arrow_typeof(covar_pop(c1, c2)) from t; +---- +0.666666666667 Float64 + +statement ok +drop table t; + +# covariance_f64_2 +statement ok +create table t (c1 double, c2 double) as values (1, 4), (2, 5), (3, 6); + +query RT +select covar_samp(c1, c2), arrow_typeof(covar_samp(c1, c2)) from t; +---- +1 Float64 + +statement ok +drop table t; + +# covariance_f64_4 +statement ok +create table t (c1 double, c2 double) as values (1.1, 4.1), (2.0, 5.0), (3.0, 6.0); + +query RT +select covar_samp(c1, c2), arrow_typeof(covar_samp(c1, c2)) from t; +---- +0.903333333333 Float64 + +statement ok +drop table t; + +# covariance_f64_5 +statement ok +create table t (c1 double, c2 double) as values (1.1, 4.1), (2.0, 5.0), (3.0, 6.0); + +query RT +select covar_pop(c1, c2), arrow_typeof(covar_pop(c1, c2)) from t; +---- +0.602222222222 Float64 + +statement ok +drop table t; + +# covariance_f64_6 +statement ok +create table t (c1 double, c2 double) as values (1.0, 4.0), (2.0, 5.0), (3.0, 6.0), (1.1, 4.4), (2.2, 5.5), (3.3, 6.6); + +query RT +select covar_pop(c1, c2), arrow_typeof(covar_pop(c1, c2)) from t; +---- +0.761666666667 Float64 + +statement ok +drop table t; + +# covariance_i32 +statement ok +create table t (c1 int, c2 int) as values (1, 4), (2, 5), (3, 6); + +query RT +select covar_pop(c1, c2), arrow_typeof(covar_pop(c1, c2)) from t; +---- +0.666666666667 Float64 + +statement ok +drop table t; + +# covariance_u32 +statement ok +create table t (c1 int unsigned, c2 int unsigned) as values (1, 4), (2, 5), (3, 6); + +query RT +select covar_pop(c1, c2), arrow_typeof(covar_pop(c1, c2)) from t; +---- +0.666666666667 Float64 + +statement ok +drop table t; + +# covariance_f32 +statement ok +create table t (c1 float, c2 float) as values (1, 4), (2, 5), (3, 6); + +query RT +select covar_pop(c1, c2), arrow_typeof(covar_pop(c1, c2)) from t; +---- +0.666666666667 Float64 + +statement ok +drop table t; + +# covariance_i32_with_nulls_1 +statement ok +create table t (c1 int, c2 int) as values (1, 4), (null, null), (3, 6); + +query RT +select covar_pop(c1, c2), arrow_typeof(covar_pop(c1, c2)) from t; +---- +1 Float64 + +statement ok +drop table t; + +# covariance_i32_with_nulls_2 +statement ok +create table t (c1 int, c2 int) as values (1, 4), (null, 9), (2, 5), (null, 8), (3, 6), (null, null); + +query RT +select covar_pop(c1, c2), arrow_typeof(covar_pop(c1, c2)) from t; +---- +0.666666666667 Float64 + +statement ok +drop table t; + +# 
covariance_i32_with_nulls_3 +statement ok +create table t (c1 int, c2 int) as values (1, 4), (null, 9), (2, 5), (null, 8), (3, 6), (null, null); + +query RT +select covar_samp(c1, c2), arrow_typeof(covar_samp(c1, c2)) from t; +---- +1 Float64 + +statement ok +drop table t; + +# covariance_i32_all_nulls +statement ok +create table t (c1 int, c2 int) as values (null, null), (null, null); + +query RT +select covar_samp(c1, c2), arrow_typeof(covar_samp(c1, c2)) from t; +---- +NULL Float64 + +statement ok +drop table t; + +# covariance_pop_i32_all_nulls +statement ok +create table t (c1 int, c2 int) as values (null, null), (null, null); + +query RT +select covar_pop(c1, c2), arrow_typeof(covar_pop(c1, c2)) from t; +---- +NULL Float64 + +statement ok +drop table t; + +# covariance_1_input +statement ok +create table t (c1 double, c2 double) as values (1, 2); + +query RT +select covar_samp(c1, c2), arrow_typeof(covar_samp(c1, c2)) from t; +---- +NULL Float64 + +statement ok +drop table t; + +# covariance_pop_1_input +statement ok +create table t (c1 double, c2 double) as values (1, 2); + +query RT +select covar_pop(c1, c2), arrow_typeof(covar_pop(c1, c2)) from t; +---- +0 Float64 + +statement ok +drop table t; + +# variance_f64_1 +statement ok +create table t (c double) as values (1), (2), (3), (4), (5); + +query RT +select var(c), arrow_typeof(var(c)) from t; +---- +2.5 Float64 + +statement ok +drop table t; + +# aggregate stddev f64_1 +statement ok +create table t (c1 double) as values (1), (2); + +query RT +select stddev_pop(c1), arrow_typeof(stddev_pop(c1)) from t; +---- +0.5 Float64 + +statement ok +drop table t; + +# aggregate stddev f64_2 +statement ok +create table t (c1 double) as values (1.1), (2), (3); + +query RT +select stddev_pop(c1), arrow_typeof(stddev_pop(c1)) from t; +---- +0.776029781788 Float64 + +statement ok +drop table t; + +# aggregate stddev f64_3 +statement ok +create table t (c1 double) as values (1), (2), (3), (4), (5); + +query RT +select stddev_pop(c1), arrow_typeof(stddev_pop(c1)) from t; +---- +1.414213562373 Float64 + +statement ok +drop table t; + +# aggregate stddev f64_4 +statement ok +create table t (c1 double) as values (1.1), (2), (3); + +query RT +select stddev(c1), arrow_typeof(stddev(c1)) from t; +---- +0.950438495292 Float64 + +statement ok +drop table t; + +# aggregate stddev i32 +statement ok +create table t (c1 int) as values (1), (2), (3), (4), (5); + +query RT +select stddev_pop(c1), arrow_typeof(stddev_pop(c1)) from t; +---- +1.414213562373 Float64 + +statement ok +drop table t; + +# aggregate stddev u32 +statement ok +create table t (c1 int unsigned) as values (1), (2), (3), (4), (5); + +query RT +select stddev_pop(c1), arrow_typeof(stddev_pop(c1)) from t; +---- +1.414213562373 Float64 + +statement ok +drop table t; + +# aggregate stddev f32 +statement ok +create table t (c1 float) as values (1), (2), (3), (4), (5); + +query RT +select stddev_pop(c1), arrow_typeof(stddev_pop(c1)) from t; +---- +1.414213562373 Float64 + +statement ok +drop table t; + +# aggregate stddev single_input +statement ok +create table t (c1 double) as values (1); + +query RT +select stddev_pop(c1), arrow_typeof(stddev_pop(c1)) from t; +---- +0 Float64 + +statement ok +drop table t; + +# aggregate stddev with_nulls +statement ok +create table t (c1 int) as values (1), (null), (3), (4), (5); + +query RT +select stddev_pop(c1), arrow_typeof(stddev_pop(c1)) from t; +---- +1.479019945775 Float64 + +statement ok +drop table t; + +# aggregate stddev all_nulls +statement ok 
+create table t (c1 int) as values (null), (null); + +query RT +select stddev_pop(c1), arrow_typeof(stddev_pop(c1)) from t; +---- +NULL Float64 + +statement ok +drop table t; + +# aggregate variance f64_1 +statement ok +create table t (c1 double) as values (1), (2); + +query RT +select var_pop(c1), arrow_typeof(var_pop(c1)) from t; +---- +0.25 Float64 + +statement ok +drop table t; + +# aggregate variance f64_2 +statement ok +create table t (c1 double) as values (1), (2), (3), (4), (5); + +query RT +select var_pop(c1), arrow_typeof(var_pop(c1)) from t; +---- +2 Float64 + +statement ok +drop table t; + +# aggregate variance f64_3 +statement ok +create table t (c1 double) as values (1), (2), (3), (4), (5); + +query RT +select var(c1), arrow_typeof(var(c1)) from t; +---- +2.5 Float64 + +statement ok +drop table t; + +# variance_f64_2 +statement ok +create table t (c double) as values (1.1), (2), (3); + +query RT +select var(c), arrow_typeof(var(c)) from t; +---- +0.903333333333 Float64 + +statement ok +drop table t; + +# aggregate variance f64_4 +statement ok +create table t (c1 double) as values (1.1), (2), (3); + +query RT +select var(c1), arrow_typeof(var(c1)) from t; +---- +0.903333333333 Float64 + +statement ok +drop table t; + +# variance_1_input +statement ok +create table t (a double not null) as values (1); + +query RT +select var(a), arrow_typeof(var(a)) from t; +---- +NULL Float64 + +statement ok +drop table t; + +# variance_i32_all_nulls +statement ok +create table t (a int) as values (null), (null); + +query RT +select var(a), arrow_typeof(var(a)) from t; +---- +NULL Float64 + +statement ok +drop table t; + +# aggregate variance i32 +statement ok +create table t (c1 int) as values (1), (2), (3), (4), (5); + +query RT +select var_pop(c1), arrow_typeof(var_pop(c1)) from t; +---- +2 Float64 + +statement ok +drop table t; + +# aggregate variance u32 +statement ok +create table t (c1 int unsigned) as values (1), (2), (3), (4), (5); + +query RT +select var_pop(c1), arrow_typeof(var_pop(c1)) from t; +---- +2 Float64 + +statement ok +drop table t; + +# aggregate variance f32 +statement ok +create table t (c1 float) as values (1), (2), (3), (4), (5); + +query RT +select var_pop(c1), arrow_typeof(var_pop(c1)) from t; +---- +2 Float64 + +statement ok +drop table t; + +# aggregate single input +statement ok +create table t (c1 double) as values (1); + +query RT +select var_pop(c1), arrow_typeof(var_pop(c1)) from t; +---- +0 Float64 + +statement ok +drop table t; + +# aggregate i32 with nulls +statement ok +create table t (c1 int) as values (1), (null), (3), (4), (5); + +query RT +select var_pop(c1), arrow_typeof(var_pop(c1)) from t; +---- +2.1875 Float64 + +statement ok +drop table t; + +# aggregate i32 all nulls +statement ok +create table t (c1 int) as values (null), (null); + +query RT +select var_pop(c1), arrow_typeof(var_pop(c1)) from t; +---- +NULL Float64 + +statement ok +drop table t; + +# simple_mean +query R +select mean(c1) from test +---- +1.75 + +# aggregate sum distinct, coerced result from i32 to i64 +statement ok +create table t (c int) as values (1), (2), (1), (3), (null), (null), (-3), (-3); + +query IT +select sum(distinct c), arrow_typeof(sum(distinct c)) from t; +---- +3 Int64 + +statement ok +drop table t; + +# aggregate sum distinct, coerced result from u32 to u64 +statement ok +create table t (c int unsigned) as values (1), (2), (1), (3), (null), (null), (3); + +query IT +select sum(distinct c), arrow_typeof(sum(distinct c)) from t; +---- +6 UInt64 + +statement ok 
+drop table t; + +# aggregate sum distinct, coerced result from f32 to f64 +statement ok +create table t (c float) as values (1.0), (2.2), (1.0), (3.3), (null), (null), (3.3), (-2.0); + +query RT +select sum(distinct c), arrow_typeof(sum(distinct c)) from t; +---- +4.5 Float64 + +statement ok +drop table t; + +# aggregate sum distinct with decimal +statement ok +create table t (c decimal(35, 0)) as values (1), (2), (1), (3), (null), (null), (3), (-2); + +query RT +select sum(distinct c), arrow_typeof(sum(distinct c)) from t; +---- +4 Decimal128(38, 0) + +statement ok +drop table t; + +# query_sum_distinct - 2 different aggregate functions: avg and sum(distinct) +query RI +SELECT AVG(c1), SUM(DISTINCT c2) FROM test +---- +1.75 3 + +# query_sum_distinct - 2 sum(distinct) functions +query II +SELECT SUM(DISTINCT c1), SUM(DISTINCT c2) FROM test +---- +4 3 + +# query_count_distinct +query I +SELECT COUNT(DISTINCT c1) FROM test +---- +3 + +# TODO: count_distinct_integers_aggregated_single_partition + +# TODO: count_distinct_integers_aggregated_multiple_partitions + +# TODO: aggregate_with_alias + +# test_approx_percentile_cont_decimal_support +query TI +SELECT c1, approx_percentile_cont(c2, cast(0.85 as decimal(10,2))) apc FROM aggregate_test_100 GROUP BY 1 ORDER BY 1 +---- +a 4 +b 5 +c 4 +d 4 +e 4 + +# array_agg_zero +query ? +SELECT ARRAY_AGG([]) +---- +[[]] + +# array_agg_one +query ? +SELECT ARRAY_AGG([1]) +---- +[[1]] + +# test array_agg with no row qualified +statement ok +create table t(a int, b float, c bigint) as values (1, 1.2, 2); + +# returns NULL, follows DuckDB's behaviour +query ? +select array_agg(a) from t where a > 2; +---- +NULL + +query ? +select array_agg(b) from t where b > 3.1; +---- +NULL + +query ? +select array_agg(c) from t where c > 3; +---- +NULL + +query ?I +select array_agg(c), count(1) from t where c > 3; +---- +NULL 0 + +# returns 0 rows if group by is applied, follows DuckDB's behaviour +query ? +select array_agg(a) from t where a > 3 group by a; +---- + +query ?I +select array_agg(a), count(1) from t where a > 3 group by a; +---- + +# returns NULL, follows DuckDB's behaviour +query ? +select array_agg(distinct a) from t where a > 3; +---- +NULL + +query ?I +select array_agg(distinct a), count(1) from t where a > 3; +---- +NULL 0 + +# returns 0 rows if group by is applied, follows DuckDB's behaviour +query ? +select array_agg(distinct a) from t where a > 3 group by a; +---- + +query ?I +select array_agg(distinct a), count(1) from t where a > 3 group by a; +---- + +# test order sensitive array agg +query ? +select array_agg(a order by a) from t where a > 3; +---- +NULL + +query ? +select array_agg(a order by a) from t where a > 3 group by a; +---- + +query ?I +select array_agg(a order by a), count(1) from t where a > 3 group by a; +---- + +statement ok +drop table t; + +# test with no values +statement ok +create table t(a int, b float, c bigint); + +query ? +select array_agg(a) from t; +---- +NULL + +query ? +select array_agg(b) from t; +---- +NULL + +query ? +select array_agg(c) from t; +---- +NULL + +query ?I +select array_agg(distinct a), count(1) from t; +---- +NULL 0 + +query ?I +select array_agg(distinct b), count(1) from t; +---- +NULL 0 + +query ?I +select array_agg(distinct c), count(1) from t; +---- +NULL 0 + +statement ok +drop table t; + + +# array_agg_i32 +statement ok +create table t (c1 int) as values (1), (2), (3), (4), (5); + +query ?
+select array_agg(c1) from t; +---- +[1, 2, 3, 4, 5] + +statement ok +drop table t; + +# array_agg_nested +statement ok +create table t as values (make_array([1, 2, 3], [4, 5])), (make_array([6], [7, 8])), (make_array([9])); + +query ? +select array_agg(column1) from t; +---- +[[[1, 2, 3], [4, 5]], [[6], [7, 8]], [[9]]] + +statement ok +drop table t; + +# variance_single_value +query RRRR +select var(sq.column1), var_pop(sq.column1), stddev(sq.column1), stddev_pop(sq.column1) from (values (1.0)) as sq; +---- +NULL 0 NULL 0 + +# variance_two_values +query RRRR +select var(sq.column1), var_pop(sq.column1), stddev(sq.column1), stddev_pop(sq.column1) from (values (1.0), (3.0)) as sq; +---- +2 1 1.414213562373 1 + + + +# aggregates on empty tables +statement ok +CREATE TABLE empty (column1 bigint, column2 int); + +# no group by column +query IIRIIIII +SELECT + count(column1), -- counts should be zero, even for nulls + sum(column1), -- other aggregates should be null + avg(column1), + min(column1), + max(column1), + bit_and(column1), + bit_or(column1), + bit_xor(column1) +FROM empty +---- +0 NULL NULL NULL NULL NULL NULL NULL + +# Same query but with grouping (no groups, so no output) +query IIRIIIIII +SELECT + count(column1), + sum(column1), + avg(column1), + min(column1), + max(column1), + bit_and(column1), + bit_or(column1), + bit_xor(column1), + column2 +FROM empty +GROUP BY column2 +ORDER BY column2; +---- + + +statement ok +drop table empty + +# aggregates on all nulls +statement ok +CREATE TABLE the_nulls +AS VALUES + (null::bigint, 1), + (null::bigint, 1), + (null::bigint, 2); + +query II +select * from the_nulls +---- +NULL 1 +NULL 1 +NULL 2 + +# no group by column +query IIRIIIII +SELECT + count(column1), -- counts should be zero, even for nulls + sum(column1), -- other aggregates should be null + avg(column1), + min(column1), + max(column1), + bit_and(column1), + bit_or(column1), + bit_xor(column1) +FROM the_nulls +---- +0 NULL NULL NULL NULL NULL NULL NULL + +# Same query but with grouping +query IIRIIIIII +SELECT + count(column1), -- counts should be zero, even for nulls + sum(column1), -- other aggregates should be null + avg(column1), + min(column1), + max(column1), + bit_and(column1), + bit_or(column1), + bit_xor(column1), + column2 +FROM the_nulls +GROUP BY column2 +ORDER BY column2; +---- +0 NULL NULL NULL NULL NULL NULL NULL 1 +0 NULL NULL NULL NULL NULL NULL NULL 2 + + +statement ok +drop table the_nulls; + +statement ok +create table bit_aggregate_functions ( + c1 SMALLINT NOT NULL, + c2 SMALLINT NOT NULL, + c3 SMALLINT, + tag varchar +) +as values + (5, 10, 11, 'A'), + (33, 11, null, 'B'), + (9, 12, null, 'A'); + +# query_bit_and, query_bit_or, query_bit_xor +query IIIIIIIII +SELECT + bit_and(c1), + bit_and(c2), + bit_and(c3), + bit_or(c1), + bit_or(c2), + bit_or(c3), + bit_xor(c1), + bit_xor(c2), + bit_xor(c3) +FROM bit_aggregate_functions +---- +1 8 11 45 15 11 45 13 11 + +# query_bit_and, query_bit_or, query_bit_xor, with group +query IIIIIIIIIT +SELECT + bit_and(c1), + bit_and(c2), + bit_and(c3), + bit_or(c1), + bit_or(c2), + bit_or(c3), + bit_xor(c1), + bit_xor(c2), + bit_xor(c3), + tag +FROM bit_aggregate_functions +GROUP BY tag +ORDER BY tag +---- +1 8 11 13 14 11 12 6 11 A +33 11 NULL 33 11 NULL 33 11 NULL B + + +# bit_and_i32 +statement ok +create table t (c int) as values (4), (7), (15); + +query IT +Select bit_and(c), arrow_typeof(bit_and(c)) from t; +---- +4 Int32 + +statement ok +drop table t; + +# bit_and_i32_with_nulls +statement ok +create table t (c int) as 
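-- (bit_and skips the NULL value: 1 & 3 & 5 = 1, as the query below verifies)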
values (1), (NULL), (3), (5); + +query IT +Select bit_and(c), arrow_typeof(bit_and(c)) from t; +---- +1 Int32 + +statement ok +drop table t; + +# bit_and_i32_all_nulls +statement ok +create table t (c int) as values (NULL), (NULL); + +query IT +Select bit_and(c), arrow_typeof(bit_and(c)) from t; +---- +NULL Int32 + +statement ok +drop table t; + +# bit_and_u32 +statement ok +create table t (c int unsigned) as values (4), (7), (15); + +query IT +Select bit_and(c), arrow_typeof(bit_and(c)) from t; +---- +4 UInt32 + +statement ok +drop table t; + +# bit_or_i32 +statement ok +create table t (c int) as values (4), (7), (15); + +query IT +Select bit_or(c), arrow_typeof(bit_or(c)) from t; +---- +15 Int32 + +statement ok +drop table t; + +# bit_or_i32_with_nulls +statement ok +create table t (c int) as values (1), (NULL), (3), (5); + +query IT +Select bit_or(c), arrow_typeof(bit_or(c)) from t; +---- +7 Int32 + +statement ok +drop table t; + +#bit_or_i32_all_nulls +statement ok +create table t (c int) as values (NULL), (NULL); + +query IT +Select bit_or(c), arrow_typeof(bit_or(c)) from t; +---- +NULL Int32 + +statement ok +drop table t; + + +#bit_or_u32 +statement ok +create table t (c int unsigned) as values (4), (7), (15); + +query IT +Select bit_or(c), arrow_typeof(bit_or(c)) from t; +---- +15 UInt32 + +statement ok +drop table t; + +#bit_xor_i32 +statement ok +create table t (c int) as values (4), (7), (4), (7), (15); + +query IT +Select bit_xor(c), arrow_typeof(bit_xor(c)) from t; +---- +15 Int32 + +statement ok +drop table t; + +# bit_xor_i32_with_nulls +statement ok +create table t (c int) as values (1), (1), (NULL), (3), (5); + +query IT +Select bit_xor(c), arrow_typeof(bit_xor(c)) from t; +---- +6 Int32 + +statement ok +drop table t; + +# bit_xor_i32_all_nulls +statement ok +create table t (c int) as values (NULL), (NULL); + +query IT +Select bit_xor(c), arrow_typeof(bit_xor(c)) from t; +---- +NULL Int32 + +statement ok +drop table t; + +# bit_xor_u32 +statement ok +create table t (c int unsigned) as values (4), (7), (4), (7), (15); + +query IT +Select bit_xor(c), arrow_typeof(bit_xor(c)) from t; +---- +15 UInt32 + +statement ok +drop table t; + +# bit_xor_distinct_i32 +statement ok +create table t (c int) as values (4), (7), (4), (7), (15); + +query IT +Select bit_xor(DISTINCT c), arrow_typeof(bit_xor(DISTINCT c)) from t; +---- +12 Int32 + +statement ok +drop table t; + +# bit_xor_distinct_i32_with_nulls +statement ok +create table t (c int) as values (1), (1), (NULL), (3), (5); + +query IT +Select bit_xor(DISTINCT c), arrow_typeof(bit_xor(DISTINCT c)) from t; +---- +7 Int32 + + +statement ok +drop table t; + +# bit_xor_distinct_i32_all_nulls +statement ok +create table t (c int ) as values (NULL), (NULL); + +query IT +Select bit_xor(DISTINCT c), arrow_typeof(bit_xor(DISTINCT c)) from t; +---- +NULL Int32 + + +statement ok +drop table t; + +# bit_xor_distinct_u32 +statement ok +create table t (c int unsigned) as values (4), (7), (4), (7), (15); + +query IT +Select bit_xor(DISTINCT c), arrow_typeof(bit_xor(DISTINCT c)) from t; +---- +12 UInt32 + +statement ok +drop table t; + +################# +# Min_Max Begin # +################# +# min_decimal, max_decimal +statement ok +CREATE TABLE decimals (value DECIMAL(10, 2)); + +statement ok +INSERT INTO decimals VALUES (123.0001), (124.00); + +query RR +SELECT MIN(value), MAX(value) FROM decimals; +---- +123 124 + +statement ok +DROP TABLE decimals; + +statement ok +CREATE TABLE decimals_batch (value DECIMAL(10, 0)); + +statement ok +INSERT INTO 
decimals_batch VALUES (1), (2), (3), (4), (5); + +query RR +SELECT MIN(value), MAX(value) FROM decimals_batch; +---- +1 5 + +statement ok +DROP TABLE decimals_batch; + +statement ok +CREATE TABLE decimals_empty (value DECIMAL(10, 0)); + +query RR +SELECT MIN(value), MAX(value) FROM decimals_empty; +---- +NULL NULL + +statement ok +DROP TABLE decimals_empty; + +# min_decimal_all_nulls, max_decimal_all_nulls +statement ok +CREATE TABLE decimals_all_nulls (value DECIMAL(10, 0)); + +statement ok +INSERT INTO decimals_all_nulls VALUES (NULL), (NULL), (NULL), (NULL), (NULL), (NULL); + +query RR +SELECT MIN(value), MAX(value) FROM decimals_all_nulls; +---- +NULL NULL + +statement ok +DROP TABLE decimals_all_nulls; + +# min_decimal_with_nulls, max_decimal_with_nulls +statement ok +CREATE TABLE decimals_with_nulls (value DECIMAL(10, 0)); + +statement ok +INSERT INTO decimals_with_nulls VALUES (1), (NULL), (3), (4), (5); + +query RR +SELECT MIN(value), MAX(value) FROM decimals_with_nulls; +---- +1 5 + +statement ok +DROP TABLE decimals_with_nulls; + +statement ok +CREATE TABLE decimals_error (value DECIMAL(10, 2)); + +statement ok +INSERT INTO decimals_error VALUES (123.00), (arrow_cast(124.001, 'Decimal128(10, 3)')); + +query RR +SELECT MIN(value), MAX(value) FROM decimals_error; +---- +123 124 + +statement ok +DROP TABLE decimals_error; + +statement ok +CREATE TABLE decimals_agg (value DECIMAL(10, 0)); + +statement ok +INSERT INTO decimals_agg VALUES (1), (2), (3), (4), (5); + +query RR +SELECT MIN(value), MAX(value) FROM decimals_agg; +---- +1 5 + +statement ok +DROP TABLE decimals_agg; + +# min_i32, max_i32 +statement ok +CREATE TABLE integers (value INT); + +statement ok +INSERT INTO integers VALUES (1), (2), (3), (4), (5); + +query II +SELECT MIN(value), MAX(value) FROM integers +---- +1 5 + +statement ok +DROP TABLE integers; + +# min_utf8, max_utf8 +statement ok +CREATE TABLE strings (value TEXT); + +statement ok +INSERT INTO strings VALUES ('d'), ('a'), ('c'), ('b'); + +query TT +SELECT MIN(value), MAX(value) FROM strings +---- +a d + +statement ok +DROP TABLE strings; + +# min_i32_with_nulls, max_i32_with_nulls +statement ok +CREATE TABLE integers_with_nulls (value INT); + +statement ok +INSERT INTO integers_with_nulls VALUES (1), (NULL), (3), (4), (5); + +query II +SELECT MIN(value), MAX(value) FROM integers_with_nulls +---- +1 5 + +# grouping_sets with null values +query II rowsort +SELECT value, min(value) FROM integers_with_nulls GROUP BY CUBE(value) +---- +1 1 +3 3 +4 4 +5 5 +NULL 1 +NULL NULL + + +statement ok +DROP TABLE integers_with_nulls; + +# min_i32_all_nulls, max_i32_all_nulls +statement ok +CREATE TABLE integers_all_nulls (value INT); + +query II +SELECT MIN(value), MAX(value) FROM integers_all_nulls +---- +NULL NULL + +statement ok +DROP TABLE integers_all_nulls; + +# min_u32, max_u32 +statement ok +CREATE TABLE uintegers (value INT UNSIGNED); + +statement ok +INSERT INTO uintegers VALUES (1), (2), (3), (4), (5); + +query II +SELECT MIN(value), MAX(value) FROM uintegers +---- +1 5 + +statement ok +DROP TABLE uintegers; + +# min_f32, max_f32 +statement ok +CREATE TABLE floats (value FLOAT); + +statement ok +INSERT INTO floats VALUES (1.0), (2.0), (3.0), (4.0), (5.0); + +query RR +SELECT MIN(value), MAX(value) FROM floats +---- +1 5 + +statement ok +DROP TABLE floats; + +# min_f64, max_f64 +statement ok +CREATE TABLE doubles (value DOUBLE); + +statement ok +INSERT INTO doubles VALUES (1.0), (2.0), (3.0), (4.0), (5.0); + +query RR +SELECT MIN(value), MAX(value) FROM doubles 
+---- +1 5 + +statement ok +DROP TABLE doubles; + +# min_date, max_date +statement ok +CREATE TABLE dates (value DATE); + +statement ok +INSERT INTO dates VALUES ('1970-01-02'), ('1970-01-03'), ('1970-01-04'), ('1970-01-05'), ('1970-01-06'); + +query DD +SELECT MIN(value), MAX(value) FROM dates +---- +1970-01-02 1970-01-06 + +statement ok +DROP TABLE dates; + +# min_seconds, max_seconds +statement ok +CREATE TABLE times (value TIME); + +statement ok +INSERT INTO times VALUES ('00:00:01'), ('00:00:02'), ('00:00:03'), ('00:00:04'), ('00:00:05'); + +query DD +SELECT MIN(value), MAX(value) FROM times +---- +00:00:01 00:00:05 + +statement ok +DROP TABLE times; + +# min_milliseconds, max_milliseconds +statement ok +CREATE TABLE time32millisecond (value TIME); + +statement ok +INSERT INTO time32millisecond VALUES ('00:00:00.001'), ('00:00:00.002'), ('00:00:00.003'), ('00:00:00.004'), ('00:00:00.005'); + +query DD +SELECT MIN(value), MAX(value) FROM time32millisecond +---- +00:00:00.001 00:00:00.005 + +statement ok +DROP TABLE time32millisecond; + +# min_microseconds, max_microseconds +statement ok +CREATE TABLE time64microsecond (value TIME); + +statement ok +INSERT INTO time64microsecond VALUES ('00:00:00.000001'), ('00:00:00.000002'), ('00:00:00.000003'), ('00:00:00.000004'), ('00:00:00.000005'); + +query DD +SELECT MIN(value), MAX(value) FROM time64microsecond +---- +00:00:00.000001 00:00:00.000005 + +statement ok +DROP TABLE time64microsecond; + +# min_nanoseconds, max_nanoseconds +statement ok +CREATE TABLE time64nanosecond (value TIME); + +statement ok +INSERT INTO time64nanosecond VALUES ('00:00:00.000000001'), ('00:00:00.000000002'), ('00:00:00.000000003'), ('00:00:00.000000004'), ('00:00:00.000000005'); + +query DD +SELECT MIN(value), MAX(value) FROM time64nanosecond +---- +00:00:00.000000001 00:00:00.000000005 + +statement ok +DROP TABLE time64nanosecond; + +# min_timestamp, max_timestamp +statement ok +CREATE TABLE timestampmicrosecond (value TIMESTAMP); + +statement ok +INSERT INTO timestampmicrosecond VALUES ('1970-01-01 00:00:00.000001'), ('1970-01-01 00:00:00.000002'), ('1970-01-01 00:00:00.000003'), ('1970-01-01 00:00:00.000004'), ('1970-01-01 00:00:00.000005'); + +query PP +SELECT MIN(value), MAX(value) FROM timestampmicrosecond +---- +1970-01-01T00:00:00.000001 1970-01-01T00:00:00.000005 + +statement ok +DROP TABLE timestampmicrosecond; + +# max_bool +statement ok +CREATE TABLE max_bool (value BOOLEAN); + +statement ok +INSERT INTO max_bool VALUES (false), (false); + +query B +SELECT MAX(value) FROM max_bool +---- +false + +statement ok +DROP TABLE max_bool; + +statement ok +CREATE TABLE max_bool (value BOOLEAN); + +statement ok +INSERT INTO max_bool VALUES (true), (true); + +query B +SELECT MAX(value) FROM max_bool +---- +true + +statement ok +DROP TABLE max_bool; + +statement ok +CREATE TABLE max_bool (value BOOLEAN); + +statement ok +INSERT INTO max_bool VALUES (false), (true), (false); + +query B +SELECT MAX(value) FROM max_bool +---- +true + +statement ok +DROP TABLE max_bool; + +statement ok +CREATE TABLE max_bool (value BOOLEAN); + +statement ok +INSERT INTO max_bool VALUES (true), (false), (true); + +query B +SELECT MAX(value) FROM max_bool +---- +true + +statement ok +DROP TABLE max_bool; + +# min_bool +statement ok +CREATE TABLE min_bool (value BOOLEAN); + +statement ok +INSERT INTO min_bool VALUES (false), (false); + +query B +SELECT MIN(value) FROM min_bool +---- +false + +statement ok +DROP TABLE min_bool; + +statement ok +CREATE TABLE min_bool (value BOOLEAN); + 
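+# Note: boolean MIN/MAX use the ordering false < true, so MIN(value) is false as soon as any row is false and MAX(value) is true as soon as any row is true; the surrounding cases exercise all-false, all-true, and mixed inputs. +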
+statement ok +INSERT INTO min_bool VALUES (true), (true); + +query B +SELECT MIN(value) FROM min_bool +---- +true + +statement ok +DROP TABLE min_bool; + +statement ok +CREATE TABLE min_bool (value BOOLEAN); + +statement ok +INSERT INTO min_bool VALUES (false), (true), (false); + +query B +SELECT MIN(value) FROM min_bool +---- +false + +statement ok +DROP TABLE min_bool; + +statement ok +CREATE TABLE min_bool (value BOOLEAN); + +statement ok +INSERT INTO min_bool VALUES (true), (false), (true); + +query B +SELECT MIN(value) FROM min_bool +---- +false + +statement ok +DROP TABLE min_bool; + +################# +# Min_Max End # +################# + + + +################# +# min_max on strings/binary with null values and groups +################# + +statement ok +CREATE TABLE strings (value TEXT, id int); + +statement ok +INSERT INTO strings VALUES + ('c', 1), + ('d', 1), + ('a', 3), + ('c', 1), + ('b', 1), + (NULL, 1), + (NULL, 4), + ('d', 1), + ('z', 2), + ('c', 1), + ('a', 2); + +############ Utf8 ############ + +query IT +SELECT id, MIN(value) FROM strings GROUP BY id ORDER BY id; +---- +1 b +2 a +3 a +4 NULL + +query IT +SELECT id, MAX(value) FROM strings GROUP BY id ORDER BY id; +---- +1 d +2 z +3 a +4 NULL + +############ LargeUtf8 ############ + +statement ok +CREATE VIEW large_strings AS SELECT id, arrow_cast(value, 'LargeUtf8') as value FROM strings; + + +query IT +SELECT id, MIN(value) FROM large_strings GROUP BY id ORDER BY id; +---- +1 b +2 a +3 a +4 NULL + +query IT +SELECT id, MAX(value) FROM large_strings GROUP BY id ORDER BY id; +---- +1 d +2 z +3 a +4 NULL + +statement ok +DROP VIEW large_strings + +############ Utf8View ############ + +statement ok +CREATE VIEW string_views AS SELECT id, arrow_cast(value, 'Utf8View') as value FROM strings; + + +query IT +SELECT id, MIN(value) FROM string_views GROUP BY id ORDER BY id; +---- +1 b +2 a +3 a +4 NULL + +query IT +SELECT id, MAX(value) FROM string_views GROUP BY id ORDER BY id; +---- +1 d +2 z +3 a +4 NULL + +statement ok +DROP VIEW string_views + +############ Binary ############ + +statement ok +CREATE VIEW binary AS SELECT id, arrow_cast(value, 'Binary') as value FROM strings; + + +query I? +SELECT id, MIN(value) FROM binary GROUP BY id ORDER BY id; +---- +1 62 +2 61 +3 61 +4 NULL + +query I? +SELECT id, MAX(value) FROM binary GROUP BY id ORDER BY id; +---- +1 64 +2 7a +3 61 +4 NULL + +statement ok +DROP VIEW binary + +############ LargeBinary ############ + +statement ok +CREATE VIEW large_binary AS SELECT id, arrow_cast(value, 'LargeBinary') as value FROM strings; + + +query I? +SELECT id, MIN(value) FROM large_binary GROUP BY id ORDER BY id; +---- +1 62 +2 61 +3 61 +4 NULL + +query I? +SELECT id, MAX(value) FROM large_binary GROUP BY id ORDER BY id; +---- +1 64 +2 7a +3 61 +4 NULL + +statement ok +DROP VIEW large_binary + +############ BinaryView ############ + +statement ok +CREATE VIEW binary_views AS SELECT id, arrow_cast(value, 'BinaryView') as value FROM strings; + + +query I? +SELECT id, MIN(value) FROM binary_views GROUP BY id ORDER BY id; +---- +1 62 +2 61 +3 61 +4 NULL + +query I? 
+SELECT id, MAX(value) FROM binary_views GROUP BY id ORDER BY id; +---- +1 64 +2 7a +3 61 +4 NULL + +statement ok +DROP VIEW binary_views + +statement ok +DROP TABLE strings; + +################# +# End min_max on strings/binary with null values and groups +################# + + +statement ok +create table bool_aggregate_functions ( + c1 boolean not null, + c2 boolean not null, + c3 boolean not null, + c4 boolean not null, + c5 boolean, + c6 boolean, + c7 boolean, + c8 boolean +) +as values + (true, true, false, false, true, true, null, null), + (true, false, true, false, false, null, false, null), + (true, true, false, false, null, true, false, null); + +# query_bool_and +query BBBBBBBB +SELECT bool_and(c1), bool_and(c2), bool_and(c3), bool_and(c4), bool_and(c5), bool_and(c6), bool_and(c7), bool_and(c8) FROM bool_aggregate_functions +---- +true false false false false true false NULL + +# query_bool_and_distinct +query BBBBBBBB +SELECT bool_and(distinct c1), bool_and(distinct c2), bool_and(distinct c3), bool_and(distinct c4), bool_and(distinct c5), bool_and(distinct c6), bool_and(distinct c7), bool_and(distinct c8) FROM bool_aggregate_functions +---- +true false false false false true false NULL + +# query_bool_or +query BBBBBBBB +SELECT bool_or(c1), bool_or(c2), bool_or(c3), bool_or(c4), bool_or(c5), bool_or(c6), bool_or(c7), bool_or(c8) FROM bool_aggregate_functions +---- +true true true false true true false NULL + +# query_bool_or_distinct +query BBBBBBBB +SELECT bool_or(distinct c1), bool_or(distinct c2), bool_or(distinct c3), bool_or(distinct c4), bool_or(distinct c5), bool_or(distinct c6), bool_or(distinct c7), bool_or(distinct c8) FROM bool_aggregate_functions +---- +true true true false true true false NULL + +# Test issue: https://github.com/apache/datafusion/issues/11846 +statement ok +create table t1(v1 int, v2 boolean); + +statement ok +insert into t1 values (1, true), (1, true); + +statement ok +insert into t1 values (3, null), (3, true); + +statement ok +insert into t1 values (2, false), (2, true); + +statement ok +insert into t1 values (6, false), (6, false); + +statement ok +insert into t1 values (4, null), (4, null); + +statement ok +insert into t1 values (5, false), (5, null); + +query IB +select v1, bool_and(v2) from t1 group by v1 order by v1; +---- +1 true +2 false +3 true +4 NULL +5 false +6 false + +query IB +select v1, bool_or(v2) from t1 group by v1 order by v1; +---- +1 true +2 true +3 true +4 NULL +5 false +6 false + +statement ok +drop table t1; + +# All supported timestamp types + +# "nanos" --> TimestampNanosecondArray +# "micros" --> TimestampMicrosecondArray +# "millis" --> TimestampMillisecondArray +# "secs" --> TimestampSecondArray +# "names" --> StringArray + +statement ok +create table t_source +as values + ('2018-11-13T17:11:10.011375885995', 'Row 0', 'X'), + ('2011-12-13T11:13:10.12345', 'Row 1', 'X'), + (null, 'Row 2', 'Y'), + ('2021-01-01T05:11:10.432', 'Row 3', 'Y'); + +statement ok +create table t as +select + arrow_cast(column1, 'Timestamp(Nanosecond, None)') as nanos, + arrow_cast(column1, 'Timestamp(Microsecond, None)') as micros, + arrow_cast(column1, 'Timestamp(Millisecond, None)') as millis, + arrow_cast(column1, 'Timestamp(Second, None)') as secs, + arrow_cast(column1, 'Timestamp(Nanosecond, Some("UTC"))') as nanos_utc, + arrow_cast(column1, 'Timestamp(Microsecond, Some("UTC"))') as micros_utc, + arrow_cast(column1, 'Timestamp(Millisecond, Some("UTC"))') as millis_utc, + arrow_cast(column1, 'Timestamp(Second, Some("UTC"))') as secs_utc, + 
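  -- the *_utc columns hold the same instants as the first four, with an explicit UTC timezone +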
column2 as names, + column3 as tag +from t_source; + +# Demonstrate the contents +query PPPPPPPPTT +select * from t; +---- +2018-11-13T17:11:10.011375885 2018-11-13T17:11:10.011375 2018-11-13T17:11:10.011 2018-11-13T17:11:10 2018-11-13T17:11:10.011375885Z 2018-11-13T17:11:10.011375Z 2018-11-13T17:11:10.011Z 2018-11-13T17:11:10Z Row 0 X +2011-12-13T11:13:10.123450 2011-12-13T11:13:10.123450 2011-12-13T11:13:10.123 2011-12-13T11:13:10 2011-12-13T11:13:10.123450Z 2011-12-13T11:13:10.123450Z 2011-12-13T11:13:10.123Z 2011-12-13T11:13:10Z Row 1 X +NULL NULL NULL NULL NULL NULL NULL NULL Row 2 Y +2021-01-01T05:11:10.432 2021-01-01T05:11:10.432 2021-01-01T05:11:10.432 2021-01-01T05:11:10 2021-01-01T05:11:10.432Z 2021-01-01T05:11:10.432Z 2021-01-01T05:11:10.432Z 2021-01-01T05:11:10Z Row 3 Y + + +# aggregate_timestamps_sum +query error +SELECT sum(nanos), sum(micros), sum(millis), sum(secs) FROM t; + +query error +SELECT tag, sum(nanos), sum(micros), sum(millis), sum(secs) FROM t GROUP BY tag ORDER BY tag; + +# aggregate_timestamps_count +query IIII +SELECT count(nanos), count(micros), count(millis), count(secs) FROM t; +---- +3 3 3 3 + +query TIIII +SELECT tag, count(nanos), count(micros), count(millis), count(secs) FROM t GROUP BY tag ORDER BY tag; +---- +X 2 2 2 2 +Y 1 1 1 1 + +# aggregate_timestamps_min +query PPPP +SELECT min(nanos), min(micros), min(millis), min(secs) FROM t; +---- +2011-12-13T11:13:10.123450 2011-12-13T11:13:10.123450 2011-12-13T11:13:10.123 2011-12-13T11:13:10 + +query TPPPP +SELECT tag, min(nanos), min(micros), min(millis), min(secs) FROM t GROUP BY tag ORDER BY tag; +---- +X 2011-12-13T11:13:10.123450 2011-12-13T11:13:10.123450 2011-12-13T11:13:10.123 2011-12-13T11:13:10 +Y 2021-01-01T05:11:10.432 2021-01-01T05:11:10.432 2021-01-01T05:11:10.432 2021-01-01T05:11:10 + +# aggregate_timestamps_max +query PPPP +SELECT max(nanos), max(micros), max(millis), max(secs) FROM t; +---- +2021-01-01T05:11:10.432 2021-01-01T05:11:10.432 2021-01-01T05:11:10.432 2021-01-01T05:11:10 + +query TPPPP +SELECT tag, max(nanos), max(micros), max(millis), max(secs) FROM t GROUP BY tag ORDER BY tag +---- +X 2018-11-13T17:11:10.011375885 2018-11-13T17:11:10.011375 2018-11-13T17:11:10.011 2018-11-13T17:11:10 +Y 2021-01-01T05:11:10.432 2021-01-01T05:11:10.432 2021-01-01T05:11:10.432 2021-01-01T05:11:10 + +# aggregate_timestamps_count_distinct_with_tz +query IIII +SELECT count(DISTINCT nanos_utc), count(DISTINCT micros_utc), count(DISTINCT millis_utc), count(DISTINCT secs_utc) FROM t; +---- +3 3 3 3 + +query TIIII +SELECT tag, count(DISTINCT nanos_utc), count(DISTINCT micros_utc), count(DISTINCT millis_utc), count(DISTINCT secs_utc) FROM t GROUP BY tag ORDER BY tag; +---- +X 2 2 2 2 +Y 1 1 1 1 + +# aggregate_timestamps_avg +query error +SELECT avg(nanos), avg(micros), avg(millis), avg(secs) FROM t + +query error +SELECT tag, avg(nanos), avg(micros), avg(millis), avg(secs) FROM t GROUP BY tag ORDER BY tag; + +# aggregate_duration_array_agg +query T? 
+SELECT tag, array_agg(millis - arrow_cast(secs, 'Timestamp(Millisecond, None)')) FROM t GROUP BY tag ORDER BY tag; +---- +X [0 days 0 hours 0 mins 0.011 secs, 0 days 0 hours 0 mins 0.123 secs] +Y [NULL, 0 days 0 hours 0 mins 0.432 secs] + +statement ok +drop table t_source; + +statement ok +drop table t; + + +# All supported Date types + +# "date32" --> Date32Array +# "date64" --> Date64Array +# "names" --> StringArray + +statement ok +create table t_source +as values + ('2018-11-13', 'Row 0', 'X'), + ('2011-12-13', 'Row 1', 'X'), + (null, 'Row 2', 'Y'), + ('2021-01-01', 'Row 3', 'Y'); + +statement ok +create table t as +select + arrow_cast(column1, 'Date32') as date32, + -- Workaround: once https://github.com/apache/arrow-rs/issues/4512 is fixed, can use this + -- arrow_cast(column1, 'Date64') as date64, + arrow_cast(arrow_cast(column1, 'Date32'), 'Date64') as date64, + column2 as names, + column3 as tag +from t_source; + +# Demonstrate the contents +query DDTT +select * from t; +---- +2018-11-13 2018-11-13T00:00:00 Row 0 X +2011-12-13 2011-12-13T00:00:00 Row 1 X +NULL NULL Row 2 Y +2021-01-01 2021-01-01T00:00:00 Row 3 Y + + +# aggregate_dates_sum +query error +SELECT sum(date32), sum(date64) FROM t; + +query error +SELECT tag, sum(date32), sum(date64) FROM t GROUP BY tag ORDER BY tag; + +# aggregate_dates_count +query II +SELECT count(date32), count(date64) FROM t; +---- +3 3 + +query TII +SELECT tag, count(date32), count(date64) FROM t GROUP BY tag ORDER BY tag; +---- +X 2 2 +Y 1 1 + +# aggregate_dates_min +query DD +SELECT min(date32), min(date64) FROM t; +---- +2011-12-13 2011-12-13T00:00:00 + +query TDD +SELECT tag, min(date32), min(date64) FROM t GROUP BY tag ORDER BY tag; +---- +X 2011-12-13 2011-12-13T00:00:00 +Y 2021-01-01 2021-01-01T00:00:00 + +# aggregate_dates_max +query DD +SELECT max(date32), max(date64) FROM t; +---- +2021-01-01 2021-01-01T00:00:00 + +query TDD +SELECT tag, max(date32), max(date64) FROM t GROUP BY tag ORDER BY tag +---- +X 2018-11-13 2018-11-13T00:00:00 +Y 2021-01-01 2021-01-01T00:00:00 + + +# aggregate_dates_avg +query error +SELECT avg(date32), avg(date64) FROM t + +query error +SELECT tag, avg(date32), avg(date64) FROM t GROUP BY tag ORDER BY tag; + + +statement ok +drop table t_source; + +statement ok +drop table t; + + +# All supported time types + +# Columns are named: +# "nanos" --> Time64NanosecondArray +# "micros" --> Time64MicrosecondArray +# "millis" --> Time32MillisecondArray +# "secs" --> Time32SecondArray +# "names" --> StringArray + +statement ok +create table t_source +as values + ('18:06:30.243620451', 'Row 0', 'A'), + ('20:08:28.161121654', 'Row 1', 'A'), + ('19:11:04.156423842', 'Row 2', 'B'), + ('21:06:28.247821084', 'Row 3', 'B'); + + +statement ok +create table t as +select + arrow_cast(column1, 'Time64(Nanosecond)') as nanos, + arrow_cast(column1, 'Time64(Microsecond)') as micros, + arrow_cast(column1, 'Time32(Millisecond)') as millis, + arrow_cast(column1, 'Time32(Second)') as secs, + column2 as names, + column3 as tag +from t_source; + +# Demonstrate the contents +query DDDDTT +select * from t; +---- +18:06:30.243620451 18:06:30.243620 18:06:30.243 18:06:30 Row 0 A +20:08:28.161121654 20:08:28.161121 20:08:28.161 20:08:28 Row 1 A +19:11:04.156423842 19:11:04.156423 19:11:04.156 19:11:04 Row 2 B +21:06:28.247821084 21:06:28.247821 21:06:28.247 21:06:28 Row 3 B + +# aggregate_times_sum +query error +SELECT sum(nanos), sum(micros), sum(millis), sum(secs) FROM t + +query error +SELECT tag, sum(nanos), sum(micros),
sum(millis), sum(secs) FROM t GROUP BY tag ORDER BY tag + +# aggregate_times_count +query IIII +SELECT count(nanos), count(micros), count(millis), count(secs) FROM t +---- +4 4 4 4 + +query TIIII +SELECT tag, count(nanos), count(micros), count(millis), count(secs) FROM t GROUP BY tag ORDER BY tag +---- +A 2 2 2 2 +B 2 2 2 2 + + +# aggregate_times_min +query DDDD +SELECT min(nanos), min(micros), min(millis), min(secs) FROM t +---- +18:06:30.243620451 18:06:30.243620 18:06:30.243 18:06:30 + +query TDDDD +SELECT tag, min(nanos), min(micros), min(millis), min(secs) FROM t GROUP BY tag ORDER BY tag +---- +A 18:06:30.243620451 18:06:30.243620 18:06:30.243 18:06:30 +B 19:11:04.156423842 19:11:04.156423 19:11:04.156 19:11:04 + +# aggregate_times_max +query DDDD +SELECT max(nanos), max(micros), max(millis), max(secs) FROM t +---- +21:06:28.247821084 21:06:28.247821 21:06:28.247 21:06:28 + +query TDDDD +SELECT tag, max(nanos), max(micros), max(millis), max(secs) FROM t GROUP BY tag ORDER BY tag +---- +A 20:08:28.161121654 20:08:28.161121 20:08:28.161 20:08:28 +B 21:06:28.247821084 21:06:28.247821 21:06:28.247 21:06:28 + + +# aggregate_times_avg +query error +SELECT avg(nanos), avg(micros), avg(millis), avg(secs) FROM t + +query error +SELECT tag, avg(nanos), avg(micros), avg(millis), avg(secs) FROM t GROUP BY tag ORDER BY tag; + +statement ok +drop table t_source; + +statement ok +drop table t; + + +# aggregates on strings +statement ok +create table t_source +as values + ('Foo', 1), + ('Bar', 2), + (null, 2), + ('Baz', 1); + +statement ok +create table t as +select + arrow_cast(column1, 'Utf8') as utf8, + arrow_cast(column1, 'LargeUtf8') as largeutf8, + column2 as tag +from t_source; + +# No group by +query TTITTI +SELECT + min(utf8), + max(utf8), + count(utf8), + min(largeutf8), + max(largeutf8), + count(largeutf8) +FROM t +---- +Bar Foo 3 Bar Foo 3 + + +# with group by +query TTITTI +SELECT + min(utf8), + max(utf8), + count(utf8), + min(largeutf8), + max(largeutf8), + count(largeutf8) +FROM t +GROUP BY tag +ORDER BY tag +---- +Baz Foo 2 Baz Foo 2 +Bar Bar 1 Bar Bar 1 + + +statement ok +drop table t_source; + +statement ok +drop table t; + + +# aggregates on binary +statement ok +create table t_source +as values + ('Foo', 1), + ('Bar', 2), + (null, 2), + ('Baz', 1); + +statement ok +create table t as +select + arrow_cast(column1, 'Binary') as binary, + arrow_cast(column1, 'LargeBinary') as largebinary, + column2 as tag +from t_source; + +# No group by +query ??I??I +SELECT + min(binary), + max(binary), + count(binary), + min(largebinary), + max(largebinary), + count(largebinary) +FROM t +---- +426172 466f6f 3 426172 466f6f 3 + +# with group by +query ??I??I +SELECT + min(binary), + max(binary), + count(binary), + min(largebinary), + max(largebinary), + count(largebinary) +FROM t +GROUP BY tag +ORDER BY tag +---- +42617a 466f6f 2 42617a 466f6f 2 +426172 426172 1 426172 426172 1 + + + +statement ok +drop table t_source; + +statement ok +drop table t; + + +query I +select median(a) from (select 1 as a where 1=0); +---- +NULL + +query I +select approx_median(a) from (select 1 as a where 1=0); +---- +NULL + +# aggregate_decimal_sum +query RT +select sum(c1), arrow_typeof(sum(c1)) from d_table; +---- +100 Decimal128(20, 3) + +# aggregate sum with decimal +statement ok +create table t (c decimal(35, 3)) as values (10), (null), (20); + +query RT +select sum(c), arrow_typeof(sum(c)) from t; +---- +30 Decimal128(38, 3) + +statement ok +drop table t; + +# aggregate sum with i32, sum coerced result to i64
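+# (The next several tests verify that sum widens the result type: Int32 sums to Int64, UInt32 to UInt64, and Float32 to Float64, which reduces the risk of overflow.)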
+statement ok +create table t (c int) as values (1), (-1), (10), (null), (-11); + +query IT +select sum(c), arrow_typeof(sum(c)) from t; +---- +-1 Int64 + +statement ok +drop table t; + +# aggregate sum with all nulls +statement ok +create table t (c1 decimal(10, 0), c2 int) as values (null, null), (null, null), (null, null); + +query RTIT +select + sum(c1), arrow_typeof(sum(c1)), + sum(c2), arrow_typeof(sum(c2)) +from t; +---- +NULL Decimal128(20, 0) NULL Int64 + +statement ok +drop table t; + +# aggregate sum with u32, sum coerced result to u64 +statement ok +create table t (c int unsigned) as values (1), (0), (10), (null), (4); + +query IT +select sum(c), arrow_typeof(sum(c)) from t; +---- +15 UInt64 + +statement ok +drop table t; + +# aggregate sum with f32, sum coerced result to f64 +statement ok +create table t (c float) as values (1.2), (0.2), (-1.2), (null), (-1.0); + +query RT +select sum(c), arrow_typeof(sum(c)) from t; +---- +-0.79999999702 Float64 + +statement ok +drop table t; + +# aggregate sum with f64 +statement ok +create table t (c double) as values (1.2), (0.2), (-1.2), (null), (-1.0); + +query RT +select sum(c), arrow_typeof(sum(c)) from t; +---- +-0.8 Float64 + +statement ok +drop table t; + +query TRT +select c2, sum(c1), arrow_typeof(sum(c1)) from d_table GROUP BY c2 ORDER BY c2; +---- +A 1100.045 Decimal128(20, 3) +B -1000.045 Decimal128(20, 3) + + +# aggregate_decimal_avg +query RT +select avg(c1), arrow_typeof(avg(c1)) from d_table +---- +5 Decimal128(14, 7) + +query TRT +select c2, avg(c1), arrow_typeof(avg(c1)) from d_table GROUP BY c2 ORDER BY c2 +---- +A 110.0045 Decimal128(14, 7) +B -100.0045 Decimal128(14, 7) + +# aggregate_decimal_count_distinct +query I +select count(DISTINCT cast(c1 AS DECIMAL(10, 2))) from d_table +---- +4 + +query TI +select c2, count(DISTINCT cast(c1 AS DECIMAL(10, 2))) from d_table GROUP BY c2 ORDER BY c2 +---- +A 2 +B 2 + +# Use PostgreSQL dialect +statement ok +set datafusion.sql_parser.dialect = 'Postgres'; + +# Creating the table +statement ok +CREATE TABLE test_table (c1 INT, c2 INT, c3 INT) + +# Inserting data +statement ok +INSERT INTO test_table VALUES + (1, 10, 50), + (1, 20, 60), + (2, 10, 70), + (2, 20, 80), + (3, 10, NULL) + +# query_group_by_with_filter +query III rowsort +SELECT + c1, + SUM(c2) FILTER (WHERE c2 >= 20), + SUM(c2) FILTER (WHERE c2 < 1) -- no rows pass the filter, so the output should be NULL +FROM test_table GROUP BY c1 +---- +1 20 NULL +2 20 NULL +3 NULL NULL + +# query_group_by_avg_with_filter +query IRR rowsort +SELECT + c1, + AVG(c2) FILTER (WHERE c2 >= 20), + AVG(c2) FILTER (WHERE c2 < 1) -- no rows pass the filter, so the output should be NULL +FROM test_table GROUP BY c1 +---- +1 20 NULL +2 20 NULL +3 NULL NULL + +# query_group_by_with_multiple_filters +query IIR rowsort +SELECT + c1, + SUM(c2) FILTER (WHERE c2 >= 20) AS sum_c2, + AVG(c3) FILTER (WHERE c3 <= 70) AS avg_c3 +FROM test_table GROUP BY c1 +---- +1 20 55 +2 20 70 +3 NULL NULL + +# query_group_by_distinct_with_filter +query II rowsort +SELECT + c1, + COUNT(DISTINCT c2) FILTER (WHERE c2 >= 20) AS distinct_c2_count +FROM test_table GROUP BY c1 +---- +1 1 +2 1 +3 0 + +# query_without_group_by_with_filter +query I rowsort +SELECT + SUM(c2) FILTER (WHERE c2 >= 20) AS sum_c2 +FROM test_table +---- +40 + +# count_without_group_by_with_filter +query I rowsort +SELECT + COUNT(c2) FILTER (WHERE c2 >= 20) AS count_c2 +FROM test_table +---- +2 + +# query_with_and_without_filter +query III rowsort +SELECT + c1, + SUM(c2) FILTER (WHERE c2 >= 20) as result, +
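  -- the unfiltered SUM below aggregates every row in the group +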
SUM(c2) as result_no_filter +FROM test_table GROUP BY c1; +---- +1 20 30 +2 20 30 +3 NULL 10 + +# query_filter_on_different_column_than_aggregate +query I rowsort +select + sum(c1) FILTER (WHERE c2 < 30) +FROM test_table; +---- +9 + +# query_test_empty_filter +query I rowsort +SELECT + SUM(c2) FILTER (WHERE c2 >= 20000000) AS sum_c2 +FROM test_table; +---- +NULL + +# Creating the decimal table +statement ok +CREATE TABLE test_decimal_table (c1 INT, c2 DECIMAL(5, 2), c3 DECIMAL(5, 1), c4 DECIMAL(5, 1)) + +# Inserting data +statement ok +INSERT INTO test_decimal_table VALUES (1, 10.10, 100.1, NULL), (1, 20.20, 200.2, NULL), (2, 10.10, 700.1, NULL), (2, 20.20, 700.1, NULL), (3, 10.1, 100.1, NULL), (3, 10.1, NULL, NULL) + +# aggregate_decimal_with_group_by +query IIRRRRIIRR rowsort +select c1, count(c2), avg(c2), sum(c2), min(c2), max(c2), count(c3), count(c4), sum(c4), avg(c4) from test_decimal_table group by c1 +---- +1 2 15.15 30.3 10.1 20.2 2 0 NULL NULL +2 2 15.15 30.3 10.1 20.2 2 0 NULL NULL +3 2 10.1 20.2 10.1 10.1 1 0 NULL NULL + +# aggregate_decimal_with_group_by_decimal +query RIRRRRIR rowsort +select c3, count(c2), avg(c2), sum(c2), min(c2), max(c2), count(c4), sum(c4) from test_decimal_table group by c3 +---- +100.1 2 10.1 20.2 10.1 10.1 0 NULL +200.2 1 20.2 20.2 20.2 20.2 0 NULL +700.1 2 15.15 30.3 10.1 20.2 0 NULL +NULL 1 10.1 10.1 10.1 10.1 0 NULL + +# Restore the default dialect +statement ok +set datafusion.sql_parser.dialect = 'Generic'; + +## Multiple distinct aggregates and dictionaries +statement ok +create table dict_test as values (1, arrow_cast('foo', 'Dictionary(Int32, Utf8)')), (2, arrow_cast('bar', 'Dictionary(Int32, Utf8)')); + +query IT +select * from dict_test; +---- +1 foo +2 bar + +query II +select count(distinct column1), count(distinct column2) from dict_test group by column1; +---- +1 1 +1 1 + +statement ok +drop table dict_test; + + +# Prepare the table with dictionary values for testing +statement ok +CREATE TABLE value(x bigint) AS VALUES (1), (2), (3), (1), (3), (4), (5), (2); + +statement ok +CREATE TABLE value_dict AS SELECT arrow_cast(x, 'Dictionary(Int64, Int32)') AS x_dict FROM value; + +query ? 
+select x_dict from value_dict; +---- +1 +2 +3 +1 +3 +4 +5 +2 + +query I +select sum(x_dict) from value_dict; +---- +21 + +query R +select avg(x_dict) from value_dict; +---- +2.625 + +# distinct_average +query R +select avg(distinct x_dict) from value_dict; +---- +3 + +query error +select avg(x_dict), avg(distinct x_dict) from value_dict; + +query I +select min(x_dict) from value_dict; +---- +1 + +query I +select max(x_dict) from value_dict; +---- +5 + +query I +select sum(x_dict) from value_dict where x_dict > 3; +---- +9 + +query R +select avg(x_dict) from value_dict where x_dict > 3; +---- +4.5 + +query I +select min(x_dict) from value_dict where x_dict > 3; +---- +4 + +query I +select max(x_dict) from value_dict where x_dict > 3; +---- +5 + +query I +select sum(x_dict) from value_dict group by x_dict % 2 order by sum(x_dict); +---- +8 +13 + +query R +select avg(x_dict) from value_dict group by x_dict % 2 order by avg(x_dict); +---- +2.6 +2.666666666667 + +query I +select min(x_dict) from value_dict group by x_dict % 2 order by min(x_dict); +---- +1 +2 + +query I +select max(x_dict) from value_dict group by x_dict % 2 order by max(x_dict); +---- +4 +5 + +query T +select arrow_typeof(x_dict) from value_dict group by x_dict; +---- +Dictionary(Int64, Int32) +Dictionary(Int64, Int32) +Dictionary(Int64, Int32) +Dictionary(Int64, Int32) +Dictionary(Int64, Int32) + +statement ok +drop table value + +statement ok +drop table value_dict + + +# bool aggregation +statement ok +CREATE TABLE value_bool(x boolean, g int) AS VALUES (NULL, 0), (false, 0), (true, 0), (false, 1), (true, 2), (NULL, 3); + +query B +select min(x) from value_bool; +---- +false + +query B +select max(x) from value_bool; +---- +true + +query B +select min(x) from value_bool group by g order by g; +---- +false +false +true +NULL + +query B +select max(x) from value_bool group by g order by g; +---- +true +false +true +NULL + +# +# Add valid distinct case as aggregation plan test +# + +query TT +EXPLAIN SELECT DISTINCT c3, min(c1) FROM aggregate_test_100 group by c3 limit 5; +---- +logical_plan +01)Limit: skip=0, fetch=5 +02)--Aggregate: groupBy=[[aggregate_test_100.c3, min(aggregate_test_100.c1)]], aggr=[[]] +03)----Aggregate: groupBy=[[aggregate_test_100.c3]], aggr=[[min(aggregate_test_100.c1)]] +04)------TableScan: aggregate_test_100 projection=[c1, c3] +physical_plan +01)GlobalLimitExec: skip=0, fetch=5 +02)--CoalescePartitionsExec +03)----AggregateExec: mode=FinalPartitioned, gby=[c3@0 as c3, min(aggregate_test_100.c1)@1 as min(aggregate_test_100.c1)], aggr=[], lim=[5] +04)------CoalesceBatchesExec: target_batch_size=8192 +05)--------RepartitionExec: partitioning=Hash([c3@0, min(aggregate_test_100.c1)@1], 4), input_partitions=4 +06)----------AggregateExec: mode=Partial, gby=[c3@0 as c3, min(aggregate_test_100.c1)@1 as min(aggregate_test_100.c1)], aggr=[], lim=[5] +07)------------AggregateExec: mode=FinalPartitioned, gby=[c3@0 as c3], aggr=[min(aggregate_test_100.c1)] +08)--------------CoalesceBatchesExec: target_batch_size=8192 +09)----------------RepartitionExec: partitioning=Hash([c3@0], 4), input_partitions=4 +10)------------------AggregateExec: mode=Partial, gby=[c3@1 as c3], aggr=[min(aggregate_test_100.c1)] +11)--------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +12)----------------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c3], has_header=true + + +# +# Push limit into distinct group-by aggregation tests +# + 
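+# When only the first N distinct groups are needed, the optimizer can attach a soft limit to the group-by AggregateExec (shown as lim=[N] in the plans below) so that accumulation can stop once N groups have been produced; the EXPLAIN plans in this section check when that pushdown does and does not apply. +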
+# Make results deterministic +statement ok +set datafusion.optimizer.repartition_aggregations = false; + +# +query TT +EXPLAIN SELECT DISTINCT c3 FROM aggregate_test_100 group by c3 limit 5; +---- +logical_plan +01)Limit: skip=0, fetch=5 +02)--Aggregate: groupBy=[[aggregate_test_100.c3]], aggr=[[]] +03)----TableScan: aggregate_test_100 projection=[c3] +physical_plan +01)GlobalLimitExec: skip=0, fetch=5 +02)--AggregateExec: mode=Final, gby=[c3@0 as c3], aggr=[], lim=[5] +03)----CoalescePartitionsExec +04)------AggregateExec: mode=Partial, gby=[c3@0 as c3], aggr=[], lim=[5] +05)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +06)----------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c3], has_header=true + +query I +SELECT DISTINCT c3 FROM aggregate_test_100 group by c3 limit 5; +---- +1 +-40 +29 +-85 +-82 + +query TT +EXPLAIN SELECT c2, c3 FROM aggregate_test_100 group by c2, c3 limit 5 offset 4; +---- +logical_plan +01)Limit: skip=4, fetch=5 +02)--Aggregate: groupBy=[[aggregate_test_100.c2, aggregate_test_100.c3]], aggr=[[]] +03)----TableScan: aggregate_test_100 projection=[c2, c3] +physical_plan +01)GlobalLimitExec: skip=4, fetch=5 +02)--AggregateExec: mode=Final, gby=[c2@0 as c2, c3@1 as c3], aggr=[], lim=[9] +03)----CoalescePartitionsExec +04)------AggregateExec: mode=Partial, gby=[c2@0 as c2, c3@1 as c3], aggr=[], lim=[9] +05)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +06)----------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c2, c3], has_header=true + +query II +SELECT c2, c3 FROM aggregate_test_100 group by c2, c3 limit 5 offset 4; +---- +5 -82 +4 -111 +3 104 +3 13 +1 38 + +# The limit should only apply to the aggregations which group by c3 +query TT +EXPLAIN SELECT DISTINCT c3 FROM aggregate_test_100 WHERE c3 between 10 and 20 group by c2, c3 limit 4; +---- +logical_plan +01)Limit: skip=0, fetch=4 +02)--Aggregate: groupBy=[[aggregate_test_100.c3]], aggr=[[]] +03)----Projection: aggregate_test_100.c3 +04)------Aggregate: groupBy=[[aggregate_test_100.c2, aggregate_test_100.c3]], aggr=[[]] +05)--------Filter: aggregate_test_100.c3 >= Int16(10) AND aggregate_test_100.c3 <= Int16(20) +06)----------TableScan: aggregate_test_100 projection=[c2, c3], partial_filters=[aggregate_test_100.c3 >= Int16(10), aggregate_test_100.c3 <= Int16(20)] +physical_plan +01)GlobalLimitExec: skip=0, fetch=4 +02)--AggregateExec: mode=Final, gby=[c3@0 as c3], aggr=[], lim=[4] +03)----CoalescePartitionsExec +04)------AggregateExec: mode=Partial, gby=[c3@0 as c3], aggr=[], lim=[4] +05)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +06)----------ProjectionExec: expr=[c3@1 as c3] +07)------------AggregateExec: mode=Final, gby=[c2@0 as c2, c3@1 as c3], aggr=[] +08)--------------CoalescePartitionsExec +09)----------------AggregateExec: mode=Partial, gby=[c2@0 as c2, c3@1 as c3], aggr=[] +10)------------------CoalesceBatchesExec: target_batch_size=8192 +11)--------------------FilterExec: c3@1 >= 10 AND c3@1 <= 20 +12)----------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +13)------------------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c2, c3], has_header=true + +query I +SELECT DISTINCT c3 FROM aggregate_test_100 WHERE c3 between 10 and 20 group by c2, c3 limit 4; +---- +13 +17 +12 +14 + +# An aggregate 
expression causes the limit to not be pushed to the aggregation +query TT +EXPLAIN SELECT max(c1), c2, c3 FROM aggregate_test_100 group by c2, c3 limit 5; +---- +logical_plan +01)Projection: max(aggregate_test_100.c1), aggregate_test_100.c2, aggregate_test_100.c3 +02)--Limit: skip=0, fetch=5 +03)----Aggregate: groupBy=[[aggregate_test_100.c2, aggregate_test_100.c3]], aggr=[[max(aggregate_test_100.c1)]] +04)------TableScan: aggregate_test_100 projection=[c1, c2, c3] +physical_plan +01)ProjectionExec: expr=[max(aggregate_test_100.c1)@2 as max(aggregate_test_100.c1), c2@0 as c2, c3@1 as c3] +02)--GlobalLimitExec: skip=0, fetch=5 +03)----AggregateExec: mode=Final, gby=[c2@0 as c2, c3@1 as c3], aggr=[max(aggregate_test_100.c1)] +04)------CoalescePartitionsExec +05)--------AggregateExec: mode=Partial, gby=[c2@1 as c2, c3@2 as c3], aggr=[max(aggregate_test_100.c1)] +06)----------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +07)------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c2, c3], has_header=true + +# TODO(msirek): Extend checking in LimitedDistinctAggregation equal groupings to ignore the order of columns +# in the group-by column lists, so the limit could be pushed to the lowest AggregateExec in this case +query TT +EXPLAIN SELECT DISTINCT c3, c2 FROM aggregate_test_100 group by c2, c3 limit 3 offset 10; +---- +logical_plan +01)Limit: skip=10, fetch=3 +02)--Aggregate: groupBy=[[aggregate_test_100.c3, aggregate_test_100.c2]], aggr=[[]] +03)----Projection: aggregate_test_100.c3, aggregate_test_100.c2 +04)------Aggregate: groupBy=[[aggregate_test_100.c2, aggregate_test_100.c3]], aggr=[[]] +05)--------TableScan: aggregate_test_100 projection=[c2, c3] +physical_plan +01)GlobalLimitExec: skip=10, fetch=3 +02)--AggregateExec: mode=Final, gby=[c3@0 as c3, c2@1 as c2], aggr=[], lim=[13] +03)----CoalescePartitionsExec +04)------AggregateExec: mode=Partial, gby=[c3@0 as c3, c2@1 as c2], aggr=[], lim=[13] +05)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +06)----------ProjectionExec: expr=[c3@1 as c3, c2@0 as c2] +07)------------AggregateExec: mode=Final, gby=[c2@0 as c2, c3@1 as c3], aggr=[] +08)--------------CoalescePartitionsExec +09)----------------AggregateExec: mode=Partial, gby=[c2@0 as c2, c3@1 as c3], aggr=[] +10)------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +11)--------------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c2, c3], has_header=true + +query II +SELECT DISTINCT c3, c2 FROM aggregate_test_100 group by c2, c3 limit 3 offset 10; +---- +57 1 +-54 4 +112 3 + +query TT +EXPLAIN SELECT c2, c3 FROM aggregate_test_100 group by rollup(c2, c3) limit 3; +---- +logical_plan +01)Projection: aggregate_test_100.c2, aggregate_test_100.c3 +02)--Limit: skip=0, fetch=3 +03)----Aggregate: groupBy=[[ROLLUP (aggregate_test_100.c2, aggregate_test_100.c3)]], aggr=[[]] +04)------TableScan: aggregate_test_100 projection=[c2, c3] +physical_plan +01)ProjectionExec: expr=[c2@0 as c2, c3@1 as c3] +02)--GlobalLimitExec: skip=0, fetch=3 +03)----AggregateExec: mode=Final, gby=[c2@0 as c2, c3@1 as c3, __grouping_id@2 as __grouping_id], aggr=[], lim=[3] +04)------CoalescePartitionsExec +05)--------AggregateExec: mode=Partial, gby=[(NULL as c2, NULL as c3), (c2@0 as c2, NULL as c3), (c2@0 as c2, c3@1 as c3)], aggr=[] +06)----------RepartitionExec: partitioning=RoundRobinBatch(4), 
input_partitions=1 +07)------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c2, c3], has_header=true + +query II +SELECT c2, c3 FROM aggregate_test_100 group by rollup(c2, c3) limit 3; +---- +NULL NULL +2 NULL +5 NULL + + +statement ok +set datafusion.optimizer.enable_distinct_aggregation_soft_limit = false; + +# The limit should not be pushed into the aggregations +query TT +EXPLAIN SELECT DISTINCT c3 FROM aggregate_test_100 group by c3 limit 5; +---- +logical_plan +01)Limit: skip=0, fetch=5 +02)--Aggregate: groupBy=[[aggregate_test_100.c3]], aggr=[[]] +03)----TableScan: aggregate_test_100 projection=[c3] +physical_plan +01)GlobalLimitExec: skip=0, fetch=5 +02)--AggregateExec: mode=Final, gby=[c3@0 as c3], aggr=[] +03)----CoalescePartitionsExec +04)------AggregateExec: mode=Partial, gby=[c3@0 as c3], aggr=[] +05)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +06)----------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c3], has_header=true + +statement ok +set datafusion.optimizer.enable_distinct_aggregation_soft_limit = true; + +statement ok +set datafusion.optimizer.repartition_aggregations = true; + +# +# regr_*() tests +# + +# regr_*() invalid input +statement error +select regr_slope(); + +statement error +select regr_intercept(*); + +statement error +select regr_count(*) from aggregate_test_100; + +statement error +select regr_r2(1); + +statement error +select regr_avgx(1,2,3); + +statement error +select regr_avgy(1, 'foo'); + +statement error +select regr_sxx('foo', 1); + +statement error +select regr_syy('foo', 'bar'); + +statement error +select regr_sxy(NULL, 'bar'); + + + +# regr_*() NULL results +query RRIRRRRRR +select regr_slope(1,1), regr_intercept(1,1), regr_count(1,1), regr_r2(1,1), regr_avgx(1,1), regr_avgy(1,1), regr_sxx(1,1), regr_syy(1,1), regr_sxy(1,1); +---- +NULL NULL 1 NULL 1 1 0 0 0 + +query RRIRRRRRR +select regr_slope(1, NULL), regr_intercept(1, NULL), regr_count(1, NULL), regr_r2(1, NULL), regr_avgx(1, NULL), regr_avgy(1, NULL), regr_sxx(1, NULL), regr_syy(1, NULL), regr_sxy(1, NULL); +---- +NULL NULL 0 NULL NULL NULL NULL NULL NULL + +query RRIRRRRRR +select regr_slope(NULL, 1), regr_intercept(NULL, 1), regr_count(NULL, 1), regr_r2(NULL, 1), regr_avgx(NULL, 1), regr_avgy(NULL, 1), regr_sxx(NULL, 1), regr_syy(NULL, 1), regr_sxy(NULL, 1); +---- +NULL NULL 0 NULL NULL NULL NULL NULL NULL + +query RRIRRRRRR +select regr_slope(NULL, NULL), regr_intercept(NULL, NULL), regr_count(NULL, NULL), regr_r2(NULL, NULL), regr_avgx(NULL, NULL), regr_avgy(NULL, NULL), regr_sxx(NULL, NULL), regr_syy(NULL, NULL), regr_sxy(NULL, NULL); +---- +NULL NULL 0 NULL NULL NULL NULL NULL NULL + +query RRIRRRRRR +select regr_slope(column2, column1), regr_intercept(column2, column1), regr_count(column2, column1), regr_r2(column2, column1), regr_avgx(column2, column1), regr_avgy(column2, column1), regr_sxx(column2, column1), regr_syy(column2, column1), regr_sxy(column2, column1) from (values (1,2), (1,4), (1,6)); +---- +NULL NULL 3 NULL 1 4 0 8 0 + + + +# regr_*() basic tests +query RRIRRRRRR +select + regr_slope(column2, column1), + regr_intercept(column2, column1), + regr_count(column2, column1), + regr_r2(column2, column1), + regr_avgx(column2, column1), + regr_avgy(column2, column1), + regr_sxx(column2, column1), + regr_syy(column2, column1), + regr_sxy(column2, column1) +from (values (1,2), (2,4), (3,6)); +---- +2 0 3 1 2 4 2 8 4 + +query 
RRIRRRRRR +select + regr_slope(c12, c11), + regr_intercept(c12, c11), + regr_count(c12, c11), + regr_r2(c12, c11), + regr_avgx(c12, c11), + regr_avgy(c12, c11), + regr_sxx(c12, c11), + regr_syy(c12, c11), + regr_sxy(c12, c11) +from aggregate_test_100; +---- +0.051534002628 0.48427355347 100 0.001929150558 0.479274948239 0.508972509913 6.707779292571 9.234223721582 0.345678715695 + + + +# regr_*() functions ignore NULLs +query RRIRRRRRR +select + regr_slope(column2, column1), + regr_intercept(column2, column1), + regr_count(column2, column1), + regr_r2(column2, column1), + regr_avgx(column2, column1), + regr_avgy(column2, column1), + regr_sxx(column2, column1), + regr_syy(column2, column1), + regr_sxy(column2, column1) +from (values (1,NULL), (2,4), (3,6)); +---- +2 0 2 1 2.5 5 0.5 2 1 + +query RRIRRRRRR +select + regr_slope(column2, column1), + regr_intercept(column2, column1), + regr_count(column2, column1), + regr_r2(column2, column1), + regr_avgx(column2, column1), + regr_avgy(column2, column1), + regr_sxx(column2, column1), + regr_syy(column2, column1), + regr_sxy(column2, column1) +from (values (1,NULL), (NULL,4), (3,6)); +---- +NULL NULL 1 NULL 3 6 0 0 0 + +query RRIRRRRRR +select + regr_slope(column2, column1), + regr_intercept(column2, column1), + regr_count(column2, column1), + regr_r2(column2, column1), + regr_avgx(column2, column1), + regr_avgy(column2, column1), + regr_sxx(column2, column1), + regr_syy(column2, column1), + regr_sxy(column2, column1) +from (values (1,NULL), (NULL,4), (NULL,NULL)); +---- +NULL NULL 0 NULL NULL NULL NULL NULL NULL + +query TRRIRRRRRR rowsort +select + column3, + regr_slope(column2, column1), + regr_intercept(column2, column1), + regr_count(column2, column1), + regr_r2(column2, column1), + regr_avgx(column2, column1), + regr_avgy(column2, column1), + regr_sxx(column2, column1), + regr_syy(column2, column1), + regr_sxy(column2, column1) +from (values (1,2,'a'), (2,4,'a'), (1,3,'b'), (3,9,'b'), (1,10,'c'), (NULL,100,'c')) +group by column3; +---- +a 2 0 2 1 1.5 3 0.5 2 1 +b 3 0 2 1 2 6 2 18 6 +c NULL NULL 1 NULL 1 10 0 0 0 + + + +# regr_*() testing merge_batch() from RegrAccumulator's internal implementation +statement ok +set datafusion.execution.batch_size = 1; + +query RRIRRRRRR +select + regr_slope(c12, c11), + regr_intercept(c12, c11), + regr_count(c12, c11), + regr_r2(c12, c11), + regr_avgx(c12, c11), + regr_avgy(c12, c11), + regr_sxx(c12, c11), + regr_syy(c12, c11), + regr_sxy(c12, c11) +from aggregate_test_100; +---- +0.051534002628 0.48427355347 100 0.001929150558 0.479274948239 0.508972509913 6.707779292571 9.234223721582 0.345678715695 + +statement ok +set datafusion.execution.batch_size = 2; + +query RRIRRRRRR +select + regr_slope(c12, c11), + regr_intercept(c12, c11), + regr_count(c12, c11), + regr_r2(c12, c11), + regr_avgx(c12, c11), + regr_avgy(c12, c11), + regr_sxx(c12, c11), + regr_syy(c12, c11), + regr_sxy(c12, c11) +from aggregate_test_100; +---- +0.051534002628 0.48427355347 100 0.001929150558 0.479274948239 0.508972509913 6.707779292571 9.234223721582 0.345678715695 + +statement ok +set datafusion.execution.batch_size = 3; + +query RRIRRRRRR +select + regr_slope(c12, c11), + regr_intercept(c12, c11), + regr_count(c12, c11), + regr_r2(c12, c11), + regr_avgx(c12, c11), + regr_avgy(c12, c11), + regr_sxx(c12, c11), + regr_syy(c12, c11), + regr_sxy(c12, c11) +from aggregate_test_100; +---- +0.051534002628 0.48427355347 100 0.001929150558 0.479274948239 0.508972509913 6.707779292571 9.234223721582 0.345678715695 + +statement ok +set 
datafusion.execution.batch_size = 8192; + + + +# regr_*() testing retract_batch() from RegrAccumulator's internal implementation +query RRIRRRRRR +SELECT + regr_slope(column2, column1) OVER w AS slope, + regr_intercept(column2, column1) OVER w AS intercept, + regr_count(column2, column1) OVER w AS count, + regr_r2(column2, column1) OVER w AS r2, + regr_avgx(column2, column1) OVER w AS avgx, + regr_avgy(column2, column1) OVER w AS avgy, + regr_sxx(column2, column1) OVER w AS sxx, + regr_syy(column2, column1) OVER w AS syy, + regr_sxy(column2, column1) OVER w AS sxy +FROM (VALUES (1,2), (2,4), (3,6), (4,12), (5,15), (6,18)) AS t(column1, column2) +WINDOW w AS (ORDER BY column1 ROWS BETWEEN 2 PRECEDING AND CURRENT ROW); +---- +NULL NULL 1 NULL 1 2 0 0 0 +2 0 2 1 1.5 3 0.5 2 1 +2 0 3 1 2 4 2 8 4 +4 -4.666666666667 3 0.923076923077 3 7.333333333333 2 34.666666666667 8 +4.5 -7 3 0.964285714286 4 11 2 42 9 +3 0 3 1 5 15 2 18 6 + +query RRIRRRRRR +SELECT + regr_slope(column2, column1) OVER w AS slope, + regr_intercept(column2, column1) OVER w AS intercept, + regr_count(column2, column1) OVER w AS count, + regr_r2(column2, column1) OVER w AS r2, + regr_avgx(column2, column1) OVER w AS avgx, + regr_avgy(column2, column1) OVER w AS avgy, + regr_sxx(column2, column1) OVER w AS sxx, + regr_syy(column2, column1) OVER w AS syy, + regr_sxy(column2, column1) OVER w AS sxy +FROM (VALUES (1,2), (2,4), (3,6), (3, NULL), (4, NULL), (5,15), (6,18), (7, 21)) AS t(column1, column2) +WINDOW w AS (ORDER BY column1 ROWS BETWEEN 2 PRECEDING AND CURRENT ROW); +---- +NULL NULL 1 NULL 1 2 0 0 0 +2 0 2 1 1.5 3 0.5 2 1 +2 0 3 1 2 4 2 8 4 +2 0 2 1 2.5 5 0.5 2 1 +NULL NULL 1 NULL 3 6 0 0 0 +NULL NULL 1 NULL 5 15 0 0 0 +3 0 2 1 5.5 16.5 0.5 4.5 1.5 +3 0 3 1 6 18 2 18 6 + +statement error +SELECT STRING_AGG() + +statement error +SELECT STRING_AGG(1,2,3) + +statement error +SELECT STRING_AGG(STRING_AGG('a', ',')) + +query T +SELECT STRING_AGG('a', ',') +---- +a + +query TTTT +SELECT STRING_AGG('a',','), STRING_AGG('a', NULL), STRING_AGG(NULL, ','), STRING_AGG(NULL, NULL) +---- +a a NULL NULL + +query TT +select string_agg('', '|'), string_agg('a', ''); +---- +(empty) a + +query T +SELECT STRING_AGG(column1, '|') FROM (values (''), (null), ('')); +---- +| + +statement ok +CREATE TABLE strings(g INTEGER, x VARCHAR, y VARCHAR) + +query I +INSERT INTO strings VALUES (1,'a','/'), (1,'b','-'), (2,'i','/'), (2,NULL,'-'), (2,'j','+'), (3,'p','/'), (4,'x','/'), (4,'y','-'), (4,'z','+') +---- +9 + +query IT +SELECT g, STRING_AGG(x,'|') FROM strings GROUP BY g ORDER BY g +---- +1 a|b +2 i|j +3 p +4 x|y|z + +query T +SELECT STRING_AGG(x,',') FROM strings WHERE g > 100 +---- +NULL + +statement ok +drop table strings + +query T +WITH my_data as ( +SELECT 'text1'::varchar(1000) as my_column union all +SELECT 'text1'::varchar(1000) as my_column union all +SELECT 'text1'::varchar(1000) as my_column +) +SELECT string_agg(my_column,', ') as my_string_agg +FROM my_data +---- +text1, text1, text1 + +query T +WITH my_data as ( +SELECT 1 as dummy, 'text1'::varchar(1000) as my_column union all +SELECT 1 as dummy, 'text1'::varchar(1000) as my_column union all +SELECT 1 as dummy, 'text1'::varchar(1000) as my_column +) +SELECT string_agg(my_column,', ') as my_string_agg +FROM my_data +GROUP BY dummy +---- +text1, text1, text1 + +# Tests for aggregating with NaN values +statement ok +CREATE TABLE float_table ( + col_f32 FLOAT, + col_f32_nan FLOAT, + col_f64 DOUBLE, + col_f64_nan DOUBLE +) as VALUES +( -128.2, -128.2, -128.2, -128.2 ), +( 32768.3, 
arrow_cast('NAN','Float32'), 32768.3, 32768.3 ), +( 27.3, 27.3, 27.3, arrow_cast('NAN','Float64') ); + +# Test string_agg with largeutf8 +statement ok +create table string_agg_large_utf8 (c string) as values + (arrow_cast('a', 'LargeUtf8')), + (arrow_cast('b', 'LargeUtf8')), + (arrow_cast('c', 'LargeUtf8')) +; + +query T +SELECT STRING_AGG(c, ',') FROM string_agg_large_utf8; +---- +a,b,c + +statement ok +drop table string_agg_large_utf8; + +query RRRRI +select min(col_f32), max(col_f32), avg(col_f32), sum(col_f32), count(col_f32) from float_table; +---- +-128.2 32768.3 10889.13359451294 32667.40078353882 3 + +query RRRRI +select min(col_f32_nan), max(col_f32_nan), avg(col_f32_nan), sum(col_f32_nan), count(col_f32_nan) from float_table; +---- +-128.2 NaN NaN NaN 3 + +query RRRRI +select min(col_f64), max(col_f64), avg(col_f64), sum(col_f64), count(col_f64) from float_table; +---- +-128.2 32768.3 10889.133333333333 32667.4 3 + +query RRRRI +select min(col_f64_nan), max(col_f64_nan), avg(col_f64_nan), sum(col_f64_nan), count(col_f64_nan) from float_table; +---- +-128.2 NaN NaN NaN 3 + +statement ok +drop table float_table + + +# Queries with nested count(*) + +query I +select count(*) from (select count(*) from (select 1)); +---- +1 + +query I +select count(*) from (select count(*) a, count(*) b from (select 1)); +---- +1 + +# Distinct Count for string +# (test for the specialized implementation of distinct count for strings) + +# UTF8 string matters for string to &[u8] conversion, add it to prevent regression +statement ok +create table distinct_count_string_table as values + (1, 'a', 'longstringtest_a', '台灣'), + (2, 'b', 'longstringtest_b1', '日本'), + (2, 'b', 'longstringtest_b2', '中國'), + (3, 'c', 'longstringtest_c1', '美國'), + (3, 'c', 'longstringtest_c2', '歐洲'), + (3, 'c', 'longstringtest_c3', '韓國') +; + +# run through update_batch +query IIII +select count(distinct column1), count(distinct column2), count(distinct column3), count(distinct column4) from distinct_count_string_table; +---- +3 3 6 6 + +# run through merge_batch +query IIII rowsort +select count(distinct column1), count(distinct column2), count(distinct column3), count(distinct column4) from distinct_count_string_table group by column1; +---- +1 1 1 1 +1 1 2 2 +1 1 3 3 + + +# test with long strings as well +statement ok +create table distinct_count_long_string_table as +SELECT column1, + arrow_cast(column2, 'LargeUtf8') as column2, + arrow_cast(column3, 'LargeUtf8') as column3, + arrow_cast(column4, 'LargeUtf8') as column4 +FROM distinct_count_string_table; + +# run through update_batch +query IIII +select count(distinct column1), count(distinct column2), count(distinct column3), count(distinct column4) from distinct_count_long_string_table; +---- +3 3 6 6 + +# run through merge_batch +query IIII rowsort +select count(distinct column1), count(distinct column2), count(distinct column3), count(distinct column4) from distinct_count_long_string_table group by column1; +---- +1 1 1 1 +1 1 2 2 +1 1 3 3 + +statement ok +drop table distinct_count_long_string_table; + + +# test with binary strings as well +statement ok +create table distinct_count_binary_table as +SELECT column1, + arrow_cast(column2, 'Binary') as column2, + arrow_cast(column3, 'Binary') as column3, + arrow_cast(column4, 'Binary') as column4 +FROM distinct_count_string_table; + +# run through update_batch +query IIII +select count(distinct column1), count(distinct column2), count(distinct column3), count(distinct column4) from distinct_count_binary_table; +---- +3 3 6 
6 + +# run through merge_batch +query IIII rowsort +select count(distinct column1), count(distinct column2), count(distinct column3), count(distinct column4) from distinct_count_binary_table group by column1; +---- +1 1 1 1 +1 1 2 2 +1 1 3 3 + +statement ok +drop table distinct_count_binary_table; + + +# test with large binary strings as well +statement ok +create table distinct_count_large_binary_table as +SELECT column1, + arrow_cast(column2, 'LargeBinary') as column2, + arrow_cast(column3, 'LargeBinary') as column3, + arrow_cast(column4, 'LargeBinary') as column4 +FROM distinct_count_string_table; + +# run through update_batch +query IIII +select count(distinct column1), count(distinct column2), count(distinct column3), count(distinct column4) from distinct_count_large_binary_table; +---- +3 3 6 6 + +# run through merge_batch +query IIII rowsort +select count(distinct column1), count(distinct column2), count(distinct column3), count(distinct column4) from distinct_count_large_binary_table group by column1; +---- +1 1 1 1 +1 1 2 2 +1 1 3 3 + +statement ok +drop table distinct_count_large_binary_table; + + + +## Cleanup from distinct count tests +statement ok +drop table distinct_count_string_table; + + + +# rule `aggregate_statistics` should not optimize MIN/MAX to wrong values on empty relation + +statement ok +CREATE TABLE empty(col0 INTEGER); + +query I +SELECT MIN(col0) FROM empty WHERE col0=1; +---- +NULL + +query I +SELECT MAX(col0) FROM empty WHERE col0=1; +---- +NULL + +query TT +EXPLAIN SELECT MIN(col0) FROM empty; +---- +logical_plan +01)Aggregate: groupBy=[[]], aggr=[[min(empty.col0)]] +02)--TableScan: empty projection=[col0] +physical_plan +01)ProjectionExec: expr=[NULL as min(empty.col0)] +02)--PlaceholderRowExec + +query TT +EXPLAIN SELECT MAX(col0) FROM empty; +---- +logical_plan +01)Aggregate: groupBy=[[]], aggr=[[max(empty.col0)]] +02)--TableScan: empty projection=[col0] +physical_plan +01)ProjectionExec: expr=[NULL as max(empty.col0)] +02)--PlaceholderRowExec + +statement ok +DROP TABLE empty; + +# verify count aggregate function should not be nullable +statement ok +create table empty; + +query I +select distinct count() from empty; +---- +0 + +statement ok +DROP TABLE empty; + +statement ok +CREATE TABLE t(col0 INTEGER) as VALUES(2); + +query I +SELECT MIN(col0) FROM t WHERE col0=1; +---- +NULL + +query I +SELECT MAX(col0) FROM t WHERE col0=1; +---- +NULL + +statement ok +DROP TABLE t; + + +# Test for the case when the column name is ambiguous +statement ok +CREATE TABLE t(a BIGINT) AS VALUES(1), (2), (3); + +# The column name referenced by GROUP-BY is ambiguous, prefer the column in base plan +query I +SELECT 0 as "t.a" FROM t GROUP BY t.a; +---- +0 +0 +0 + +# The column name referenced by HAVING is ambiguous, prefer the column in the base plan +query I +SELECT 0 AS "t.a" FROM t HAVING MAX(t.a) = 0; +---- + +# Test issue: https://github.com/apache/datafusion/issues/9161 +query I rowsort +SELECT CAST(a AS INT) FROM t GROUP BY t.a; +---- +1 +2 +3 + +statement ok +DROP TABLE t; + +# Test for ignore null in FIRST_VALUE +statement ok +CREATE TABLE t AS VALUES (null::bigint), (3), (4); + +query I +SELECT FIRST_VALUE(column1) FROM t; +---- +NULL + +query I +SELECT FIRST_VALUE(column1) RESPECT NULLS FROM t; +---- +NULL + +query I +SELECT FIRST_VALUE(column1) IGNORE NULLS FROM t; +---- +3 + +statement ok +DROP TABLE t; + +# Test for ignore null with ORDER BY in FIRST_VALUE +statement ok +CREATE TABLE t AS VALUES (3, 4), (4, 3), (null::bigint, 1), (null::bigint, 1); + +query I 
+SELECT column1 FROM t ORDER BY column2; +---- +NULL +NULL +4 +3 + +query I +SELECT FIRST_VALUE(column1 ORDER BY column2) FROM t; +---- +NULL + +query I +SELECT FIRST_VALUE(column1 ORDER BY column2) RESPECT NULLS FROM t; +---- +NULL + +query I +SELECT FIRST_VALUE(column1 ORDER BY column2) IGNORE NULLS FROM t; +---- +4 + +statement ok +DROP TABLE t; + +# Test for ignore null in LAST_VALUE +statement ok +CREATE TABLE t AS VALUES (3), (4), (null::bigint); + +query I +SELECT LAST_VALUE(column1) FROM t; +---- +NULL + +query I +SELECT LAST_VALUE(column1) RESPECT NULLS FROM t; +---- +NULL + +query I +SELECT LAST_VALUE(column1) IGNORE NULLS FROM t; +---- +4 + +statement ok +DROP TABLE t; + +# Test for ignore null with ORDER BY in LAST_VALUE +statement ok +CREATE TABLE t AS VALUES (3, 3), (4, 4), (null::bigint, 1), (null::bigint, 2); + +query I +SELECT column1 FROM t ORDER BY column2 DESC; +---- +4 +3 +NULL +NULL + +query I +SELECT LAST_VALUE(column1 ORDER BY column2 DESC) FROM t; +---- +NULL + +query I +SELECT LAST_VALUE(column1 ORDER BY column2 DESC) RESPECT NULLS FROM t; +---- +NULL + +query I +SELECT LAST_VALUE(column1 ORDER BY column2 DESC) IGNORE NULLS FROM t; +---- +3 + +statement ok +DROP TABLE t; + +# Test for CASE with NULL in aggregate function +statement ok +CREATE TABLE example(data double precision); + +statement ok +INSERT INTO example VALUES (1), (2), (NULL), (4); + +query RR +SELECT + sum(CASE WHEN data is NULL THEN NULL ELSE data+1 END) as then_null, + sum(CASE WHEN data is NULL THEN data+1 ELSE NULL END) as else_null +FROM example; +---- +10 NULL + +query R +SELECT + CASE data WHEN 1 THEN NULL WHEN 2 THEN 3.3 ELSE NULL END as case_null +FROM example; +---- +NULL +3.3 +NULL +NULL + +statement ok +drop table example; + +# Test Convert FirstLast optimizer rule +statement ok +CREATE EXTERNAL TABLE convert_first_last_table ( +c1 INT NOT NULL, +c2 INT NOT NULL, +c3 INT NOT NULL +) +STORED AS CSV +WITH ORDER (c1 ASC) +WITH ORDER (c2 DESC) +WITH ORDER (c3 ASC) +LOCATION '../core/tests/data/convert_first_last.csv' +OPTIONS ('format.has_header' 'true'); + +# test first to last, the result does not show difference, we need to check the conversion by `explain` +query TT +explain select first_value(c1 order by c3 desc) from convert_first_last_table; +---- +logical_plan +01)Aggregate: groupBy=[[]], aggr=[[first_value(convert_first_last_table.c1) ORDER BY [convert_first_last_table.c3 DESC NULLS FIRST]]] +02)--TableScan: convert_first_last_table projection=[c1, c3] +physical_plan +01)AggregateExec: mode=Final, gby=[], aggr=[first_value(convert_first_last_table.c1) ORDER BY [convert_first_last_table.c3 DESC NULLS FIRST]] +02)--CoalescePartitionsExec +03)----AggregateExec: mode=Partial, gby=[], aggr=[last_value(convert_first_last_table.c1) ORDER BY [convert_first_last_table.c3 ASC NULLS LAST]] +04)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +05)--------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/convert_first_last.csv]]}, projection=[c1, c3], output_orderings=[[c1@0 ASC NULLS LAST], [c3@1 ASC NULLS LAST]], has_header=true + +# test last to first +query TT +explain select last_value(c1 order by c2 asc) from convert_first_last_table; +---- +logical_plan +01)Aggregate: groupBy=[[]], aggr=[[last_value(convert_first_last_table.c1) ORDER BY [convert_first_last_table.c2 ASC NULLS LAST]]] +02)--TableScan: convert_first_last_table projection=[c1, c2] +physical_plan +01)AggregateExec: mode=Final, gby=[], 
aggr=[last_value(convert_first_last_table.c1) ORDER BY [convert_first_last_table.c2 ASC NULLS LAST]]
+02)--CoalescePartitionsExec
+03)----AggregateExec: mode=Partial, gby=[], aggr=[first_value(convert_first_last_table.c1) ORDER BY [convert_first_last_table.c2 DESC NULLS FIRST]]
+04)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+05)--------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/convert_first_last.csv]]}, projection=[c1, c2], output_orderings=[[c1@0 ASC NULLS LAST], [c2@1 DESC]], has_header=true
+
+# test building a plan with aggregate sum
+
+statement ok
+create table employee_csv(id int, first_name string, last_name varchar, state varchar, salary bigint) as values (1, 'jenson', 'huang', 'unemployed', 10);
+
+query TI
+select state, sum(salary) total_salary from employee_csv group by state;
+----
+unemployed 10
+
+statement ok
+set datafusion.explain.logical_plan_only = true;
+
+query TT
+explain select state, sum(salary) as total_salary from employee_csv group by state;
+----
+logical_plan
+01)Projection: employee_csv.state, sum(employee_csv.salary) AS total_salary
+02)--Aggregate: groupBy=[[employee_csv.state]], aggr=[[sum(employee_csv.salary)]]
+03)----TableScan: employee_csv projection=[state, salary]
+
+# fail if there is a duplicate name
+query error DataFusion error: Schema error: Schema contains qualified field name employee_csv\.state and unqualified field name state which would be ambiguous
+select state, sum(salary) as state from employee_csv group by state;
+
+statement ok
+set datafusion.explain.logical_plan_only = false;
+
+statement ok
+drop table employee_csv;
+
+# test null literal handling in supported aggregate functions
+query I??III?T
+select count(null), min(null), max(null), bit_and(NULL), bit_or(NULL), bit_xor(NULL), nth_value(NULL, 1), string_agg(NULL, ',');
+----
+0 NULL NULL NULL NULL NULL NULL NULL
+
+statement ok
+create table having_test(v1 int, v2 int)
+
+statement ok
+create table join_table(v1 int, v2 int)
+
+statement ok
+insert into having_test values (1, 2), (2, 3), (3, 4)
+
+statement ok
+insert into join_table values (1, 2), (2, 3), (3, 4)
+
+
+query II
+select * from having_test group by v1, v2 having max(v1) = 3
+----
+3 4
+
+query TT
+EXPLAIN select * from having_test group by v1, v2 having max(v1) = 3
+----
+logical_plan
+01)Projection: having_test.v1, having_test.v2
+02)--Filter: max(having_test.v1) = Int32(3)
+03)----Aggregate: groupBy=[[having_test.v1, having_test.v2]], aggr=[[max(having_test.v1)]]
+04)------TableScan: having_test projection=[v1, v2]
+physical_plan
+01)CoalesceBatchesExec: target_batch_size=8192
+02)--FilterExec: max(having_test.v1)@2 = 3, projection=[v1@0, v2@1]
+03)----AggregateExec: mode=FinalPartitioned, gby=[v1@0 as v1, v2@1 as v2], aggr=[max(having_test.v1)]
+04)------CoalesceBatchesExec: target_batch_size=8192
+05)--------RepartitionExec: partitioning=Hash([v1@0, v2@1], 4), input_partitions=4
+06)----------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+07)------------AggregateExec: mode=Partial, gby=[v1@0 as v1, v2@1 as v2], aggr=[max(having_test.v1)]
+08)--------------MemoryExec: partitions=1, partition_sizes=[1]
+
+
+query error
+select * from having_test having max(v1) = 3
+
+query I
+select max(v1) from having_test having max(v1) = 3
+----
+3
+
+query I
+select max(v1), * exclude (v1, v2) from having_test having max(v1) = 3
+----
+3
+
+# v1 and v2 are excluded above because, without a group by clause, selecting them alongside max(v1) would be invalid
+query III
+select max(v1), * 
replace ('v1' as v3) from having_test group by v1, v2 having max(v1) = 3 +---- +3 3 4 + +query III +select max(v1), t.* from having_test t group by v1, v2 having max(v1) = 3 +---- +3 3 4 + +# j.* should also be included in the group-by clause +query error +select max(t.v1), j.* from having_test t join join_table j on t.v1 = j.v1 group by t.v1, t.v2 having max(t.v1) = 3 + +query III +select max(t.v1), j.* from having_test t join join_table j on t.v1 = j.v1 group by j.v1, j.v2 having max(t.v1) = 3 +---- +3 3 4 + +# If the select items only contain scalar expressions, the having clause is valid. +query P +select now() from having_test having max(v1) = 4 +---- + +# If the select items only contain scalar expressions, the having clause is valid. +query I +select 0 from having_test having max(v1) = 4 +---- + +# v2 should also be included in group-by clause +query error +select * from having_test group by v1 having max(v1) = 3 + +statement ok +drop table having_test + +statement ok +drop table join_table + +# test min/max Float16 without group expression +query RRTT +WITH data AS ( + SELECT arrow_cast(1, 'Float16') AS f + UNION ALL + SELECT arrow_cast(6, 'Float16') AS f +) +SELECT MIN(f), MAX(f), arrow_typeof(MIN(f)), arrow_typeof(MAX(f)) FROM data; +---- +1 6 Float16 Float16 + +# test min/max Float16 with group expression +query IRRTT +WITH data AS ( + SELECT 1 as k, arrow_cast(1.8125, 'Float16') AS f + UNION ALL + SELECT 1 as k, arrow_cast(6.8007813, 'Float16') AS f + UNION ALL + SELECT 2 AS k, arrow_cast(8.5, 'Float16') AS f +) +SELECT k, MIN(f), MAX(f), arrow_typeof(MIN(f)), arrow_typeof(MAX(f)) +FROM data +GROUP BY k +ORDER BY k; +---- +1 1.8125 6.8007813 Float16 Float16 +2 8.5 8.5 Float16 Float16 + +statement ok +CREATE TABLE t1(v1 int); + +# issue: https://github.com/apache/datafusion/issues/12814 +statement error DataFusion error: Error during planning: Aggregate functions are not allowed in the WHERE clause. 
Consider using HAVING instead
+SELECT v1 FROM t1 WHERE ((count(v1) % 1) << 1) > 0;
+
+statement ok
+DROP TABLE t1;
+
+# Test the last_value function with merge_batch
+query II
+with A as (
+    select 1 as id, 10 as foo
+    UNION ALL
+    select 1, 10
+    UNION ALL
+    select 1, 10
+    UNION ALL
+    select 1, 10
+    UNION ALL
+    select 1, 10
+    ---- The order is non-deterministic, so all rows share the same value to keep the result stable
+) select last_value(a.foo), sum(distinct 1) from A a group by a.id;
+----
+10 1
+
+# The plan aggregates in Partial and then FinalPartitioned mode, so the final stage merges partial states via `merge_batch`
+# If the plan changes, verify that `merge_batch` is still exercised to preserve the test coverage
+query TT
+explain with A as (
+    select 1 as id, 2 as foo
+    UNION ALL
+    select 1, 4
+    UNION ALL
+    select 1, 5
+    UNION ALL
+    select 1, 3
+    UNION ALL
+    select 1, 2
+) select last_value(a.foo order by a.foo), sum(distinct 1) from A a group by a.id;
+----
+logical_plan
+01)Projection: last_value(a.foo) ORDER BY [a.foo ASC NULLS LAST], sum(DISTINCT Int64(1))
+02)--Aggregate: groupBy=[[a.id]], aggr=[[last_value(a.foo) ORDER BY [a.foo ASC NULLS LAST], sum(DISTINCT Int64(1))]]
+03)----SubqueryAlias: a
+04)------SubqueryAlias: a
+05)--------Union
+06)----------Projection: Int64(1) AS id, Int64(2) AS foo
+07)------------EmptyRelation
+08)----------Projection: Int64(1) AS id, Int64(4) AS foo
+09)------------EmptyRelation
+10)----------Projection: Int64(1) AS id, Int64(5) AS foo
+11)------------EmptyRelation
+12)----------Projection: Int64(1) AS id, Int64(3) AS foo
+13)------------EmptyRelation
+14)----------Projection: Int64(1) AS id, Int64(2) AS foo
+15)------------EmptyRelation
+physical_plan
+01)ProjectionExec: expr=[last_value(a.foo) ORDER BY [a.foo ASC NULLS LAST]@1 as last_value(a.foo) ORDER BY [a.foo ASC NULLS LAST], sum(DISTINCT Int64(1))@2 as sum(DISTINCT Int64(1))]
+02)--AggregateExec: mode=FinalPartitioned, gby=[id@0 as id], aggr=[last_value(a.foo) ORDER BY [a.foo ASC NULLS LAST], sum(DISTINCT Int64(1))], ordering_mode=Sorted
+03)----CoalesceBatchesExec: target_batch_size=8192
+04)------RepartitionExec: partitioning=Hash([id@0], 4), input_partitions=5
+05)--------AggregateExec: mode=Partial, gby=[id@0 as id], aggr=[last_value(a.foo) ORDER BY [a.foo ASC NULLS LAST], sum(DISTINCT Int64(1))], ordering_mode=Sorted
+06)----------UnionExec
+07)------------ProjectionExec: expr=[1 as id, 2 as foo]
+08)--------------PlaceholderRowExec
+09)------------ProjectionExec: expr=[1 as id, 4 as foo]
+10)--------------PlaceholderRowExec
+11)------------ProjectionExec: expr=[1 as id, 5 as foo]
+12)--------------PlaceholderRowExec
+13)------------ProjectionExec: expr=[1 as id, 3 as foo]
+14)--------------PlaceholderRowExec
+15)------------ProjectionExec: expr=[1 as id, 2 as foo]
+16)--------------PlaceholderRowExec
diff --git a/datafusion/sqllogictest/test_files/aggregate/init.slt.part b/datafusion/sqllogictest/test_files/aggregate/init.slt.part
new file mode 100644
index 000000000000..be70d18ac8fd
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/aggregate/init.slt.part
@@ -0,0 +1,68 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. 
You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +####### +# Setup test data table +####### +statement ok +CREATE EXTERNAL TABLE aggregate_test_100 ( + c1 VARCHAR NOT NULL, + c2 TINYINT NOT NULL, + c3 SMALLINT NOT NULL, + c4 SMALLINT, + c5 INT, + c6 BIGINT NOT NULL, + c7 SMALLINT NOT NULL, + c8 INT NOT NULL, + c9 INT UNSIGNED NOT NULL, + c10 BIGINT UNSIGNED NOT NULL, + c11 FLOAT NOT NULL, + c12 DOUBLE NOT NULL, + c13 VARCHAR NOT NULL +) +STORED AS CSV +LOCATION '../../testing/data/csv/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); + +statement ok +CREATE TABLE d_table (c1 decimal(10,3), c2 varchar) +as values +(110.000, 'A'), (110.001, 'A'), (110.002, 'A'), (110.003, 'A'), (110.004, 'A'), (110.005, 'A'), (110.006, 'A'), (110.007, 'A'), (110.008, 'A'), (110.009, 'A'), +(-100.000, 'B'),(-100.001, 'B'),(-100.002, 'B'),(-100.003, 'B'),(-100.004, 'B'),(-100.005, 'B'),(-100.006, 'B'),(-100.007, 'B'),(-100.008, 'B'),(-100.009, 'B') + +statement ok +CREATE TABLE median_table ( + col_i8 TINYINT, + col_i16 SMALLINT, + col_i32 INT, + col_i64 BIGINT, + col_u8 TINYINT UNSIGNED, + col_u16 SMALLINT UNSIGNED, + col_u32 INT UNSIGNED, + col_u64 BIGINT UNSIGNED, + col_f32 FLOAT, + col_f64 DOUBLE, + col_f64_nan DOUBLE +) as VALUES +( -128, -32768, -2147483648, arrow_cast(-9223372036854775808,'Int64'), 0, 0, 0, arrow_cast(0,'UInt64'), 1.1, 1.1, 1.1 ), +( -128, -32768, -2147483648, arrow_cast(-9223372036854775808,'Int64'), 0, 0, 0, arrow_cast(0,'UInt64'), 4.4, 4.4, arrow_cast('NAN','Float64') ), +( 100, 100, 100, arrow_cast(100,'Int64'), 100,100,100, arrow_cast(100,'UInt64'), 3.3, 3.3, arrow_cast('NAN','Float64') ), +( 127, 32767, 2147483647, arrow_cast(9223372036854775807,'Int64'), 255, 65535, 4294967295, 18446744073709551615, 2.2, 2.2, arrow_cast('NAN','Float64') ) + +statement ok +CREATE TABLE test (c1 BIGINT,c2 BIGINT) as values +(0,null), (1,1), (null,1), (3,2), (3,2)
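+
+# The tables above (aggregate_test_100, d_table, median_table, test) are shared
+# fixtures for the aggregate .slt files in this directory, which are expected
+# to pull this init file in (presumably via the runner's include mechanism)
+# before defining their own tables.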