From 91951855566a75447f0866b170755b677ed54a7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20L=C3=B3pez?= Date: Sat, 10 Jan 2026 23:14:41 +0100 Subject: [PATCH 1/3] fix: preserve IGNORE NULLS in array_agg with DISTINCT --- .../optimizer/src/single_distinct_to_groupby.rs | 12 ++++++++---- datafusion/sqllogictest/test_files/aggregate.slt | 7 +++++++ 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/datafusion/optimizer/src/single_distinct_to_groupby.rs b/datafusion/optimizer/src/single_distinct_to_groupby.rs index 05edd230daccb..e2d95773c6e79 100644 --- a/datafusion/optimizer/src/single_distinct_to_groupby.rs +++ b/datafusion/optimizer/src/single_distinct_to_groupby.rs @@ -184,7 +184,11 @@ impl OptimizerRule for SingleDistinctToGroupBy { func, params: AggregateFunctionParams { - mut args, distinct, .. + mut args, + distinct, + filter, + order_by, + null_treatment, }, }) => { if distinct { @@ -204,9 +208,9 @@ impl OptimizerRule for SingleDistinctToGroupBy { func, vec![col(SINGLE_DISTINCT_ALIAS)], false, // intentional to remove distinct here - None, - vec![], - None, + filter, + order_by, + null_treatment, ))) // if the aggregate function is not distinct, we need to rewrite it like two phase aggregation } else { diff --git a/datafusion/sqllogictest/test_files/aggregate.slt b/datafusion/sqllogictest/test_files/aggregate.slt index 3c962a0f87f36..13cdc70c15d3e 100644 --- a/datafusion/sqllogictest/test_files/aggregate.slt +++ b/datafusion/sqllogictest/test_files/aggregate.slt @@ -379,6 +379,13 @@ select array_sort(c1), array_sort(c2) from ( statement ok drop table array_agg_distinct_list_table; +# Test array_agg with DISTINCT and IGNORE NULLS (regression test for issue #19735) +query ? 
+SELECT array_sort(ARRAY_AGG(DISTINCT x IGNORE NULLS)) as result +FROM (VALUES (1), (2), (NULL), (2), (NULL), (1)) AS t(x); +---- +[1, 2] + statement error This feature is not implemented: Calling array_agg: LIMIT not supported in function arguments: 1 SELECT array_agg(c13 LIMIT 1) FROM aggregate_test_100 From dcb05954015cd83ac5dd69dbbf43f058f93ba99c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20L=C3=B3pez?= Date: Sun, 11 Jan 2026 10:30:11 +0100 Subject: [PATCH 2/3] fix: keep filter, order_by and null_treatment on non-distinct aggregates --- .../optimizer/src/single_distinct_to_groupby.rs | 6 +++--- .../sqllogictest/test_files/aggregate.slt | 17 +++++++++++++++++ 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/datafusion/optimizer/src/single_distinct_to_groupby.rs b/datafusion/optimizer/src/single_distinct_to_groupby.rs index e2d95773c6e79..00c8fab228117 100644 --- a/datafusion/optimizer/src/single_distinct_to_groupby.rs +++ b/datafusion/optimizer/src/single_distinct_to_groupby.rs @@ -221,9 +221,9 @@ impl OptimizerRule for SingleDistinctToGroupBy { Arc::clone(&func), args, false, - None, - vec![], - None, + filter, + order_by, + null_treatment, )) .alias(&alias_str), ); diff --git a/datafusion/sqllogictest/test_files/aggregate.slt b/datafusion/sqllogictest/test_files/aggregate.slt index 13cdc70c15d3e..333b21bcbddd9 100644 --- a/datafusion/sqllogictest/test_files/aggregate.slt +++ b/datafusion/sqllogictest/test_files/aggregate.slt @@ -386,6 +386,23 @@ FROM (VALUES (1), (2), (NULL), (2), (NULL), (1)) AS t(x); ---- [1, 2] +# Test that non-DISTINCT aggregates also preserve IGNORE NULLS when mixed with DISTINCT +# This tests the two-phase aggregation rewrite in SingleDistinctToGroupBy +query I? 
+SELECT + COUNT(DISTINCT x) as distinct_count, + array_sort(ARRAY_AGG(y IGNORE NULLS)) as y_agg +FROM (VALUES + (1, 10), + (1, 20), + (2, 30), + (3, NULL), + (3, 40), + (NULL, 50) +) AS t(x, y) +---- +3 [10, 20, 30, 40, 50] + statement error This feature is not implemented: Calling array_agg: LIMIT not supported in function arguments: 1 SELECT array_agg(c13 LIMIT 1) FROM aggregate_test_100 From cc4fbe4fd6ed4259be8aa6cc8a23cdf649bb481b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20L=C3=B3pez?= Date: Sun, 11 Jan 2026 14:01:27 +0100 Subject: [PATCH 3/3] test: cover FILTER and ORDER BY in two-phase aggregation rewrite --- .../sqllogictest/test_files/aggregate.slt | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/datafusion/sqllogictest/test_files/aggregate.slt b/datafusion/sqllogictest/test_files/aggregate.slt index 333b21bcbddd9..cff451644d16f 100644 --- a/datafusion/sqllogictest/test_files/aggregate.slt +++ b/datafusion/sqllogictest/test_files/aggregate.slt @@ -403,6 +403,35 @@ FROM (VALUES ---- 3 [10, 20, 30, 40, 50] +# Test that FILTER clause is preserved in two-phase aggregation rewrite +query II +SELECT + COUNT(DISTINCT x) as distinct_count, + SUM(y) FILTER (WHERE y > 15) as filtered_sum +FROM (VALUES + (1, 10), + (1, 20), + (2, 5), + (2, 30), + (3, 25) +) AS t(x, y) +---- +3 75 + +# Test that ORDER BY is preserved in two-phase aggregation rewrite +query I? +SELECT + COUNT(DISTINCT x) as distinct_count, + ARRAY_AGG(y ORDER BY y DESC) as ordered_agg +FROM (VALUES + (1, 10), + (1, 30), + (2, 20), + (2, 40) +) AS t(x, y) +---- +2 [40, 30, 20, 10] + statement error This feature is not implemented: Calling array_agg: LIMIT not supported in function arguments: 1 SELECT array_agg(c13 LIMIT 1) FROM aggregate_test_100