From 0a2ec540eeaa80cc869923cd9a3d1fa0bacc0b10 Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Tue, 24 Feb 2026 19:48:23 -0500 Subject: [PATCH 01/10] fix: Fix `array_to_string` with columnar third arg --- datafusion/functions-nested/src/string.rs | 49 +++++++++----------- datafusion/sqllogictest/test_files/array.slt | 17 +++++++ 2 files changed, 39 insertions(+), 27 deletions(-) diff --git a/datafusion/functions-nested/src/string.rs b/datafusion/functions-nested/src/string.rs index 1c8d58fca80d0..c296f1969e258 100644 --- a/datafusion/functions-nested/src/string.rs +++ b/datafusion/functions-nested/src/string.rs @@ -347,21 +347,20 @@ fn array_to_string_inner(args: &[ArrayRef]) -> Result { } }; - let mut null_string = String::from(""); - let mut with_null_string = false; - if args.len() == 3 { - null_string = match args[2].data_type() { - Utf8 => args[2].as_string::().value(0).to_string(), - Utf8View => args[2].as_string_view().value(0).to_string(), - LargeUtf8 => args[2].as_string::().value(0).to_string(), + let null_strings = if args.len() == 3 { + Some(match args[2].data_type() { + Utf8 => args[2].as_string::().iter().collect(), + Utf8View => args[2].as_string_view().iter().collect(), + LargeUtf8 => args[2].as_string::().iter().collect(), other => { return exec_err!( - "unsupported type for second argument to array_to_string function as {other:?}" + "unsupported type for third argument to array_to_string function as {other:?}" ); } - }; - with_null_string = true; - } + }) + } else { + None + }; /// Creates a single string from single element of a ListArray (which is /// itself another Array) @@ -469,18 +468,24 @@ fn array_to_string_inner(args: &[ArrayRef]) -> Result { fn generate_string_array( list_arr: &GenericListArray, delimiters: &[Option<&str>], - null_string: &str, - with_null_string: bool, + null_strings: &Option>>, ) -> Result { let mut res: Vec> = Vec::new(); - for (arr, &delimiter) in list_arr.iter().zip(delimiters.iter()) { + for (i, (arr, &delimiter)) in list_arr.iter().zip(delimiters.iter()).enumerate() { if let (Some(arr), Some(delimiter)) = (arr, delimiter) { + let (null_string, with_null_string) = match null_strings { + Some(ns) => match ns[i] { + Some(s) => (s.to_string(), true), + None => (String::new(), false), + }, + None => (String::new(), false), + }; let mut arg = String::from(""); let s = compute_array_to_string( &mut arg, &arr, delimiter.to_string(), - null_string.to_string(), + null_string, with_null_string, )? .clone(); @@ -501,21 +506,11 @@ fn array_to_string_inner(args: &[ArrayRef]) -> Result { let string_arr = match arr.data_type() { List(_) => { let list_array = as_list_array(&arr)?; - generate_string_array::( - list_array, - &delimiters, - &null_string, - with_null_string, - )? + generate_string_array::(list_array, &delimiters, &null_strings)? } LargeList(_) => { let list_array = as_large_list_array(&arr)?; - generate_string_array::( - list_array, - &delimiters, - &null_string, - with_null_string, - )? + generate_string_array::(list_array, &delimiters, &null_strings)? } // Signature guards against this arm _ => return exec_err!("array_to_string expects list as first argument"), diff --git a/datafusion/sqllogictest/test_files/array.slt b/datafusion/sqllogictest/test_files/array.slt index 8eb351bb7706d..0ec2a187540e7 100644 --- a/datafusion/sqllogictest/test_files/array.slt +++ b/datafusion/sqllogictest/test_files/array.slt @@ -5143,6 +5143,23 @@ NULL 1.2.3 51_52_*_54_55_56_57_58_59_60 1.2.3 61_62_63_64_65_66_67_68_69_70 1.2.3 +# array_to_string with per-row null_string column +statement ok +CREATE TABLE test_null_str_col AS VALUES + (make_array(1, NULL, 3), ',', 'N/A'), + (make_array(NULL, 5, NULL), ',', 'MISSING'), + (make_array(10, NULL, 12), '-', 'X'); + +query T +SELECT array_to_string(column1, column2, column3) FROM test_null_str_col; +---- +1,N/A,3 +MISSING,5,MISSING +10-X-12 + +statement ok +DROP TABLE test_null_str_col; + ## cardinality # cardinality scalar function From 7c2175267503406739d9a18a5eaa5b2d196bdc63 Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Wed, 25 Feb 2026 09:28:11 -0500 Subject: [PATCH 02/10] Add benchmark for array_to_string --- datafusion/functions-nested/Cargo.toml | 4 + .../benches/array_to_string.rs | 161 ++++++++++++++++++ 2 files changed, 165 insertions(+) create mode 100644 datafusion/functions-nested/benches/array_to_string.rs diff --git a/datafusion/functions-nested/Cargo.toml b/datafusion/functions-nested/Cargo.toml index d885a2ca96dac..98c0e0a5abb21 100644 --- a/datafusion/functions-nested/Cargo.toml +++ b/datafusion/functions-nested/Cargo.toml @@ -97,3 +97,7 @@ name = "array_repeat" [[bench]] harness = false name = "array_set_ops" + +[[bench]] +harness = false +name = "array_to_string" diff --git a/datafusion/functions-nested/benches/array_to_string.rs b/datafusion/functions-nested/benches/array_to_string.rs new file mode 100644 index 0000000000000..3896b87bec48a --- /dev/null +++ b/datafusion/functions-nested/benches/array_to_string.rs @@ -0,0 +1,161 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow::array::{ArrayRef, Int64Array, ListArray, StringArray}; +use arrow::buffer::OffsetBuffer; +use arrow::datatypes::{DataType, Field}; +use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main}; +use datafusion_common::ScalarValue; +use datafusion_common::config::ConfigOptions; +use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl}; +use datafusion_functions_nested::string::ArrayToString; +use rand::rngs::StdRng; +use rand::{Rng, SeedableRng}; +use std::hint::black_box; +use std::sync::Arc; + +const NUM_ROWS: usize = 1000; +const ARRAY_SIZES: &[usize] = &[5, 20, 100]; +const NESTED_ARRAY_SIZE: usize = 3; +const SEED: u64 = 42; +const NULL_DENSITY: f64 = 0.1; + +fn criterion_benchmark(c: &mut Criterion) { + bench_array_to_string(c, "array_to_string_int64", create_int64_list_array); + bench_array_to_string(c, "array_to_string_string", create_string_list_array); + bench_array_to_string( + c, + "array_to_string_nested_int64", + create_nested_int64_list_array, + ); +} + +fn bench_array_to_string( + c: &mut Criterion, + group_name: &str, + make_array: impl Fn(usize) -> ArrayRef, +) { + let mut group = c.benchmark_group(group_name); + + for &array_size in ARRAY_SIZES { + let list_array = make_array(array_size); + let args = vec![ + ColumnarValue::Array(list_array.clone()), + ColumnarValue::Scalar(ScalarValue::Utf8(Some(",".to_string()))), + ]; + let arg_fields = vec![ + Field::new("array", list_array.data_type().clone(), true).into(), + Field::new("delimiter", DataType::Utf8, false).into(), + ]; + + group.bench_with_input( + BenchmarkId::from_parameter(array_size), + &array_size, + |b, _| { + let udf = ArrayToString::new(); + b.iter(|| { + black_box( + udf.invoke_with_args(ScalarFunctionArgs { + args: args.clone(), + arg_fields: arg_fields.clone(), + number_rows: NUM_ROWS, + return_field: Field::new("result", DataType::Utf8, true) + .into(), + config_options: Arc::new(ConfigOptions::default()), + }) + .unwrap(), + ) + }) + }, + ); + } + + group.finish(); +} + +fn create_int64_list_array(array_size: usize) -> ArrayRef { + let mut rng = StdRng::seed_from_u64(SEED); + let values = (0..NUM_ROWS * array_size) + .map(|_| { + if rng.random::() < NULL_DENSITY { + None + } else { + Some(rng.random_range(0..1000)) + } + }) + .collect::(); + let offsets = (0..=NUM_ROWS) + .map(|i| (i * array_size) as i32) + .collect::>(); + + Arc::new( + ListArray::try_new( + Arc::new(Field::new("item", DataType::Int64, true)), + OffsetBuffer::new(offsets.into()), + Arc::new(values), + None, + ) + .unwrap(), + ) +} + +fn create_nested_int64_list_array(array_size: usize) -> ArrayRef { + let inner = create_int64_list_array(array_size); + let inner_rows = NUM_ROWS; + let outer_rows = inner_rows / NESTED_ARRAY_SIZE; + let offsets = (0..=outer_rows) + .map(|i| (i * NESTED_ARRAY_SIZE) as i32) + .collect::>(); + Arc::new( + ListArray::try_new( + Arc::new(Field::new("item", inner.data_type().clone(), true)), + OffsetBuffer::new(offsets.into()), + inner, + None, + ) + .unwrap(), + ) +} + +fn create_string_list_array(array_size: usize) -> ArrayRef { + let mut rng = StdRng::seed_from_u64(SEED); + let values = (0..NUM_ROWS * array_size) + .map(|_| { + if rng.random::() < NULL_DENSITY { + None + } else { + Some(format!("value_{}", rng.random_range(0..100))) + } + }) + .collect::(); + let offsets = (0..=NUM_ROWS) + .map(|i| (i * array_size) as i32) + .collect::>(); + + Arc::new( + ListArray::try_new( + Arc::new(Field::new("item", DataType::Utf8, true)), + OffsetBuffer::new(offsets.into()), + Arc::new(values), + None, + ) + .unwrap(), + ) +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); From 9a67b2630d55b082d051ddd87f07839288c3bfcc Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Wed, 25 Feb 2026 09:34:21 -0500 Subject: [PATCH 03/10] perf: Optimize array_to_string --- datafusion/functions-nested/src/string.rs | 394 +++++++++--------- datafusion/sqllogictest/test_files/array.slt | 6 + .../source/user-guide/sql/scalar_functions.md | 2 +- 3 files changed, 195 insertions(+), 207 deletions(-) diff --git a/datafusion/functions-nested/src/string.rs b/datafusion/functions-nested/src/string.rs index c296f1969e258..ff3a38e60c748 100644 --- a/datafusion/functions-nested/src/string.rs +++ b/datafusion/functions-nested/src/string.rs @@ -29,6 +29,7 @@ use datafusion_common::utils::ListCoercion; use datafusion_common::{DataFusionError, Result, not_impl_err}; use std::any::Any; +use std::fmt::Write; use crate::utils::make_scalar_function; use arrow::array::{ @@ -54,69 +55,6 @@ use datafusion_functions::downcast_arg; use datafusion_macros::user_doc; use std::sync::Arc; -macro_rules! call_array_function { - ($DATATYPE:expr, false) => { - match $DATATYPE { - DataType::Utf8 => array_function!(StringArray), - DataType::Utf8View => array_function!(StringViewArray), - DataType::LargeUtf8 => array_function!(LargeStringArray), - DataType::Boolean => array_function!(BooleanArray), - DataType::Float32 => array_function!(Float32Array), - DataType::Float64 => array_function!(Float64Array), - DataType::Int8 => array_function!(Int8Array), - DataType::Int16 => array_function!(Int16Array), - DataType::Int32 => array_function!(Int32Array), - DataType::Int64 => array_function!(Int64Array), - DataType::UInt8 => array_function!(UInt8Array), - DataType::UInt16 => array_function!(UInt16Array), - DataType::UInt32 => array_function!(UInt32Array), - DataType::UInt64 => array_function!(UInt64Array), - dt => not_impl_err!("Unsupported data type in array_to_string: {dt}"), - } - }; - ($DATATYPE:expr, $INCLUDE_LIST:expr) => {{ - match $DATATYPE { - DataType::List(_) => array_function!(ListArray), - DataType::Utf8 => array_function!(StringArray), - DataType::Utf8View => array_function!(StringViewArray), - DataType::LargeUtf8 => array_function!(LargeStringArray), - DataType::Boolean => array_function!(BooleanArray), - DataType::Float32 => array_function!(Float32Array), - DataType::Float64 => array_function!(Float64Array), - DataType::Int8 => array_function!(Int8Array), - DataType::Int16 => array_function!(Int16Array), - DataType::Int32 => array_function!(Int32Array), - DataType::Int64 => array_function!(Int64Array), - DataType::UInt8 => array_function!(UInt8Array), - DataType::UInt16 => array_function!(UInt16Array), - DataType::UInt32 => array_function!(UInt32Array), - DataType::UInt64 => array_function!(UInt64Array), - dt => not_impl_err!("Unsupported data type in array_to_string: {dt}"), - } - }}; -} - -macro_rules! to_string { - ($ARG:expr, $ARRAY:expr, $DELIMITER:expr, $NULL_STRING:expr, $WITH_NULL_STRING:expr, $ARRAY_TYPE:ident) => {{ - let arr = downcast_arg!($ARRAY, $ARRAY_TYPE); - for x in arr { - match x { - Some(x) => { - $ARG.push_str(&x.to_string()); - $ARG.push_str($DELIMITER); - } - None => { - if $WITH_NULL_STRING { - $ARG.push_str($NULL_STRING); - $ARG.push_str($DELIMITER); - } - } - } - } - Ok($ARG) - }}; -} - // Create static instances of ScalarUDFs for each function make_udf_expr_and_func!( ArrayToString, @@ -145,7 +83,7 @@ make_udf_expr_and_func!( argument(name = "delimiter", description = "Array element separator."), argument( name = "null_string", - description = "Optional. String to replace null values in the array. If not provided, nulls will be handled by default behavior." + description = "Optional. String to use for null values in the output. If not provided, nulls will be omitted." ) )] #[derive(Debug, PartialEq, Eq, Hash)] @@ -347,8 +285,8 @@ fn array_to_string_inner(args: &[ArrayRef]) -> Result { } }; - let null_strings = if args.len() == 3 { - Some(match args[2].data_type() { + let null_strings: Vec> = if args.len() == 3 { + match args[2].data_type() { Utf8 => args[2].as_string::().iter().collect(), Utf8View => args[2].as_string_view().iter().collect(), LargeUtf8 => args[2].as_string::().iter().collect(), @@ -357,166 +295,210 @@ fn array_to_string_inner(args: &[ArrayRef]) -> Result { "unsupported type for third argument to array_to_string function as {other:?}" ); } - }) + } } else { - None + // If `null_strings` is not specified, we treat it as equivalent to + // explicitly passing a NULL value for `null_strings` in every row. + vec![None; args[0].len()] }; - /// Creates a single string from single element of a ListArray (which is - /// itself another Array) - fn compute_array_to_string<'a>( - arg: &'a mut String, - arr: &ArrayRef, - delimiter: String, - null_string: String, - with_null_string: bool, - ) -> Result<&'a mut String> { - match arr.data_type() { - List(..) => { - let list_array = as_list_array(&arr)?; - for i in 0..list_array.len() { - if !list_array.is_null(i) { - compute_array_to_string( - arg, - &list_array.value(i), - delimiter.clone(), - null_string.clone(), - with_null_string, - )?; - } else if with_null_string { - arg.push_str(&null_string); - arg.push_str(&delimiter); - } - } + let string_arr = match arr.data_type() { + List(_) => { + let list_array = as_list_array(&arr)?; + generate_string_array::(list_array, &delimiters, &null_strings)? + } + LargeList(_) => { + let list_array = as_large_list_array(&arr)?; + generate_string_array::(list_array, &delimiters, &null_strings)? + } + // Signature guards against this arm + _ => return exec_err!("array_to_string expects list as first argument"), + }; - Ok(arg) - } - FixedSizeList(..) => { - let list_array = as_fixed_size_list_array(&arr)?; - - for i in 0..list_array.len() { - if !list_array.is_null(i) { - compute_array_to_string( - arg, - &list_array.value(i), - delimiter.clone(), - null_string.clone(), - with_null_string, - )?; - } else if with_null_string { - arg.push_str(&null_string); - arg.push_str(&delimiter); - } - } + Ok(Arc::new(string_arr)) +} - Ok(arg) - } - LargeList(..) => { - let list_array = as_large_list_array(&arr)?; - for i in 0..list_array.len() { - if !list_array.is_null(i) { - compute_array_to_string( - arg, - &list_array.value(i), - delimiter.clone(), - null_string.clone(), - with_null_string, - )?; - } else if with_null_string { - arg.push_str(&null_string); - arg.push_str(&delimiter); +fn generate_string_array( + list_arr: &GenericListArray, + delimiters: &[Option<&str>], + null_strings: &[Option<&str>], +) -> Result { + let mut builder = StringBuilder::with_capacity(list_arr.len(), 0); + let mut buf = String::new(); + + for ((arr, &delimiter), &null_string) in list_arr + .iter() + .zip(delimiters.iter()) + .zip(null_strings.iter()) + { + let (Some(arr), Some(delimiter)) = (arr, delimiter) else { + builder.append_null(); + continue; + }; + + buf.clear(); + let mut first = true; + compute_array_to_string(&mut buf, &arr, delimiter, null_string, &mut first)?; + builder.append_value(&buf); + } + + Ok(builder.finish()) +} + +fn compute_array_to_string( + buf: &mut String, + arr: &ArrayRef, + delimiter: &str, + null_string: Option<&str>, + first: &mut bool, +) -> Result<()> { + // Handle lists by recursing on each list element. + macro_rules! handle_list { + ($list_array:expr) => { + for i in 0..$list_array.len() { + if !$list_array.is_null(i) { + compute_array_to_string( + buf, + &$list_array.value(i), + delimiter, + null_string, + first, + )?; + } else if let Some(ns) = null_string { + if *first { + *first = false; + } else { + buf.push_str(delimiter); } + buf.push_str(ns); } + } + }; + } - Ok(arg) + match arr.data_type() { + List(..) => { + let list_array = as_list_array(arr)?; + handle_list!(list_array); + Ok(()) + } + FixedSizeList(..) => { + let list_array = as_fixed_size_list_array(arr)?; + handle_list!(list_array); + Ok(()) + } + LargeList(..) => { + let list_array = as_large_list_array(arr)?; + handle_list!(list_array); + Ok(()) + } + Dictionary(_key_type, value_type) => { + // Call cast to unwrap the dictionary. This could be optimized if we wanted + // to accept the overhead of extra code + let values = cast(arr, value_type.as_ref()).map_err(|e| { + DataFusionError::from(e) + .context("Casting dictionary to values in compute_array_to_string") + })?; + compute_array_to_string(buf, &values, delimiter, null_string, first) + } + Null => Ok(()), + data_type => { + macro_rules! leaf { + ($ARRAY_TYPE:ident) => { + write_leaf_to_string( + buf, + downcast_arg!(arr, $ARRAY_TYPE), + delimiter, + null_string, + first, + |buf, x| write!(buf, "{}", x).unwrap(), + ) + }; } - Dictionary(_key_type, value_type) => { - // Call cast to unwrap the dictionary. This could be optimized if we wanted - // to accept the overhead of extra code - let values = cast(&arr, value_type.as_ref()).map_err(|e| { - DataFusionError::from(e).context( - "Casting dictionary to values in compute_array_to_string", + macro_rules! str_leaf { + ($ARRAY_TYPE:ident) => { + write_leaf_to_string( + buf, + downcast_arg!(arr, $ARRAY_TYPE), + delimiter, + null_string, + first, + |buf, x: &str| buf.push_str(x), ) - })?; - compute_array_to_string( - arg, - &values, - delimiter, - null_string, - with_null_string, - ) + }; } - Null => Ok(arg), - data_type => { - macro_rules! array_function { - ($ARRAY_TYPE:ident) => { - to_string!( - arg, - arr, - &delimiter, - &null_string, - with_null_string, - $ARRAY_TYPE - ) - }; + match data_type { + Utf8 => str_leaf!(StringArray), + Utf8View => str_leaf!(StringViewArray), + LargeUtf8 => str_leaf!(LargeStringArray), + DataType::Boolean => leaf!(BooleanArray), + DataType::Float32 => leaf!(Float32Array), + DataType::Float64 => leaf!(Float64Array), + DataType::Int8 => leaf!(Int8Array), + DataType::Int16 => leaf!(Int16Array), + DataType::Int32 => leaf!(Int32Array), + DataType::Int64 => leaf!(Int64Array), + DataType::UInt8 => leaf!(UInt8Array), + DataType::UInt16 => leaf!(UInt16Array), + DataType::UInt32 => leaf!(UInt32Array), + DataType::UInt64 => leaf!(UInt64Array), + DataType::Decimal128(_, _) | DataType::Decimal256(_, _) => { + // Decimal arrays iterate over raw integers, so we need to + // cast to Utf8 to ensure the right display formatting. + let str_arr = cast(arr, &Utf8).map_err(|e| { + DataFusionError::from(e) + .context("Casting decimal to string in array_to_string") + })?; + return compute_array_to_string( + buf, + &str_arr, + delimiter, + null_string, + first, + ); } - call_array_function!(data_type, false) - } - } - } - - fn generate_string_array( - list_arr: &GenericListArray, - delimiters: &[Option<&str>], - null_strings: &Option>>, - ) -> Result { - let mut res: Vec> = Vec::new(); - for (i, (arr, &delimiter)) in list_arr.iter().zip(delimiters.iter()).enumerate() { - if let (Some(arr), Some(delimiter)) = (arr, delimiter) { - let (null_string, with_null_string) = match null_strings { - Some(ns) => match ns[i] { - Some(s) => (s.to_string(), true), - None => (String::new(), false), - }, - None => (String::new(), false), - }; - let mut arg = String::from(""); - let s = compute_array_to_string( - &mut arg, - &arr, - delimiter.to_string(), - null_string, - with_null_string, - )? - .clone(); - - if let Some(s) = s.strip_suffix(delimiter) { - res.push(Some(s.to_string())); - } else { - res.push(Some(s)); + dt => { + return not_impl_err!( + "Unsupported data type in array_to_string: {dt}" + ); } - } else { - res.push(None); } + Ok(()) } - - Ok(StringArray::from(res)) } +} - let string_arr = match arr.data_type() { - List(_) => { - let list_array = as_list_array(&arr)?; - generate_string_array::(list_array, &delimiters, &null_strings)? +/// Appends the string representation of each element in a leaf (non-list) +/// array to `buf`, separated by `delimiter`. Null elements are rendered +/// using `null_string` if provided, or skipped otherwise. The `append` +/// closure controls how each non-null element is written to the buffer. +fn write_leaf_to_string<'a, A, T>( + buf: &mut String, + arr: &'a A, + delimiter: &str, + null_string: Option<&str>, + first: &mut bool, + append: impl Fn(&mut String, T), +) where + &'a A: IntoIterator>, +{ + for x in arr { + // Skip nulls when no null_string is provided + if x.is_none() && null_string.is_none() { + continue; } - LargeList(_) => { - let list_array = as_large_list_array(&arr)?; - generate_string_array::(list_array, &delimiters, &null_strings)? + + if *first { + *first = false; + } else { + buf.push_str(delimiter); } - // Signature guards against this arm - _ => return exec_err!("array_to_string expects list as first argument"), - }; - Ok(Arc::new(string_arr)) + match x { + Some(x) => append(buf, x), + None => buf.push_str(null_string.unwrap()), + } + } } /// String_to_array SQL function diff --git a/datafusion/sqllogictest/test_files/array.slt b/datafusion/sqllogictest/test_files/array.slt index 0ec2a187540e7..7d709416beb81 100644 --- a/datafusion/sqllogictest/test_files/array.slt +++ b/datafusion/sqllogictest/test_files/array.slt @@ -5160,6 +5160,12 @@ MISSING,5,MISSING statement ok DROP TABLE test_null_str_col; +# array_to_string with decimal values +query T +select array_to_string(arrow_cast(make_array(1.5, NULL, 3.14), 'List(Decimal128(10, 2))'), ',', 'N'); +---- +1.50,N,3.14 + ## cardinality # cardinality scalar function diff --git a/docs/source/user-guide/sql/scalar_functions.md b/docs/source/user-guide/sql/scalar_functions.md index de69ece2a8a62..cdd2738167cde 100644 --- a/docs/source/user-guide/sql/scalar_functions.md +++ b/docs/source/user-guide/sql/scalar_functions.md @@ -4212,7 +4212,7 @@ array_to_string(array, delimiter[, null_string]) - **array**: Array expression. Can be a constant, column, or function, and any combination of array operators. - **delimiter**: Array element separator. -- **null_string**: Optional. String to replace null values in the array. If not provided, nulls will be handled by default behavior. +- **null_string**: Optional. String to use for null values in the output. If not provided, nulls will be omitted. #### Example From 449a8a02b2f457df4c7c31c6827042791e396235 Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Wed, 25 Feb 2026 12:44:18 -0500 Subject: [PATCH 04/10] Add an integer fastpath using `itoa` --- Cargo.lock | 1 + Cargo.toml | 1 + datafusion/functions-nested/Cargo.toml | 1 + datafusion/functions-nested/src/string.rs | 31 +++++++++++++++++------ 4 files changed, 26 insertions(+), 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5092a860e3c13..88e48bff05ac1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2302,6 +2302,7 @@ dependencies = [ "datafusion-physical-expr-common", "hashbrown 0.16.1", "itertools 0.14.0", + "itoa", "log", "paste", "rand 0.9.2", diff --git a/Cargo.toml b/Cargo.toml index 44120cfeb2e9b..8371e76dcd734 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -161,6 +161,7 @@ hex = { version = "0.4.3" } indexmap = "2.13.0" insta = { version = "1.46.3", features = ["glob", "filters"] } itertools = "0.14" +itoa = "1.0" liblzma = { version = "0.4.6", features = ["static"] } log = "^0.4" memchr = "2.8.0" diff --git a/datafusion/functions-nested/Cargo.toml b/datafusion/functions-nested/Cargo.toml index 98c0e0a5abb21..374228c48c35b 100644 --- a/datafusion/functions-nested/Cargo.toml +++ b/datafusion/functions-nested/Cargo.toml @@ -59,6 +59,7 @@ datafusion-macros = { workspace = true } datafusion-physical-expr-common = { workspace = true } hashbrown = { workspace = true } itertools = { workspace = true, features = ["use_std"] } +itoa = { workspace = true } log = { workspace = true } paste = { workspace = true } diff --git a/datafusion/functions-nested/src/string.rs b/datafusion/functions-nested/src/string.rs index ff3a38e60c748..08030311f3f39 100644 --- a/datafusion/functions-nested/src/string.rs +++ b/datafusion/functions-nested/src/string.rs @@ -427,6 +427,21 @@ fn compute_array_to_string( ) }; } + macro_rules! int_leaf { + ($ARRAY_TYPE:ident) => { + write_leaf_to_string( + buf, + downcast_arg!(arr, $ARRAY_TYPE), + delimiter, + null_string, + first, + |buf, x| { + let mut itoa_buf = itoa::Buffer::new(); + buf.push_str(itoa_buf.format(x)); + }, + ) + }; + } match data_type { Utf8 => str_leaf!(StringArray), Utf8View => str_leaf!(StringViewArray), @@ -434,14 +449,14 @@ fn compute_array_to_string( DataType::Boolean => leaf!(BooleanArray), DataType::Float32 => leaf!(Float32Array), DataType::Float64 => leaf!(Float64Array), - DataType::Int8 => leaf!(Int8Array), - DataType::Int16 => leaf!(Int16Array), - DataType::Int32 => leaf!(Int32Array), - DataType::Int64 => leaf!(Int64Array), - DataType::UInt8 => leaf!(UInt8Array), - DataType::UInt16 => leaf!(UInt16Array), - DataType::UInt32 => leaf!(UInt32Array), - DataType::UInt64 => leaf!(UInt64Array), + DataType::Int8 => int_leaf!(Int8Array), + DataType::Int16 => int_leaf!(Int16Array), + DataType::Int32 => int_leaf!(Int32Array), + DataType::Int64 => int_leaf!(Int64Array), + DataType::UInt8 => int_leaf!(UInt8Array), + DataType::UInt16 => int_leaf!(UInt16Array), + DataType::UInt32 => int_leaf!(UInt32Array), + DataType::UInt64 => int_leaf!(UInt64Array), DataType::Decimal128(_, _) | DataType::Decimal256(_, _) => { // Decimal arrays iterate over raw integers, so we need to // cast to Utf8 to ensure the right display formatting. From bc6109c10ee3a674935d1710989c3a0fc45c07fa Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Wed, 25 Feb 2026 12:46:51 -0500 Subject: [PATCH 05/10] Add float64 benchmark --- .../benches/array_to_string.rs | 29 ++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/datafusion/functions-nested/benches/array_to_string.rs b/datafusion/functions-nested/benches/array_to_string.rs index 3896b87bec48a..286ed4eeb0003 100644 --- a/datafusion/functions-nested/benches/array_to_string.rs +++ b/datafusion/functions-nested/benches/array_to_string.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use arrow::array::{ArrayRef, Int64Array, ListArray, StringArray}; +use arrow::array::{ArrayRef, Float64Array, Int64Array, ListArray, StringArray}; use arrow::buffer::OffsetBuffer; use arrow::datatypes::{DataType, Field}; use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main}; @@ -36,6 +36,7 @@ const NULL_DENSITY: f64 = 0.1; fn criterion_benchmark(c: &mut Criterion) { bench_array_to_string(c, "array_to_string_int64", create_int64_list_array); + bench_array_to_string(c, "array_to_string_float64", create_float64_list_array); bench_array_to_string(c, "array_to_string_string", create_string_list_array); bench_array_to_string( c, @@ -131,6 +132,32 @@ fn create_nested_int64_list_array(array_size: usize) -> ArrayRef { ) } +fn create_float64_list_array(array_size: usize) -> ArrayRef { + let mut rng = StdRng::seed_from_u64(SEED); + let values = (0..NUM_ROWS * array_size) + .map(|_| { + if rng.random::() < NULL_DENSITY { + None + } else { + Some(rng.random_range(-1000.0..1000.0)) + } + }) + .collect::(); + let offsets = (0..=NUM_ROWS) + .map(|i| (i * array_size) as i32) + .collect::>(); + + Arc::new( + ListArray::try_new( + Arc::new(Field::new("item", DataType::Float64, true)), + OffsetBuffer::new(offsets.into()), + Arc::new(values), + None, + ) + .unwrap(), + ) +} + fn create_string_list_array(array_size: usize) -> ArrayRef { let mut rng = StdRng::seed_from_u64(SEED); let values = (0..NUM_ROWS * array_size) From ba4dbb032f5311a4b1d353738b55fd16de3faea7 Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Wed, 25 Feb 2026 13:28:36 -0500 Subject: [PATCH 06/10] . --- Cargo.lock | 1 + Cargo.toml | 1 + datafusion/functions-nested/Cargo.toml | 1 + datafusion/functions-nested/src/string.rs | 19 +++++++++++++++++-- 4 files changed, 20 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 88e48bff05ac1..e0ec1bcd883e4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2306,6 +2306,7 @@ dependencies = [ "log", "paste", "rand 0.9.2", + "ryu", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 8371e76dcd734..b832d3af3bb56 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -181,6 +181,7 @@ prost = "0.14.1" rand = "0.9" recursive = "0.1.1" regex = "1.12" +ryu = "1.0" rstest = "0.26.1" serde_json = "1" sha2 = "^0.10.9" diff --git a/datafusion/functions-nested/Cargo.toml b/datafusion/functions-nested/Cargo.toml index 374228c48c35b..e93558f212036 100644 --- a/datafusion/functions-nested/Cargo.toml +++ b/datafusion/functions-nested/Cargo.toml @@ -62,6 +62,7 @@ itertools = { workspace = true, features = ["use_std"] } itoa = { workspace = true } log = { workspace = true } paste = { workspace = true } +ryu = { workspace = true } [dev-dependencies] criterion = { workspace = true, features = ["async_tokio"] } diff --git a/datafusion/functions-nested/src/string.rs b/datafusion/functions-nested/src/string.rs index 08030311f3f39..d3cd09b01c303 100644 --- a/datafusion/functions-nested/src/string.rs +++ b/datafusion/functions-nested/src/string.rs @@ -442,13 +442,28 @@ fn compute_array_to_string( ) }; } + macro_rules! float_leaf { + ($ARRAY_TYPE:ident) => { + write_leaf_to_string( + buf, + downcast_arg!(arr, $ARRAY_TYPE), + delimiter, + null_string, + first, + |buf, x| { + let mut ryu_buf = ryu::Buffer::new(); + buf.push_str(ryu_buf.format(x)); + }, + ) + }; + } match data_type { Utf8 => str_leaf!(StringArray), Utf8View => str_leaf!(StringViewArray), LargeUtf8 => str_leaf!(LargeStringArray), DataType::Boolean => leaf!(BooleanArray), - DataType::Float32 => leaf!(Float32Array), - DataType::Float64 => leaf!(Float64Array), + DataType::Float32 => float_leaf!(Float32Array), + DataType::Float64 => float_leaf!(Float64Array), DataType::Int8 => int_leaf!(Int8Array), DataType::Int16 => int_leaf!(Int16Array), DataType::Int32 => int_leaf!(Int32Array), From daabfb5dbf218c9baf7118cef65783e03c632f95 Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Wed, 25 Feb 2026 13:41:34 -0500 Subject: [PATCH 07/10] Add special-case for bool, remove generic write --- datafusion/functions-nested/src/string.rs | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/datafusion/functions-nested/src/string.rs b/datafusion/functions-nested/src/string.rs index d3cd09b01c303..b962f368fb18b 100644 --- a/datafusion/functions-nested/src/string.rs +++ b/datafusion/functions-nested/src/string.rs @@ -29,7 +29,6 @@ use datafusion_common::utils::ListCoercion; use datafusion_common::{DataFusionError, Result, not_impl_err}; use std::any::Any; -use std::fmt::Write; use crate::utils::make_scalar_function; use arrow::array::{ @@ -403,7 +402,7 @@ fn compute_array_to_string( } Null => Ok(()), data_type => { - macro_rules! leaf { + macro_rules! str_leaf { ($ARRAY_TYPE:ident) => { write_leaf_to_string( buf, @@ -411,11 +410,11 @@ fn compute_array_to_string( delimiter, null_string, first, - |buf, x| write!(buf, "{}", x).unwrap(), + |buf, x: &str| buf.push_str(x), ) }; } - macro_rules! str_leaf { + macro_rules! bool_leaf { ($ARRAY_TYPE:ident) => { write_leaf_to_string( buf, @@ -423,7 +422,13 @@ fn compute_array_to_string( delimiter, null_string, first, - |buf, x: &str| buf.push_str(x), + |buf, x: bool| { + if x { + buf.push_str("true"); + } else { + buf.push_str("false"); + } + }, ) }; } @@ -461,7 +466,7 @@ fn compute_array_to_string( Utf8 => str_leaf!(StringArray), Utf8View => str_leaf!(StringViewArray), LargeUtf8 => str_leaf!(LargeStringArray), - DataType::Boolean => leaf!(BooleanArray), + DataType::Boolean => bool_leaf!(BooleanArray), DataType::Float32 => float_leaf!(Float32Array), DataType::Float64 => float_leaf!(Float64Array), DataType::Int8 => int_leaf!(Int8Array), From aac69594d2a2148b2ee08a3396a95a0a0bc109e0 Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Wed, 25 Feb 2026 14:35:41 -0500 Subject: [PATCH 08/10] Revert ryu --- Cargo.lock | 1 - Cargo.toml | 1 - datafusion/functions-nested/Cargo.toml | 1 - datafusion/functions-nested/src/string.rs | 8 ++++++-- datafusion/sqllogictest/test_files/array.slt | 9 +++++++++ 5 files changed, 15 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e0ec1bcd883e4..88e48bff05ac1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2306,7 +2306,6 @@ dependencies = [ "log", "paste", "rand 0.9.2", - "ryu", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index b832d3af3bb56..8371e76dcd734 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -181,7 +181,6 @@ prost = "0.14.1" rand = "0.9" recursive = "0.1.1" regex = "1.12" -ryu = "1.0" rstest = "0.26.1" serde_json = "1" sha2 = "^0.10.9" diff --git a/datafusion/functions-nested/Cargo.toml b/datafusion/functions-nested/Cargo.toml index e93558f212036..374228c48c35b 100644 --- a/datafusion/functions-nested/Cargo.toml +++ b/datafusion/functions-nested/Cargo.toml @@ -62,7 +62,6 @@ itertools = { workspace = true, features = ["use_std"] } itoa = { workspace = true } log = { workspace = true } paste = { workspace = true } -ryu = { workspace = true } [dev-dependencies] criterion = { workspace = true, features = ["async_tokio"] } diff --git a/datafusion/functions-nested/src/string.rs b/datafusion/functions-nested/src/string.rs index b962f368fb18b..e83c85e227457 100644 --- a/datafusion/functions-nested/src/string.rs +++ b/datafusion/functions-nested/src/string.rs @@ -29,6 +29,7 @@ use datafusion_common::utils::ListCoercion; use datafusion_common::{DataFusionError, Result, not_impl_err}; use std::any::Any; +use std::fmt::Write; use crate::utils::make_scalar_function; use arrow::array::{ @@ -456,8 +457,11 @@ fn compute_array_to_string( null_string, first, |buf, x| { - let mut ryu_buf = ryu::Buffer::new(); - buf.push_str(ryu_buf.format(x)); + // TODO: Consider switching to a more efficient + // floating point display library (e.g., ryu). This + // might result in some differences in the output + // format, however. + write!(buf, "{}", x).unwrap(); }, ) }; diff --git a/datafusion/sqllogictest/test_files/array.slt b/datafusion/sqllogictest/test_files/array.slt index 7d709416beb81..f2efbc0969993 100644 --- a/datafusion/sqllogictest/test_files/array.slt +++ b/datafusion/sqllogictest/test_files/array.slt @@ -5052,6 +5052,15 @@ select array_to_string(arrow_cast(make_array('h', NULL, NULL, NULL, 'o'), 'Fixed ---- h,-,-,-,o nil-2-nil-4-5 1|0|3 +# array_to_string float formatting: special values and longer decimals +query TTT +select + array_to_string(make_array(CAST('NaN' AS DOUBLE), CAST('Infinity' AS DOUBLE), CAST('-Infinity' AS DOUBLE), CAST('0.30000000000000004' AS DOUBLE), CAST('1.2345678901234567' AS DOUBLE)), '|'), + array_to_string(arrow_cast(make_array(CAST('NaN' AS DOUBLE), CAST('Infinity' AS DOUBLE), CAST('-Infinity' AS DOUBLE), CAST('0.30000000000000004' AS DOUBLE), CAST('1.2345678901234567' AS DOUBLE)), 'LargeList(Float64)'), '|'), + array_to_string(arrow_cast(make_array(CAST('NaN' AS DOUBLE), CAST('Infinity' AS DOUBLE), CAST('-Infinity' AS DOUBLE), CAST('0.30000000000000004' AS DOUBLE), CAST('1.2345678901234567' AS DOUBLE)), 'FixedSizeList(5, Float64)'), '|'); +---- +NaN|inf|-inf|0.30000000000000004|1.2345678901234567 NaN|inf|-inf|0.30000000000000004|1.2345678901234567 NaN|inf|-inf|0.30000000000000004|1.2345678901234567 + query T select array_to_string(arrow_cast([arrow_cast([NULL, 'a'], 'FixedSizeList(2, Utf8)'), NULL], 'FixedSizeList(2, FixedSizeList(2, Utf8))'), ',', '-'); ---- From 2e9a9caaaa0ecad35987e023f2563d3eb0e17ce5 Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Wed, 25 Feb 2026 14:53:01 -0500 Subject: [PATCH 09/10] More floating point display tests --- datafusion/sqllogictest/test_files/array.slt | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/datafusion/sqllogictest/test_files/array.slt b/datafusion/sqllogictest/test_files/array.slt index f2efbc0969993..ef661d912bd75 100644 --- a/datafusion/sqllogictest/test_files/array.slt +++ b/datafusion/sqllogictest/test_files/array.slt @@ -5061,6 +5061,24 @@ select ---- NaN|inf|-inf|0.30000000000000004|1.2345678901234567 NaN|inf|-inf|0.30000000000000004|1.2345678901234567 NaN|inf|-inf|0.30000000000000004|1.2345678901234567 +# array_to_string float formatting: scientific-notation inputs +query T +select array_to_string( + make_array( + CAST('1E20' AS DOUBLE), + CAST('-1e+20' AS DOUBLE), + CAST('6.02214076e23' AS DOUBLE), + CAST('1.2345e6' AS DOUBLE), + CAST('1e-5' AS DOUBLE), + CAST('-1e-5' AS DOUBLE), + CAST('9.1093837015e-31' AS DOUBLE), + CAST('-2.5e-4' AS DOUBLE) + ), + '|' +); +---- +100000000000000000000|-100000000000000000000|602214076000000000000000|1234500|0.00001|-0.00001|0.00000000000000000000000000000091093837015|-0.00025 + query T select array_to_string(arrow_cast([arrow_cast([NULL, 'a'], 'FixedSizeList(2, Utf8)'), NULL], 'FixedSizeList(2, FixedSizeList(2, Utf8))'), ',', '-'); ---- From 261fafb4acb439ccab327c6e870eb69e01232b1f Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Wed, 25 Feb 2026 21:53:50 -0500 Subject: [PATCH 10/10] Generalize cast approach to more data types, per review --- datafusion/functions-nested/src/string.rs | 12 ++--- datafusion/sqllogictest/test_files/array.slt | 55 ++++++++++++++++++++ 2 files changed, 60 insertions(+), 7 deletions(-) diff --git a/datafusion/functions-nested/src/string.rs b/datafusion/functions-nested/src/string.rs index e83c85e227457..8aabc49309565 100644 --- a/datafusion/functions-nested/src/string.rs +++ b/datafusion/functions-nested/src/string.rs @@ -37,7 +37,7 @@ use arrow::array::{ builder::{ArrayBuilder, LargeStringBuilder, StringViewBuilder}, cast::AsArray, }; -use arrow::compute::cast; +use arrow::compute::{can_cast_types, cast}; use arrow::datatypes::DataType::{ Dictionary, FixedSizeList, LargeList, LargeUtf8, List, Null, Utf8, Utf8View, }; @@ -481,12 +481,10 @@ fn compute_array_to_string( DataType::UInt16 => int_leaf!(UInt16Array), DataType::UInt32 => int_leaf!(UInt32Array), DataType::UInt64 => int_leaf!(UInt64Array), - DataType::Decimal128(_, _) | DataType::Decimal256(_, _) => { - // Decimal arrays iterate over raw integers, so we need to - // cast to Utf8 to ensure the right display formatting. + data_type if can_cast_types(data_type, &Utf8) => { let str_arr = cast(arr, &Utf8).map_err(|e| { DataFusionError::from(e) - .context("Casting decimal to string in array_to_string") + .context("Casting to string in array_to_string") })?; return compute_array_to_string( buf, @@ -496,9 +494,9 @@ fn compute_array_to_string( first, ); } - dt => { + data_type => { return not_impl_err!( - "Unsupported data type in array_to_string: {dt}" + "Unsupported data type in array_to_string: {data_type}" ); } } diff --git a/datafusion/sqllogictest/test_files/array.slt b/datafusion/sqllogictest/test_files/array.slt index ef661d912bd75..471ed3b385369 100644 --- a/datafusion/sqllogictest/test_files/array.slt +++ b/datafusion/sqllogictest/test_files/array.slt @@ -5193,6 +5193,61 @@ select array_to_string(arrow_cast(make_array(1.5, NULL, 3.14), 'List(Decimal128( ---- 1.50,N,3.14 +# array_to_string with date values +query T +select array_to_string(arrow_cast(make_array('2024-01-15', '2024-06-30', '2024-12-25'), 'List(Date32)'), ','); +---- +2024-01-15,2024-06-30,2024-12-25 + +query T +select array_to_string(arrow_cast(make_array('2024-01-15', NULL, '2024-12-25'), 'List(Date32)'), ',', 'N'); +---- +2024-01-15,N,2024-12-25 + +# array_to_string with timestamp values +query T +select array_to_string(make_array(arrow_cast('2024-01-15T10:30:00', 'Timestamp(Second, None)'), arrow_cast('2024-06-30T15:45:00', 'Timestamp(Second, None)')), '|'); +---- +2024-01-15T10:30:00|2024-06-30T15:45:00 + +query T +select array_to_string(make_array(arrow_cast('2024-01-15T10:30:00', 'Timestamp(Millisecond, None)'), arrow_cast('2024-06-30T15:45:00', 'Timestamp(Millisecond, None)')), '|'); +---- +2024-01-15T10:30:00|2024-06-30T15:45:00 + +query T +select array_to_string(make_array(arrow_cast('2024-01-15T10:30:00', 'Timestamp(Microsecond, None)'), arrow_cast('2024-06-30T15:45:00', 'Timestamp(Microsecond, None)')), '|'); +---- +2024-01-15T10:30:00|2024-06-30T15:45:00 + +query T +select array_to_string(make_array(arrow_cast('2024-01-15T10:30:00', 'Timestamp(Nanosecond, None)'), arrow_cast('2024-06-30T15:45:00', 'Timestamp(Nanosecond, None)')), '|'); +---- +2024-01-15T10:30:00|2024-06-30T15:45:00 + +# array_to_string with time values +query T +select array_to_string(make_array(arrow_cast('10:30:00', 'Time32(Second)'), arrow_cast('15:45:00', 'Time32(Second)')), ','); +---- +10:30:00,15:45:00 + +query T +select array_to_string(make_array(arrow_cast('10:30:00', 'Time64(Microsecond)'), arrow_cast('15:45:00', 'Time64(Microsecond)')), ','); +---- +10:30:00,15:45:00 + +# array_to_string with interval values +query T +select array_to_string(make_array(interval '1 year 2 months', interval '3 days 4 hours'), ','); +---- +14 mons,3 days 4 hours + +# array_to_string with duration values +query T +select array_to_string(make_array(arrow_cast(1000, 'Duration(Millisecond)'), arrow_cast(2000, 'Duration(Millisecond)')), ','); +---- +PT1S,PT2S + ## cardinality # cardinality scalar function