From acf6710785ffe4637ad90a7f6f7264ffec591555 Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Fri, 27 Feb 2026 10:47:01 -0500 Subject: [PATCH 1/3] Update reverse UDF to emit utf8view when input is utf8view. --- datafusion/functions-nested/src/string.rs | 39 +----------- datafusion/functions/src/strings.rs | 41 +++++++++++- datafusion/functions/src/unicode/reverse.rs | 62 +++++++++++++------ .../test_files/string/string_literal.slt | 15 +++++ 4 files changed, 99 insertions(+), 58 deletions(-) diff --git a/datafusion/functions-nested/src/string.rs b/datafusion/functions-nested/src/string.rs index 1c8d58fca80d0..d8d4e9a577c40 100644 --- a/datafusion/functions-nested/src/string.rs +++ b/datafusion/functions-nested/src/string.rs @@ -33,7 +33,7 @@ use std::any::Any; use crate::utils::make_scalar_function; use arrow::array::{ GenericStringArray, StringArrayType, StringViewArray, - builder::{ArrayBuilder, LargeStringBuilder, StringViewBuilder}, + builder::{LargeStringBuilder, StringViewBuilder}, cast::AsArray, }; use arrow::compute::cast; @@ -51,6 +51,7 @@ use datafusion_expr::{ Volatility, }; use datafusion_functions::downcast_arg; +use datafusion_functions::strings::StringArrayBuilderType; use datafusion_macros::user_doc; use std::sync::Arc; @@ -788,39 +789,3 @@ where let list_array = list_builder.finish(); Ok(Arc::new(list_array) as ArrayRef) } - -trait StringArrayBuilderType: ArrayBuilder { - fn append_value(&mut self, val: &str); - - fn append_null(&mut self); -} - -impl StringArrayBuilderType for StringBuilder { - fn append_value(&mut self, val: &str) { - StringBuilder::append_value(self, val); - } - - fn append_null(&mut self) { - StringBuilder::append_null(self); - } -} - -impl StringArrayBuilderType for StringViewBuilder { - fn append_value(&mut self, val: &str) { - StringViewBuilder::append_value(self, val) - } - - fn append_null(&mut self) { - StringViewBuilder::append_null(self) - } -} - -impl StringArrayBuilderType for LargeStringBuilder { - fn append_value(&mut self, val: &str) { - LargeStringBuilder::append_value(self, val); - } - - fn append_null(&mut self) { - LargeStringBuilder::append_null(self); - } -} diff --git a/datafusion/functions/src/strings.rs b/datafusion/functions/src/strings.rs index cfddf57b094b5..acb1e171ad7dc 100644 --- a/datafusion/functions/src/strings.rs +++ b/datafusion/functions/src/strings.rs @@ -18,8 +18,9 @@ use std::mem::size_of; use arrow::array::{ - Array, ArrayAccessor, ArrayDataBuilder, ByteView, LargeStringArray, - NullBufferBuilder, StringArray, StringViewArray, StringViewBuilder, make_view, + Array, ArrayAccessor, ArrayBuilder, ArrayDataBuilder, ByteView, LargeStringArray, + LargeStringBuilder, NullBufferBuilder, StringArray, StringBuilder, StringViewArray, + StringViewBuilder, make_view, }; use arrow::buffer::{MutableBuffer, NullBuffer}; use arrow::datatypes::DataType; @@ -362,6 +363,42 @@ impl ColumnarValueRef<'_> { } } +pub trait StringArrayBuilderType: ArrayBuilder { + fn append_value(&mut self, val: &str); + + fn append_null(&mut self); +} + +impl StringArrayBuilderType for StringBuilder { + fn append_value(&mut self, val: &str) { + StringBuilder::append_value(self, val); + } + + fn append_null(&mut self) { + StringBuilder::append_null(self); + } +} + +impl StringArrayBuilderType for StringViewBuilder { + fn append_value(&mut self, val: &str) { + StringViewBuilder::append_value(self, val) + } + + fn append_null(&mut self) { + StringViewBuilder::append_null(self) + } +} + +impl StringArrayBuilderType for LargeStringBuilder { + fn append_value(&mut self, val: &str) { + LargeStringBuilder::append_value(self, val); + } + + fn append_null(&mut self) { + LargeStringBuilder::append_null(self); + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/datafusion/functions/src/unicode/reverse.rs b/datafusion/functions/src/unicode/reverse.rs index 71a1e6842f8f3..8a412756076d1 100644 --- a/datafusion/functions/src/unicode/reverse.rs +++ b/datafusion/functions/src/unicode/reverse.rs @@ -18,10 +18,12 @@ use std::any::Any; use std::sync::Arc; +use crate::strings::StringArrayBuilderType; use crate::utils::{make_scalar_function, utf8_to_str_type}; use DataType::{LargeUtf8, Utf8, Utf8View}; use arrow::array::{ - Array, ArrayRef, AsArray, GenericStringBuilder, OffsetSizeTrait, StringArrayType, + Array, ArrayRef, AsArray, GenericStringArray, LargeStringBuilder, OffsetSizeTrait, + StringArrayType, StringBuilder, StringViewArray, StringViewBuilder, }; use arrow::datatypes::DataType; use datafusion_common::{Result, exec_err}; @@ -82,7 +84,11 @@ impl ScalarUDFImpl for ReverseFunc { } fn return_type(&self, arg_types: &[DataType]) -> Result { - utf8_to_str_type(&arg_types[0], "reverse") + if arg_types[0] == Utf8View { + Ok(Utf8View) + } else { + utf8_to_str_type(&arg_types[0], "btrim") + } } fn invoke_with_args( @@ -107,20 +113,38 @@ impl ScalarUDFImpl for ReverseFunc { /// Reverses the order of the characters in the string `reverse('abcde') = 'edcba'`. /// The implementation uses UTF-8 code points as characters fn reverse(args: &[ArrayRef]) -> Result { - if args[0].data_type() == &Utf8View { - reverse_impl::(&args[0].as_string_view()) - } else { - reverse_impl::(&args[0].as_string::()) + let len = args[0].len(); + + match args[0].data_type() { + Utf8View => reverse_impl::<&StringViewArray, StringViewBuilder>( + &args[0].as_string_view(), + StringViewBuilder::with_capacity(len), + ), + LargeUtf8 => reverse_impl::<&GenericStringArray, LargeStringBuilder>( + &args[0].as_string::(), + LargeStringBuilder::with_capacity(len, 1024), + ), + Utf8 => reverse_impl::<&GenericStringArray, StringBuilder>( + &args[0].as_string::(), + StringBuilder::with_capacity(len, 1024), + ), + _ => unreachable!( + "Reverse can only be applied to Utf8View, Utf8 and LargeUtf8 types" + ), } } -fn reverse_impl<'a, T: OffsetSizeTrait, V: StringArrayType<'a>>( - string_array: &V, -) -> Result { - let mut builder = GenericStringBuilder::::with_capacity(string_array.len(), 1024); - +fn reverse_impl<'a, StringArrType, StringBuilderType>( + string_array: &StringArrType, + mut array_builder: StringBuilderType, +) -> Result +where + StringArrType: StringArrayType<'a>, + StringBuilderType: StringArrayBuilderType, +{ let mut string_buf = String::new(); let mut byte_buf = Vec::::new(); + for string in string_array.iter() { if let Some(s) = string { if s.is_ascii() { @@ -129,25 +153,25 @@ fn reverse_impl<'a, T: OffsetSizeTrait, V: StringArrayType<'a>>( byte_buf.reverse(); // SAFETY: Since the original string was ASCII, reversing the bytes still results in valid UTF-8. let reversed = unsafe { std::str::from_utf8_unchecked(&byte_buf) }; - builder.append_value(reversed); + array_builder.append_value(reversed); byte_buf.clear(); } else { string_buf.extend(s.chars().rev()); - builder.append_value(&string_buf); + array_builder.append_value(&string_buf); string_buf.clear(); } } else { - builder.append_null(); + array_builder.append_null(); } } - Ok(Arc::new(builder.finish()) as ArrayRef) + Ok(Arc::new(array_builder.finish()) as ArrayRef) } #[cfg(test)] mod tests { - use arrow::array::{Array, LargeStringArray, StringArray}; - use arrow::datatypes::DataType::{LargeUtf8, Utf8}; + use arrow::array::{Array, LargeStringArray, StringArray, StringViewArray}; + use arrow::datatypes::DataType::{LargeUtf8, Utf8, Utf8View}; use datafusion_common::{Result, ScalarValue}; use datafusion_expr::{ColumnarValue, ScalarUDFImpl}; @@ -180,8 +204,8 @@ mod tests { vec![ColumnarValue::Scalar(ScalarValue::Utf8View($INPUT))], $EXPECTED, &str, - Utf8, - StringArray + Utf8View, + StringViewArray ); }; } diff --git a/datafusion/sqllogictest/test_files/string/string_literal.slt b/datafusion/sqllogictest/test_files/string/string_literal.slt index 6cf02218872df..2272de74cf7f2 100644 --- a/datafusion/sqllogictest/test_files/string/string_literal.slt +++ b/datafusion/sqllogictest/test_files/string/string_literal.slt @@ -389,6 +389,21 @@ SELECT reverse(arrow_cast('abcde', 'Utf8View')) ---- edcba +query T +SELECT arrow_typeof(reverse('abcde')) +---- +Utf8 + +query T +SELECT arrow_typeof(reverse(arrow_cast('abcde', 'LargeUtf8'))) +---- +LargeUtf8 + +query T +SELECT arrow_typeof(reverse(arrow_cast('abcde', 'Utf8View'))) +---- +Utf8View + query T SELECT reverse(arrow_cast('abcde', 'Dictionary(Int32, Utf8)')) ---- From 86c7a9b2861ab3e4ea98265ebb3d5093a1e2902a Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Fri, 27 Feb 2026 17:05:24 -0500 Subject: [PATCH 2/3] Switched from StringArrayBuilderType to StringLikeArrayBuilder --- datafusion/functions-nested/src/string.rs | 39 +++++++++++++++++++- datafusion/functions/src/strings.rs | 41 +-------------------- datafusion/functions/src/unicode/reverse.rs | 6 +-- 3 files changed, 42 insertions(+), 44 deletions(-) diff --git a/datafusion/functions-nested/src/string.rs b/datafusion/functions-nested/src/string.rs index d8d4e9a577c40..1c8d58fca80d0 100644 --- a/datafusion/functions-nested/src/string.rs +++ b/datafusion/functions-nested/src/string.rs @@ -33,7 +33,7 @@ use std::any::Any; use crate::utils::make_scalar_function; use arrow::array::{ GenericStringArray, StringArrayType, StringViewArray, - builder::{LargeStringBuilder, StringViewBuilder}, + builder::{ArrayBuilder, LargeStringBuilder, StringViewBuilder}, cast::AsArray, }; use arrow::compute::cast; @@ -51,7 +51,6 @@ use datafusion_expr::{ Volatility, }; use datafusion_functions::downcast_arg; -use datafusion_functions::strings::StringArrayBuilderType; use datafusion_macros::user_doc; use std::sync::Arc; @@ -789,3 +788,39 @@ where let list_array = list_builder.finish(); Ok(Arc::new(list_array) as ArrayRef) } + +trait StringArrayBuilderType: ArrayBuilder { + fn append_value(&mut self, val: &str); + + fn append_null(&mut self); +} + +impl StringArrayBuilderType for StringBuilder { + fn append_value(&mut self, val: &str) { + StringBuilder::append_value(self, val); + } + + fn append_null(&mut self) { + StringBuilder::append_null(self); + } +} + +impl StringArrayBuilderType for StringViewBuilder { + fn append_value(&mut self, val: &str) { + StringViewBuilder::append_value(self, val) + } + + fn append_null(&mut self) { + StringViewBuilder::append_null(self) + } +} + +impl StringArrayBuilderType for LargeStringBuilder { + fn append_value(&mut self, val: &str) { + LargeStringBuilder::append_value(self, val); + } + + fn append_null(&mut self) { + LargeStringBuilder::append_null(self); + } +} diff --git a/datafusion/functions/src/strings.rs b/datafusion/functions/src/strings.rs index acb1e171ad7dc..cfddf57b094b5 100644 --- a/datafusion/functions/src/strings.rs +++ b/datafusion/functions/src/strings.rs @@ -18,9 +18,8 @@ use std::mem::size_of; use arrow::array::{ - Array, ArrayAccessor, ArrayBuilder, ArrayDataBuilder, ByteView, LargeStringArray, - LargeStringBuilder, NullBufferBuilder, StringArray, StringBuilder, StringViewArray, - StringViewBuilder, make_view, + Array, ArrayAccessor, ArrayDataBuilder, ByteView, LargeStringArray, + NullBufferBuilder, StringArray, StringViewArray, StringViewBuilder, make_view, }; use arrow::buffer::{MutableBuffer, NullBuffer}; use arrow::datatypes::DataType; @@ -363,42 +362,6 @@ impl ColumnarValueRef<'_> { } } -pub trait StringArrayBuilderType: ArrayBuilder { - fn append_value(&mut self, val: &str); - - fn append_null(&mut self); -} - -impl StringArrayBuilderType for StringBuilder { - fn append_value(&mut self, val: &str) { - StringBuilder::append_value(self, val); - } - - fn append_null(&mut self) { - StringBuilder::append_null(self); - } -} - -impl StringArrayBuilderType for StringViewBuilder { - fn append_value(&mut self, val: &str) { - StringViewBuilder::append_value(self, val) - } - - fn append_null(&mut self) { - StringViewBuilder::append_null(self) - } -} - -impl StringArrayBuilderType for LargeStringBuilder { - fn append_value(&mut self, val: &str) { - LargeStringBuilder::append_value(self, val); - } - - fn append_null(&mut self) { - LargeStringBuilder::append_null(self); - } -} - #[cfg(test)] mod tests { use super::*; diff --git a/datafusion/functions/src/unicode/reverse.rs b/datafusion/functions/src/unicode/reverse.rs index 8a412756076d1..0e1fa4d1e68ba 100644 --- a/datafusion/functions/src/unicode/reverse.rs +++ b/datafusion/functions/src/unicode/reverse.rs @@ -18,12 +18,12 @@ use std::any::Any; use std::sync::Arc; -use crate::strings::StringArrayBuilderType; use crate::utils::{make_scalar_function, utf8_to_str_type}; use DataType::{LargeUtf8, Utf8, Utf8View}; use arrow::array::{ Array, ArrayRef, AsArray, GenericStringArray, LargeStringBuilder, OffsetSizeTrait, - StringArrayType, StringBuilder, StringViewArray, StringViewBuilder, + StringArrayType, StringBuilder, StringLikeArrayBuilder, StringViewArray, + StringViewBuilder, }; use arrow::datatypes::DataType; use datafusion_common::{Result, exec_err}; @@ -140,7 +140,7 @@ fn reverse_impl<'a, StringArrType, StringBuilderType>( ) -> Result where StringArrType: StringArrayType<'a>, - StringBuilderType: StringArrayBuilderType, + StringBuilderType: StringLikeArrayBuilder, { let mut string_buf = String::new(); let mut byte_buf = Vec::::new(); From 4c5b2fa41a4da3ff91634a27be7f947ec7090c54 Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Tue, 3 Mar 2026 10:51:42 -0500 Subject: [PATCH 3/3] Simplify return_type and removed unnecessary generic from reverse function. --- datafusion/functions/src/unicode/reverse.rs | 32 +++++++++------------ 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/datafusion/functions/src/unicode/reverse.rs b/datafusion/functions/src/unicode/reverse.rs index 0e1fa4d1e68ba..7b501c0a6542d 100644 --- a/datafusion/functions/src/unicode/reverse.rs +++ b/datafusion/functions/src/unicode/reverse.rs @@ -18,12 +18,11 @@ use std::any::Any; use std::sync::Arc; -use crate::utils::{make_scalar_function, utf8_to_str_type}; +use crate::utils::make_scalar_function; use DataType::{LargeUtf8, Utf8, Utf8View}; use arrow::array::{ - Array, ArrayRef, AsArray, GenericStringArray, LargeStringBuilder, OffsetSizeTrait, - StringArrayType, StringBuilder, StringLikeArrayBuilder, StringViewArray, - StringViewBuilder, + Array, ArrayRef, AsArray, LargeStringBuilder, StringArrayType, StringBuilder, + StringLikeArrayBuilder, StringViewBuilder, }; use arrow::datatypes::DataType; use datafusion_common::{Result, exec_err}; @@ -84,11 +83,7 @@ impl ScalarUDFImpl for ReverseFunc { } fn return_type(&self, arg_types: &[DataType]) -> Result { - if arg_types[0] == Utf8View { - Ok(Utf8View) - } else { - utf8_to_str_type(&arg_types[0], "btrim") - } + Ok(arg_types[0].clone()) } fn invoke_with_args( @@ -97,8 +92,7 @@ impl ScalarUDFImpl for ReverseFunc { ) -> Result { let args = &args.args; match args[0].data_type() { - Utf8 | Utf8View => make_scalar_function(reverse::, vec![])(args), - LargeUtf8 => make_scalar_function(reverse::, vec![])(args), + Utf8 | Utf8View | LargeUtf8 => make_scalar_function(reverse, vec![])(args), other => { exec_err!("Unsupported data type {other:?} for function reverse") } @@ -112,22 +106,22 @@ impl ScalarUDFImpl for ReverseFunc { /// Reverses the order of the characters in the string `reverse('abcde') = 'edcba'`. /// The implementation uses UTF-8 code points as characters -fn reverse(args: &[ArrayRef]) -> Result { +fn reverse(args: &[ArrayRef]) -> Result { let len = args[0].len(); match args[0].data_type() { - Utf8View => reverse_impl::<&StringViewArray, StringViewBuilder>( + Utf8 => reverse_impl( + &args[0].as_string::(), + StringBuilder::with_capacity(len, 1024), + ), + Utf8View => reverse_impl( &args[0].as_string_view(), StringViewBuilder::with_capacity(len), ), - LargeUtf8 => reverse_impl::<&GenericStringArray, LargeStringBuilder>( - &args[0].as_string::(), + LargeUtf8 => reverse_impl( + &args[0].as_string::(), LargeStringBuilder::with_capacity(len, 1024), ), - Utf8 => reverse_impl::<&GenericStringArray, StringBuilder>( - &args[0].as_string::(), - StringBuilder::with_capacity(len, 1024), - ), _ => unreachable!( "Reverse can only be applied to Utf8View, Utf8 and LargeUtf8 types" ),