From b0fe223fb2d5f998d04a87b1bf5c301c5e72d410 Mon Sep 17 00:00:00 2001 From: Pepijn Van Eeckhoudt Date: Fri, 26 Jun 2026 21:04:07 +0200 Subject: [PATCH 1/2] Use `concat_elements_dyn` from `arrow-rs` instead of a custom implementation of the same function --- .../physical-expr/src/expressions/binary.rs | 43 +-------- .../src/expressions/binary/kernels.rs | 89 ------------------- 2 files changed, 3 insertions(+), 129 deletions(-) diff --git a/datafusion/physical-expr/src/expressions/binary.rs b/datafusion/physical-expr/src/expressions/binary.rs index 6f0b60556a751..7945cbbe00495 100644 --- a/datafusion/physical-expr/src/expressions/binary.rs +++ b/datafusion/physical-expr/src/expressions/binary.rs @@ -24,9 +24,7 @@ use std::sync::Arc; use arrow::array::*; use arrow::compute::kernels::boolean::{and_kleene, or_kleene}; -use arrow::compute::kernels::concat_elements::{ - concat_element_binary, concat_elements_utf8, -}; +use arrow::compute::kernels::concat_elements::concat_elements_dyn; use arrow::compute::{SlicesIterator, cast, filter_record_batch}; use arrow::datatypes::*; use arrow::error::ArrowError; @@ -50,8 +48,7 @@ use kernels::{ bitwise_and_dyn, bitwise_and_dyn_scalar, bitwise_or_dyn, bitwise_or_dyn_scalar, bitwise_shift_left_dyn, bitwise_shift_left_dyn_scalar, bitwise_shift_right_dyn, bitwise_shift_right_dyn_scalar, bitwise_xor_dyn, bitwise_xor_dyn_scalar, - concat_elements_binary_view_array, concat_elements_utf8view, regex_match_dyn, - regex_match_dyn_scalar, + regex_match_dyn, regex_match_dyn_scalar, }; /// Binary expression @@ -833,7 +830,7 @@ impl BinaryExpr { BitwiseXor => bitwise_xor_dyn(left, right), BitwiseShiftRight => bitwise_shift_right_dyn(left, right), BitwiseShiftLeft => bitwise_shift_left_dyn(left, right), - StringConcat => concat_elements(&left, &right), + StringConcat => concat_elements_dyn(&left, &right).map_err(|e| e.into()), AtArrow | ArrowAt | Arrow | LongArrow | HashArrow | HashLongArrow | AtAt | HashMinus | AtQuestion | Question | QuestionAnd | QuestionPipe | IntegerDivide | Colon => { @@ -1053,40 +1050,6 @@ fn pre_selection_scatter( Ok(ColumnarValue::Array(Arc::new(boolean_result))) } -fn concat_elements(left: &ArrayRef, right: &ArrayRef) -> Result { - Ok(match left.data_type() { - DataType::Utf8 => Arc::new(concat_elements_utf8( - left.as_string::(), - right.as_string::(), - )?), - DataType::LargeUtf8 => Arc::new(concat_elements_utf8( - left.as_string::(), - right.as_string::(), - )?), - DataType::Utf8View => Arc::new(concat_elements_utf8view( - left.as_string_view(), - right.as_string_view(), - )?), - DataType::Binary => Arc::new(concat_element_binary::( - left.as_binary(), - right.as_binary(), - )?), - DataType::LargeBinary => Arc::new(concat_element_binary::( - left.as_binary(), - right.as_binary(), - )?), - DataType::BinaryView => Arc::new(concat_elements_binary_view_array( - left.as_binary_view(), - right.as_binary_view(), - )?), - other => { - return internal_err!( - "Data type {other:?} not supported for binary operation 'concat_elements' on string arrays" - ); - } - }) -} - /// Create a binary expression whose arguments are correctly coerced. /// This function errors if it is not possible to coerce the arguments /// to computational types supported by the operator. diff --git a/datafusion/physical-expr/src/expressions/binary/kernels.rs b/datafusion/physical-expr/src/expressions/binary/kernels.rs index e573d7ece2afa..39e9c40dbdf24 100644 --- a/datafusion/physical-expr/src/expressions/binary/kernels.rs +++ b/datafusion/physical-expr/src/expressions/binary/kernels.rs @@ -18,7 +18,6 @@ //! This module contains computation kernels that are specific to //! datafusion and not (yet) targeted to port upstream to arrow use arrow::array::*; -use arrow::buffer::{MutableBuffer, NullBuffer}; use arrow::compute::kernels::bitwise::{ bitwise_and, bitwise_and_scalar, bitwise_or, bitwise_or_scalar, bitwise_shift_left, bitwise_shift_left_scalar, bitwise_shift_right, bitwise_shift_right_scalar, @@ -27,7 +26,6 @@ use arrow::compute::kernels::bitwise::{ use arrow::compute::kernels::boolean::not; use arrow::compute::kernels::comparison::{regexp_is_match, regexp_is_match_scalar}; use arrow::datatypes::DataType; -use arrow::error::ArrowError; use datafusion_common::{Result, ScalarValue}; use datafusion_common::{internal_err, plan_err}; @@ -161,93 +159,6 @@ create_left_integral_dyn_scalar_kernel!( bitwise_shift_left_scalar ); -/// Concatenates two `StringViewArray`s element-wise. -/// If either element is `Null`, the result element is also `Null`. -/// -/// # Errors -/// - Returns an error if the input arrays have different lengths. -/// - Returns an error if any concatenated string exceeds `u32::MAX` (≈4 GB) in length. -pub fn concat_elements_utf8view( - left: &StringViewArray, - right: &StringViewArray, -) -> std::result::Result { - if left.len() != right.len() { - return Err(ArrowError::ComputeError(format!( - "Arrays must have the same length: {} != {}", - left.len(), - right.len() - ))); - } - let mut result = StringViewBuilder::with_capacity(left.len()); - - // Avoid reallocations by writing to a reused buffer (note we could be even - // more efficient by creating the view directly here and avoid the buffer - // but that would be more complex) - let mut buffer = String::new(); - - // Pre-compute combined null bitmap, so the per-row NULL check is more - // efficient - let nulls = NullBuffer::union(left.nulls(), right.nulls()); - - for i in 0..left.len() { - if nulls.as_ref().is_some_and(|n| n.is_null(i)) { - result.append_null(); - } else { - let l = left.value(i); - let r = right.value(i); - buffer.clear(); - buffer.push_str(l); - buffer.push_str(r); - result.try_append_value(&buffer)?; - } - } - Ok(result.finish()) -} - -/// Concatenates two `BinaryViewArray`s element-wise. -/// If either element is `Null`, the result element is also `Null`. -/// -/// # Errors -/// - Returns an error if the input arrays have different lengths. -/// - Returns an error if any concatenated string exceeds `u32::MAX` in length. -pub fn concat_elements_binary_view_array( - left: &BinaryViewArray, - right: &BinaryViewArray, -) -> std::result::Result { - if left.len() != right.len() { - return Err(ArrowError::ComputeError(format!( - "Arrays must have the same length: {} != {}", - left.len(), - right.len() - ))); - } - let mut result = BinaryViewBuilder::with_capacity(left.len()); - - // Avoid reallocations by writing to a reused buffer (note we could be even - // more efficient by creating the view directly here and avoid the buffer - // but that would be more complex) - let mut buffer = MutableBuffer::new(0); - - // Pre-compute combined null bitmap, so the per-row NULL check is more - // efficient - let nulls = NullBuffer::union(left.nulls(), right.nulls()); - - for i in 0..left.len() { - if nulls.as_ref().is_some_and(|n| n.is_null(i)) { - result.append_null(); - } else { - let l = left.value(i); - let r = right.value(i); - buffer.clear(); - buffer.extend_from_slice(l); - buffer.extend_from_slice(r); - // No try-version of append_value - result.try_append_value(&buffer)?; - } - } - Ok(result.finish()) -} - /// Invoke a compute kernel on a pair of binary data arrays with flags macro_rules! regexp_is_match_flag { ($LEFT:expr, $RIGHT:expr, $ARRAYTYPE:ident, $NOT:expr, $FLAG:expr) => {{ From 4ffec1cf4b7fba35f8d92c97f156c916761dc052 Mon Sep 17 00:00:00 2001 From: Pepijn Van Eeckhoudt Date: Wed, 1 Jul 2026 08:35:47 +0200 Subject: [PATCH 2/2] Support concatenation of mixed FixedSizeBinary types without conversion to Binary --- .../expr-common/src/type_coercion/binary.rs | 101 ++++++++++-------- 1 file changed, 57 insertions(+), 44 deletions(-) diff --git a/datafusion/expr-common/src/type_coercion/binary.rs b/datafusion/expr-common/src/type_coercion/binary.rs index e700d4a04da3b..4c53c9db84994 100644 --- a/datafusion/expr-common/src/type_coercion/binary.rs +++ b/datafusion/expr-common/src/type_coercion/binary.rs @@ -237,7 +237,7 @@ impl<'a> BinaryTypeCoercer<'a> { }) } StringConcat => { - string_concat_coercion(lhs, rhs).map(Signature::uniform).ok_or_else(|| { + string_concat_coercion(lhs, rhs).ok_or_else(|| { plan_datafusion_err!( "Cannot infer common string type for string concat operation {} {} {}", self.lhs, self.op, self.rhs ) @@ -1635,50 +1635,63 @@ fn ree_coercion( /// 1. At least one side of lhs and rhs should be string type (Utf8 / LargeUtf8) /// 2. Data type of the other side should be able to cast to string type /// 3. Binary and string types cannot be mixed -fn string_concat_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { +fn string_concat_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { use arrow::datatypes::DataType::*; - string_coercion(lhs_type, rhs_type).or_else(|| match (lhs_type, rhs_type) { - // Allow pure binary + binary - ( - Binary | LargeBinary | BinaryView | FixedSizeBinary(_), - Binary | LargeBinary | BinaryView | FixedSizeBinary(_), - ) => { - // Coerce fixed-sized binary to variable-sized `Binary` to make uniform signature - // with the `Binary` result - let lhs_type = match lhs_type { - FixedSizeBinary(_) => &Binary, - val => val, - }; - let rhs_type = match rhs_type { - FixedSizeBinary(_) => &Binary, - val => val, - }; - binary_coercion(lhs_type, rhs_type) - } - // Deny other mixed binary + string combinations - ( - Binary | LargeBinary | BinaryView | FixedSizeBinary(_), - Utf8 | LargeUtf8 | Utf8View, - ) => None, - ( - Utf8 | LargeUtf8 | Utf8View, - Binary | LargeBinary | BinaryView | FixedSizeBinary(_), - ) => None, - // Predicate-based coercion rules are following - (Utf8View, from_type) | (from_type, Utf8View) => { - string_concat_internal_coercion(from_type, &Utf8View) - } - (Utf8, from_type) | (from_type, Utf8) => { - string_concat_internal_coercion(from_type, &Utf8) - } - (LargeUtf8, from_type) | (from_type, LargeUtf8) => { - string_concat_internal_coercion(from_type, &LargeUtf8) - } - (Dictionary(_, lhs_value_type), Dictionary(_, rhs_value_type)) => { - string_coercion(lhs_value_type, rhs_value_type).or(None) - } - _ => None, - }) + + string_coercion(lhs_type, rhs_type) + .map(Signature::uniform) + .or_else(|| match (lhs_type, rhs_type) { + // Allow concatenation of mixed fixed size binary + (FixedSizeBinary(l), FixedSizeBinary(r)) => Some(Signature { + lhs: lhs_type.clone(), + rhs: rhs_type.clone(), + ret: FixedSizeBinary(l + r), + }), + // Allow pure binary + binary + ( + Binary | LargeBinary | BinaryView | FixedSizeBinary(_), + Binary | LargeBinary | BinaryView | FixedSizeBinary(_), + ) => { + // Coerce fixed-sized binary to variable-sized `Binary` to make uniform signature + // with the `Binary` result + let lhs_type = match lhs_type { + FixedSizeBinary(_) => &Binary, + val => val, + }; + let rhs_type = match rhs_type { + FixedSizeBinary(_) => &Binary, + val => val, + }; + binary_coercion(lhs_type, rhs_type).map(Signature::uniform) + } + // Deny other mixed binary + string combinations + ( + Binary | LargeBinary | BinaryView | FixedSizeBinary(_), + Utf8 | LargeUtf8 | Utf8View, + ) => None, + ( + Utf8 | LargeUtf8 | Utf8View, + Binary | LargeBinary | BinaryView | FixedSizeBinary(_), + ) => None, + // Predicate-based coercion rules are following + (Utf8View, from_type) | (from_type, Utf8View) => { + string_concat_internal_coercion(from_type, &Utf8View) + .map(Signature::uniform) + } + (Utf8, from_type) | (from_type, Utf8) => { + string_concat_internal_coercion(from_type, &Utf8).map(Signature::uniform) + } + (LargeUtf8, from_type) | (from_type, LargeUtf8) => { + string_concat_internal_coercion(from_type, &LargeUtf8) + .map(Signature::uniform) + } + (Dictionary(_, lhs_value_type), Dictionary(_, rhs_value_type)) => { + string_coercion(lhs_value_type, rhs_value_type) + .or(None) + .map(Signature::uniform) + } + _ => None, + }) } fn array_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option {