From c94e03c63ed7524b53798ea7e64703086be470d2 Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Thu, 26 Feb 2026 11:21:26 -0800 Subject: [PATCH 01/12] Support utf8view in regex simplify optimization --- .../src/simplify_expressions/regex.rs | 61 +++++++++++-------- 1 file changed, 35 insertions(+), 26 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/regex.rs b/datafusion/optimizer/src/simplify_expressions/regex.rs index d388aaf74cdac..b28ffde5c8194 100644 --- a/datafusion/optimizer/src/simplify_expressions/regex.rs +++ b/datafusion/optimizer/src/simplify_expressions/regex.rs @@ -46,10 +46,16 @@ pub fn simplify_regex_expr( ) -> Result { let mode = OperatorMode::new(&op); - if let Expr::Literal(ScalarValue::Utf8(Some(pattern)), _) = right.as_ref() { - // Handle the special case for ".*" pattern - if pattern == ANY_CHAR_REGEX_PATTERN { - let new_expr = if mode.not { + let (pattern, is_utf8) = match right.as_ref() { + Expr::Literal(ScalarValue::Utf8(Some(p)), _) => (p.as_str(), true), + Expr::Literal(ScalarValue::Utf8View(Some(p)), _) => (p.as_str(), false), + _ => return Ok(Expr::BinaryExpr(BinaryExpr { left, op, right })), + }; + + // Handle the special case for ".*" pattern + if pattern == ANY_CHAR_REGEX_PATTERN { + let new_expr = if mode.not { + if is_utf8 { // not empty let empty_lit = Box::new(lit("")); Expr::BinaryExpr(BinaryExpr { @@ -58,32 +64,35 @@ pub fn simplify_regex_expr( right: empty_lit, }) } else { - // not null - left.is_not_null() - }; - return Ok(new_expr); - } + // Leave untouched because optimization doesn't work for Utf8View + Expr::BinaryExpr(BinaryExpr { left, op, right }) + } + } else { + // not null + left.is_not_null() + }; + return Ok(new_expr); + } - match regex_syntax::Parser::new().parse(pattern) { - Ok(hir) => { - let kind = hir.kind(); - if let HirKind::Alternation(alts) = kind { - if alts.len() <= MAX_REGEX_ALTERNATIONS_EXPANSION - && let Some(expr) = lower_alt(&mode, &left, alts) - { - return Ok(expr); - } - } 
else if let Some(expr) = lower_simple(&mode, &left, &hir) { + match regex_syntax::Parser::new().parse(pattern) { + Ok(hir) => { + let kind = hir.kind(); + if let HirKind::Alternation(alts) = kind { + if alts.len() <= MAX_REGEX_ALTERNATIONS_EXPANSION + && let Some(expr) = lower_alt(&mode, &left, alts) + { return Ok(expr); } + } else if let Some(expr) = lower_simple(&mode, &left, &hir) { + return Ok(expr); } - Err(e) => { - // error out early since the execution may fail anyways - return Err(DataFusionError::Context( - "Invalid regex".to_owned(), - Box::new(DataFusionError::External(Box::new(e))), - )); - } + } + Err(e) => { + // error out early since the execution may fail anyways + return Err(DataFusionError::Context( + "Invalid regex".to_owned(), + Box::new(DataFusionError::External(Box::new(e))), + )); } } From 295d9c88ff94bb5b4f5684914502497087dcef41 Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Thu, 26 Feb 2026 11:22:58 -0800 Subject: [PATCH 02/12] Fix sqllogictest expected plans --- datafusion/sqllogictest/test_files/simplify_expr.slt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/sqllogictest/test_files/simplify_expr.slt b/datafusion/sqllogictest/test_files/simplify_expr.slt index 99fc9900ef619..dc55e836f7ed6 100644 --- a/datafusion/sqllogictest/test_files/simplify_expr.slt +++ b/datafusion/sqllogictest/test_files/simplify_expr.slt @@ -34,10 +34,10 @@ query TT explain select b from t where b ~ '.*' ---- logical_plan -01)Filter: t.b ~ Utf8View(".*") +01)Filter: t.b IS NOT NULL 02)--TableScan: t projection=[b] physical_plan -01)FilterExec: b@0 ~ .* +01)FilterExec: b@0 IS NOT NULL 02)--DataSourceExec: partitions=1, partition_sizes=[1] query TT From 23737823d94f2d418fa0900a6607394dd0c75a1a Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Thu, 26 Feb 2026 13:40:59 -0800 Subject: [PATCH 03/12] Fix the core issue by checking and returning the proper str datatype (utf8view and largeutf8 support) --- 
.../simplify_expressions/expr_simplifier.rs | 4 +- .../src/simplify_expressions/regex.rs | 46 ++++++++++--------- 2 files changed, 26 insertions(+), 24 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index c6644e008645a..701ffc421de09 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -2126,7 +2126,7 @@ fn is_literal_or_literal_cast(expr: &Expr) -> bool { } } -fn as_string_scalar(expr: &Expr) -> Option<(DataType, &Option)> { +pub fn as_string_scalar(expr: &Expr) -> Option<(DataType, &Option)> { match expr { Expr::Literal(ScalarValue::Utf8(s), _) => Some((DataType::Utf8, s)), Expr::Literal(ScalarValue::LargeUtf8(s), _) => Some((DataType::LargeUtf8, s)), @@ -2135,7 +2135,7 @@ fn as_string_scalar(expr: &Expr) -> Option<(DataType, &Option)> { } } -fn to_string_scalar(data_type: &DataType, value: Option) -> Expr { +pub fn to_string_scalar(data_type: &DataType, value: Option) -> Expr { match data_type { DataType::Utf8 => Expr::Literal(ScalarValue::Utf8(value), None), DataType::LargeUtf8 => Expr::Literal(ScalarValue::LargeUtf8(value), None), diff --git a/datafusion/optimizer/src/simplify_expressions/regex.rs b/datafusion/optimizer/src/simplify_expressions/regex.rs index b28ffde5c8194..6126646ce0ce4 100644 --- a/datafusion/optimizer/src/simplify_expressions/regex.rs +++ b/datafusion/optimizer/src/simplify_expressions/regex.rs @@ -15,10 +15,13 @@ // specific language governing permissions and limitations // under the License. 
-use datafusion_common::{DataFusionError, Result, ScalarValue}; +use arrow::datatypes::DataType; +use datafusion_common::{DataFusionError, Result}; use datafusion_expr::{BinaryExpr, Expr, Like, Operator, lit}; use regex_syntax::hir::{Capture, Hir, HirKind, Literal, Look}; +use crate::simplify_expressions::expr_simplifier::{as_string_scalar, to_string_scalar}; + /// Maximum number of regex alternations (`foo|bar|...`) that will be expanded into multiple `LIKE` expressions. const MAX_REGEX_ALTERNATIONS_EXPANSION: usize = 4; @@ -44,29 +47,27 @@ pub fn simplify_regex_expr( op: Operator, right: Box, ) -> Result { - let mode = OperatorMode::new(&op); - - let (pattern, is_utf8) = match right.as_ref() { - Expr::Literal(ScalarValue::Utf8(Some(p)), _) => (p.as_str(), true), - Expr::Literal(ScalarValue::Utf8View(Some(p)), _) => (p.as_str(), false), - _ => return Ok(Expr::BinaryExpr(BinaryExpr { left, op, right })), + // Check if the right operand is a string literal + let Some((datatype, pattern_opt)) = as_string_scalar(&right) else { + return Ok(Expr::BinaryExpr(BinaryExpr { left, op, right })); + }; + let Some(pattern_owned) = pattern_opt.as_ref() else { + return Ok(Expr::BinaryExpr(BinaryExpr { left, op, right })); }; + let pattern = pattern_owned.as_str(); + + let mode = OperatorMode::new(&op, datatype.clone()); // Handle the special case for ".*" pattern if pattern == ANY_CHAR_REGEX_PATTERN { let new_expr = if mode.not { - if is_utf8 { - // not empty - let empty_lit = Box::new(lit("")); - Expr::BinaryExpr(BinaryExpr { - left, - op: Operator::Eq, - right: empty_lit, - }) - } else { - // Leave untouched because optimization doesn't work for Utf8View - Expr::BinaryExpr(BinaryExpr { left, op, right }) - } + // not empty + let empty_lit = Box::new(to_string_scalar(&datatype, Some("".to_string()))); + Expr::BinaryExpr(BinaryExpr { + left, + op: Operator::Eq, + right: empty_lit, + }) } else { // not null left.is_not_null() @@ -106,10 +107,11 @@ struct OperatorMode { not: bool, 
/// Ignore case (`true` for case-insensitive). i: bool, + datatype: DataType, } impl OperatorMode { - fn new(op: &Operator) -> Self { + fn new(op: &Operator, datatype: DataType) -> Self { let not = match op { Operator::RegexMatch | Operator::RegexIMatch => false, Operator::RegexNotMatch | Operator::RegexNotIMatch => true, @@ -122,7 +124,7 @@ impl OperatorMode { _ => unreachable!(), }; - Self { not, i } + Self { not, i, datatype } } /// Creates an [`LIKE`](Expr::Like) from the given `LIKE` pattern. @@ -130,7 +132,7 @@ impl OperatorMode { let like = Like { negated: self.not, expr, - pattern: Box::new(Expr::Literal(ScalarValue::from(pattern), None)), + pattern: Box::new(to_string_scalar(&self.datatype, Some(pattern))), escape_char: None, case_insensitive: self.i, }; From e1f661bc11675e938b8106016bfa2e58bf560bb0 Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Thu, 26 Feb 2026 13:41:12 -0800 Subject: [PATCH 04/12] Update sqllogictests --- datafusion/sqllogictest/test_files/simplify_expr.slt | 4 ++-- datafusion/sqllogictest/test_files/string/string_view.slt | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/datafusion/sqllogictest/test_files/simplify_expr.slt b/datafusion/sqllogictest/test_files/simplify_expr.slt index dc55e836f7ed6..f8c219e052f80 100644 --- a/datafusion/sqllogictest/test_files/simplify_expr.slt +++ b/datafusion/sqllogictest/test_files/simplify_expr.slt @@ -44,10 +44,10 @@ query TT explain select b from t where b !~ '.*' ---- logical_plan -01)Filter: t.b !~ Utf8View(".*") +01)Filter: t.b = Utf8View("") 02)--TableScan: t projection=[b] physical_plan -01)FilterExec: b@0 !~ .* +01)FilterExec: b@0 = 02)--DataSourceExec: partitions=1, partition_sizes=[1] query T diff --git a/datafusion/sqllogictest/test_files/string/string_view.slt b/datafusion/sqllogictest/test_files/string/string_view.slt index 13b0aba653efb..4dcc2f663a830 100644 --- a/datafusion/sqllogictest/test_files/string/string_view.slt +++ 
b/datafusion/sqllogictest/test_files/string/string_view.slt @@ -1100,7 +1100,7 @@ EXPLAIN SELECT FROM test; ---- logical_plan -01)Projection: test.column1_utf8view ~ Utf8View("an") AS c1 +01)Projection: test.column1_utf8view LIKE Utf8View("%an%") AS c1 02)--TableScan: test projection=[column1_utf8view] # `~*` operator (regex match case-insensitive) From 24387995427e3c04fdbb740ee8913de80299396a Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Thu, 26 Feb 2026 14:10:04 -0800 Subject: [PATCH 05/12] Properly indicate Transformed::no() when appropriate --- .../src/simplify_expressions/expr_simplifier.rs | 3 ++- .../optimizer/src/simplify_expressions/regex.rs | 16 +++++++++------- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index 701ffc421de09..f1023a31209ef 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -1660,7 +1660,8 @@ impl TreeNodeRewriter for Simplifier<'_> { left, op: op @ (RegexMatch | RegexNotMatch | RegexIMatch | RegexNotIMatch), right, - }) => Transformed::yes(simplify_regex_expr(left, op, right)?), + // }) => Transformed::yes(simplify_regex_expr(left, op, right)?), + }) => simplify_regex_expr(left, op, right)?, // Rules for Like Expr::Like(like) => { diff --git a/datafusion/optimizer/src/simplify_expressions/regex.rs b/datafusion/optimizer/src/simplify_expressions/regex.rs index 6126646ce0ce4..dfa7461557075 100644 --- a/datafusion/optimizer/src/simplify_expressions/regex.rs +++ b/datafusion/optimizer/src/simplify_expressions/regex.rs @@ -17,6 +17,7 @@ use arrow::datatypes::DataType; use datafusion_common::{DataFusionError, Result}; +use datafusion_common::tree_node::Transformed; use datafusion_expr::{BinaryExpr, Expr, Like, Operator, lit}; use regex_syntax::hir::{Capture, Hir, HirKind, Literal, Look}; @@ 
-46,13 +47,13 @@ pub fn simplify_regex_expr( left: Box, op: Operator, right: Box, -) -> Result { +) -> Result> { // Check if the right operand is a string literal let Some((datatype, pattern_opt)) = as_string_scalar(&right) else { - return Ok(Expr::BinaryExpr(BinaryExpr { left, op, right })); + return Ok(Transformed::no(Expr::BinaryExpr(BinaryExpr { left, op, right }))); }; let Some(pattern_owned) = pattern_opt.as_ref() else { - return Ok(Expr::BinaryExpr(BinaryExpr { left, op, right })); + return Ok(Transformed::no(Expr::BinaryExpr(BinaryExpr { left, op, right }))); }; let pattern = pattern_owned.as_str(); @@ -72,7 +73,7 @@ pub fn simplify_regex_expr( // not null left.is_not_null() }; - return Ok(new_expr); + return Ok(Transformed::yes(new_expr)); } match regex_syntax::Parser::new().parse(pattern) { @@ -82,10 +83,10 @@ pub fn simplify_regex_expr( if alts.len() <= MAX_REGEX_ALTERNATIONS_EXPANSION && let Some(expr) = lower_alt(&mode, &left, alts) { - return Ok(expr); + return Ok(Transformed::yes(expr)); } } else if let Some(expr) = lower_simple(&mode, &left, &hir) { - return Ok(expr); + return Ok(Transformed::yes(expr)); } } Err(e) => { @@ -98,7 +99,7 @@ pub fn simplify_regex_expr( } // Leave untouched if optimization didn't work - Ok(Expr::BinaryExpr(BinaryExpr { left, op, right })) + Ok(Transformed::no(Expr::BinaryExpr(BinaryExpr { left, op, right }))) } #[derive(Debug)] @@ -107,6 +108,7 @@ struct OperatorMode { not: bool, /// Ignore case (`true` for case-insensitive). i: bool, + /// Data type of the pattern (e.g. 
Utf8, Utf8View, LargeUtf8) datatype: DataType, } From 155f8d428f23e9a0adc4e1584aa58c52fd5281ed Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Thu, 26 Feb 2026 14:24:12 -0800 Subject: [PATCH 06/12] Clean up and fmt --- .../simplify_expressions/expr_simplifier.rs | 3 +-- .../src/simplify_expressions/regex.rs | 20 +++++++++++++++---- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index f1023a31209ef..56d3529248e9d 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -1660,8 +1660,7 @@ impl TreeNodeRewriter for Simplifier<'_> { left, op: op @ (RegexMatch | RegexNotMatch | RegexIMatch | RegexNotIMatch), right, - // }) => Transformed::yes(simplify_regex_expr(left, op, right)?), - }) => simplify_regex_expr(left, op, right)?, + }) => simplify_regex_expr(left, op, right)?, // Rules for Like Expr::Like(like) => { diff --git a/datafusion/optimizer/src/simplify_expressions/regex.rs b/datafusion/optimizer/src/simplify_expressions/regex.rs index dfa7461557075..e6d4feac42c39 100644 --- a/datafusion/optimizer/src/simplify_expressions/regex.rs +++ b/datafusion/optimizer/src/simplify_expressions/regex.rs @@ -16,8 +16,8 @@ // under the License. 
use arrow::datatypes::DataType; -use datafusion_common::{DataFusionError, Result}; use datafusion_common::tree_node::Transformed; +use datafusion_common::{DataFusionError, Result}; use datafusion_expr::{BinaryExpr, Expr, Like, Operator, lit}; use regex_syntax::hir::{Capture, Hir, HirKind, Literal, Look}; @@ -50,10 +50,18 @@ pub fn simplify_regex_expr( ) -> Result> { // Check if the right operand is a string literal let Some((datatype, pattern_opt)) = as_string_scalar(&right) else { - return Ok(Transformed::no(Expr::BinaryExpr(BinaryExpr { left, op, right }))); + return Ok(Transformed::no(Expr::BinaryExpr(BinaryExpr { + left, + op, + right, + }))); }; let Some(pattern_owned) = pattern_opt.as_ref() else { - return Ok(Transformed::no(Expr::BinaryExpr(BinaryExpr { left, op, right }))); + return Ok(Transformed::no(Expr::BinaryExpr(BinaryExpr { + left, + op, + right, + }))); }; let pattern = pattern_owned.as_str(); @@ -99,7 +107,11 @@ pub fn simplify_regex_expr( } // Leave untouched if optimization didn't work - Ok(Transformed::no(Expr::BinaryExpr(BinaryExpr { left, op, right }))) + Ok(Transformed::no(Expr::BinaryExpr(BinaryExpr { + left, + op, + right, + }))) } #[derive(Debug)] From edcdba723aeffed7624d8f8e10ad3d8aace34f7e Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Thu, 26 Feb 2026 17:23:51 -0800 Subject: [PATCH 07/12] empty commit to retrigger ci From 11b7cbac301c471baeea0130d99f6e318d417359 Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Fri, 27 Feb 2026 10:51:01 -0800 Subject: [PATCH 08/12] Review Feedback: Replace as_string_scalar and to_string_scalar with StringScalar enum --- .../simplify_expressions/expr_simplifier.rs | 67 +++++++++++++----- .../src/simplify_expressions/regex.rs | 69 +++++++++++++------ 2 files changed, 95 insertions(+), 41 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index 56d3529248e9d..4fe9d6ae55638 100644 --- 
a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -1666,8 +1666,14 @@ impl TreeNodeRewriter for Simplifier<'_> { Expr::Like(like) => { // `\` is implicit escape, see https://github.com/apache/datafusion/issues/13291 let escape_char = like.escape_char.unwrap_or('\\'); - match as_string_scalar(&like.pattern) { - Some((data_type, pattern_str)) => { + + let pattern_scalar = match like.pattern.as_ref() { + Expr::Literal(scalar, _) => scalar, + _ => return Ok(Transformed::no(Expr::Like(like))), + }; + match StringScalar::try_from_scalar(pattern_scalar) { + Some(string_scalar) => { + let pattern_str = string_scalar.as_str(); match pattern_str { None => return Ok(Transformed::yes(lit_bool_null())), Some(pattern_str) if pattern_str == "%" => { @@ -1702,10 +1708,9 @@ impl TreeNodeRewriter for Simplifier<'_> { .replace_all(pattern_str, "%") .to_string(); Transformed::yes(Expr::Like(Like { - pattern: Box::new(to_string_scalar( - &data_type, - Some(simplified_pattern), - )), + pattern: Box::new( + string_scalar.to_scalar(&simplified_pattern), + ), ..like })) } @@ -2126,21 +2131,45 @@ fn is_literal_or_literal_cast(expr: &Expr) -> bool { } } -pub fn as_string_scalar(expr: &Expr) -> Option<(DataType, &Option)> { - match expr { - Expr::Literal(ScalarValue::Utf8(s), _) => Some((DataType::Utf8, s)), - Expr::Literal(ScalarValue::LargeUtf8(s), _) => Some((DataType::LargeUtf8, s)), - Expr::Literal(ScalarValue::Utf8View(s), _) => Some((DataType::Utf8View, s)), - _ => None, - } +/// Helper for working with string scalar values (Utf8, LargeUtf8, Utf8View) +pub(crate) enum StringScalar<'a> { + Utf8(&'a ScalarValue), + LargeUtf8(&'a ScalarValue), + Utf8View(&'a ScalarValue), } -pub fn to_string_scalar(data_type: &DataType, value: Option) -> Expr { - match data_type { - DataType::Utf8 => Expr::Literal(ScalarValue::Utf8(value), None), - DataType::LargeUtf8 => Expr::Literal(ScalarValue::LargeUtf8(value), 
None), - DataType::Utf8View => Expr::Literal(ScalarValue::Utf8View(value), None), - _ => unreachable!(), +impl<'a> StringScalar<'a> { + /// Create a `StringScalar` view from a `ScalarValue` if it is a supported string type. + /// Returns `None` if the scalar value is not a supported string type. + pub(crate) fn try_from_scalar(scalar: &'a ScalarValue) -> Option { + match scalar { + ScalarValue::Utf8(_) => Some(Self::Utf8(scalar)), + ScalarValue::LargeUtf8(_) => Some(Self::LargeUtf8(scalar)), + ScalarValue::Utf8View(_) => Some(Self::Utf8View(scalar)), + _ => None, + } + } + + /// Returns the underlying string slice. + pub(crate) fn as_str(&self) -> Option<&'a str> { + match self { + Self::Utf8(scalar) | Self::LargeUtf8(scalar) | Self::Utf8View(scalar) => { + scalar.try_as_str().flatten() + } + } + } + + /// Build a new `Expr` of the same string type with the given value. + pub(crate) fn to_scalar(&self, val: &str) -> Expr { + match self { + Self::Utf8(_) => Expr::Literal(ScalarValue::Utf8(Some(val.to_owned())), None), + Self::LargeUtf8(_) => { + Expr::Literal(ScalarValue::LargeUtf8(Some(val.to_owned())), None) + } + Self::Utf8View(_) => { + Expr::Literal(ScalarValue::Utf8View(Some(val.to_owned())), None) + } + } } } diff --git a/datafusion/optimizer/src/simplify_expressions/regex.rs b/datafusion/optimizer/src/simplify_expressions/regex.rs index e6d4feac42c39..714f387f607a9 100644 --- a/datafusion/optimizer/src/simplify_expressions/regex.rs +++ b/datafusion/optimizer/src/simplify_expressions/regex.rs @@ -15,13 +15,12 @@ // specific language governing permissions and limitations // under the License. 
-use arrow::datatypes::DataType; use datafusion_common::tree_node::Transformed; use datafusion_common::{DataFusionError, Result}; use datafusion_expr::{BinaryExpr, Expr, Like, Operator, lit}; use regex_syntax::hir::{Capture, Hir, HirKind, Literal, Look}; -use crate::simplify_expressions::expr_simplifier::{as_string_scalar, to_string_scalar}; +use crate::simplify_expressions::expr_simplifier::StringScalar; /// Maximum number of regex alternations (`foo|bar|...`) that will be expanded into multiple `LIKE` expressions. const MAX_REGEX_ALTERNATIONS_EXPANSION: usize = 4; @@ -48,30 +47,39 @@ pub fn simplify_regex_expr( op: Operator, right: Box, ) -> Result> { + let right_scalar = match right.as_ref() { + Expr::Literal(scalar, _) => scalar, + _ => { + return Ok(Transformed::no(Expr::BinaryExpr(BinaryExpr { + left, + op, + right, + }))); + } + }; // Check if the right operand is a string literal - let Some((datatype, pattern_opt)) = as_string_scalar(&right) else { + let Some(string_scalar) = StringScalar::try_from_scalar(right_scalar) else { return Ok(Transformed::no(Expr::BinaryExpr(BinaryExpr { left, op, right, }))); }; - let Some(pattern_owned) = pattern_opt.as_ref() else { + let pattern = string_scalar.as_str(); + let Some(pattern) = pattern else { return Ok(Transformed::no(Expr::BinaryExpr(BinaryExpr { left, op, right, }))); }; - let pattern = pattern_owned.as_str(); - - let mode = OperatorMode::new(&op, datatype.clone()); + let mode = OperatorMode::new(&op); // Handle the special case for ".*" pattern if pattern == ANY_CHAR_REGEX_PATTERN { let new_expr = if mode.not { // not empty - let empty_lit = Box::new(to_string_scalar(&datatype, Some("".to_string()))); + let empty_lit = Box::new(string_scalar.to_scalar("")); Expr::BinaryExpr(BinaryExpr { left, op: Operator::Eq, @@ -89,11 +97,11 @@ pub fn simplify_regex_expr( let kind = hir.kind(); if let HirKind::Alternation(alts) = kind { if alts.len() <= MAX_REGEX_ALTERNATIONS_EXPANSION - && let Some(expr) = lower_alt(&mode, 
&left, alts) + && let Some(expr) = lower_alt(&mode, &left, alts, &string_scalar) { return Ok(Transformed::yes(expr)); } - } else if let Some(expr) = lower_simple(&mode, &left, &hir) { + } else if let Some(expr) = lower_simple(&mode, &left, &hir, &string_scalar) { return Ok(Transformed::yes(expr)); } } @@ -120,12 +128,10 @@ struct OperatorMode { not: bool, /// Ignore case (`true` for case-insensitive). i: bool, - /// Data type of the pattern (e.g. Utf8, Utf8View, LargeUtf8) - datatype: DataType, } impl OperatorMode { - fn new(op: &Operator, datatype: DataType) -> Self { + fn new(op: &Operator) -> Self { let not = match op { Operator::RegexMatch | Operator::RegexIMatch => false, Operator::RegexNotMatch | Operator::RegexNotIMatch => true, @@ -138,15 +144,15 @@ impl OperatorMode { _ => unreachable!(), }; - Self { not, i, datatype } + Self { not, i } } /// Creates an [`LIKE`](Expr::Like) from the given `LIKE` pattern. - fn expr(&self, expr: Box, pattern: String) -> Expr { + fn expr(&self, expr: Box, pattern: Box) -> Expr { let like = Like { negated: self.not, expr, - pattern: Box::new(to_string_scalar(&self.datatype, Some(pattern))), + pattern, escape_char: None, case_insensitive: self.i, }; @@ -336,14 +342,25 @@ fn anchored_alternation_to_exprs(v: &[Hir]) -> Option> { } /// Tries to lower (transform) a simple regex pattern to a LIKE expression. 
-fn lower_simple(mode: &OperatorMode, left: &Expr, hir: &Hir) -> Option { +fn lower_simple( + mode: &OperatorMode, + left: &Expr, + hir: &Hir, + string_scalar: &StringScalar, +) -> Option { match hir.kind() { HirKind::Empty => { - return Some(mode.expr(Box::new(left.clone()), "%".to_owned())); + return Some(mode.expr( + Box::new(left.clone()), + Box::new(string_scalar.to_scalar("%")), + )); } HirKind::Literal(l) => { let s = like_str_from_literal(l)?; - return Some(mode.expr(Box::new(left.clone()), format!("%{s}%"))); + return Some(mode.expr( + Box::new(left.clone()), + Box::new(string_scalar.to_scalar(&format!("%{s}%"))), + )); } HirKind::Concat(inner) if is_anchored_literal(inner) => { return anchored_literal_to_expr(inner).map(|right| { @@ -358,7 +375,10 @@ fn lower_simple(mode: &OperatorMode, left: &Expr, hir: &Hir) -> Option { if let Some(pattern) = partial_anchored_literal_to_like(inner) .or_else(|| collect_concat_to_like_string(inner)) { - return Some(mode.expr(Box::new(left.clone()), pattern)); + return Some(mode.expr( + Box::new(left.clone()), + Box::new(string_scalar.to_scalar(&pattern)), + )); } } _ => {} @@ -369,11 +389,16 @@ fn lower_simple(mode: &OperatorMode, left: &Expr, hir: &Hir) -> Option { /// Calls [`lower_simple`] for each alternative and combine the results with `or` or `and` /// based on [`OperatorMode`]. Any fail attempt to lower an alternative will makes this /// function to return `None`. 
-fn lower_alt(mode: &OperatorMode, left: &Expr, alts: &[Hir]) -> Option { +fn lower_alt( + mode: &OperatorMode, + left: &Expr, + alts: &[Hir], + string_scalar: &StringScalar, +) -> Option { let mut accu: Option = None; for part in alts { - if let Some(expr) = lower_simple(mode, left, part) { + if let Some(expr) = lower_simple(mode, left, part, string_scalar) { accu = match accu { Some(accu) => { if mode.not { From 94cdcc4b65c5d2dec074b1eefb22d6f0b248cda4 Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Fri, 27 Feb 2026 11:04:23 -0800 Subject: [PATCH 09/12] Use new try_from_expr() constructor and rename to_scalar() to to_expr() --- .../simplify_expressions/expr_simplifier.rs | 21 +++++++++------ .../src/simplify_expressions/regex.rs | 27 ++++++------------- 2 files changed, 21 insertions(+), 27 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index 4fe9d6ae55638..b6e7cb3994f3e 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -1667,11 +1667,7 @@ impl TreeNodeRewriter for Simplifier<'_> { // `\` is implicit escape, see https://github.com/apache/datafusion/issues/13291 let escape_char = like.escape_char.unwrap_or('\\'); - let pattern_scalar = match like.pattern.as_ref() { - Expr::Literal(scalar, _) => scalar, - _ => return Ok(Transformed::no(Expr::Like(like))), - }; - match StringScalar::try_from_scalar(pattern_scalar) { + match StringScalar::try_from_expr(&like.pattern) { Some(string_scalar) => { let pattern_str = string_scalar.as_str(); match pattern_str { @@ -1709,7 +1705,7 @@ impl TreeNodeRewriter for Simplifier<'_> { .to_string(); Transformed::yes(Expr::Like(Like { pattern: Box::new( - string_scalar.to_scalar(&simplified_pattern), + string_scalar.to_expr(&simplified_pattern), ), ..like })) @@ -2139,9 +2135,18 @@ pub(crate) enum StringScalar<'a> { } 
impl<'a> StringScalar<'a> { + /// Create a `StringScalar` view from an `Expr` if it is a supported string literal. + /// Returns `None` if the expression is not a string literal. + pub(crate) fn try_from_expr(expr: &'a Expr) -> Option { + match expr { + Expr::Literal(scalar, _) => Self::try_from_scalar(scalar), + _ => None, + } + } + /// Create a `StringScalar` view from a `ScalarValue` if it is a supported string type. /// Returns `None` if the scalar value is not a supported string type. - pub(crate) fn try_from_scalar(scalar: &'a ScalarValue) -> Option { + fn try_from_scalar(scalar: &'a ScalarValue) -> Option { match scalar { ScalarValue::Utf8(_) => Some(Self::Utf8(scalar)), ScalarValue::LargeUtf8(_) => Some(Self::LargeUtf8(scalar)), @@ -2160,7 +2165,7 @@ impl<'a> StringScalar<'a> { } /// Build a new `Expr` of the same string type with the given value. - pub(crate) fn to_scalar(&self, val: &str) -> Expr { + pub(crate) fn to_expr(&self, val: &str) -> Expr { match self { Self::Utf8(_) => Expr::Literal(ScalarValue::Utf8(Some(val.to_owned())), None), Self::LargeUtf8(_) => { diff --git a/datafusion/optimizer/src/simplify_expressions/regex.rs b/datafusion/optimizer/src/simplify_expressions/regex.rs index 714f387f607a9..6c2492d05404d 100644 --- a/datafusion/optimizer/src/simplify_expressions/regex.rs +++ b/datafusion/optimizer/src/simplify_expressions/regex.rs @@ -47,18 +47,8 @@ pub fn simplify_regex_expr( op: Operator, right: Box, ) -> Result> { - let right_scalar = match right.as_ref() { - Expr::Literal(scalar, _) => scalar, - _ => { - return Ok(Transformed::no(Expr::BinaryExpr(BinaryExpr { - left, - op, - right, - }))); - } - }; - // Check if the right operand is a string literal - let Some(string_scalar) = StringScalar::try_from_scalar(right_scalar) else { + // Check if the right operand is a supported string literal + let Some(string_scalar) = StringScalar::try_from_expr(right.as_ref()) else { return Ok(Transformed::no(Expr::BinaryExpr(BinaryExpr { left, op, @@ 
-79,7 +69,7 @@ pub fn simplify_regex_expr( if pattern == ANY_CHAR_REGEX_PATTERN { let new_expr = if mode.not { // not empty - let empty_lit = Box::new(string_scalar.to_scalar("")); + let empty_lit = Box::new(string_scalar.to_expr("")); Expr::BinaryExpr(BinaryExpr { left, op: Operator::Eq, @@ -350,16 +340,15 @@ fn lower_simple( ) -> Option { match hir.kind() { HirKind::Empty => { - return Some(mode.expr( - Box::new(left.clone()), - Box::new(string_scalar.to_scalar("%")), - )); + return Some( + mode.expr(Box::new(left.clone()), Box::new(string_scalar.to_expr("%"))), + ); } HirKind::Literal(l) => { let s = like_str_from_literal(l)?; return Some(mode.expr( Box::new(left.clone()), - Box::new(string_scalar.to_scalar(&format!("%{s}%"))), + Box::new(string_scalar.to_expr(&format!("%{s}%"))), )); } HirKind::Concat(inner) if is_anchored_literal(inner) => { @@ -377,7 +366,7 @@ fn lower_simple( { return Some(mode.expr( Box::new(left.clone()), - Box::new(string_scalar.to_scalar(&pattern)), + Box::new(string_scalar.to_expr(&pattern)), )); } } From ab79321626f2ad0d25617070318a149058fac43c Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Fri, 27 Feb 2026 11:16:31 -0800 Subject: [PATCH 10/12] Fix clippy --- .../optimizer/src/simplify_expressions/expr_simplifier.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index b6e7cb3994f3e..fe2e1a3b0408a 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -1672,7 +1672,7 @@ impl TreeNodeRewriter for Simplifier<'_> { let pattern_str = string_scalar.as_str(); match pattern_str { None => return Ok(Transformed::yes(lit_bool_null())), - Some(pattern_str) if pattern_str == "%" => { + Some("%") => { // exp LIKE '%' is // - when exp is not NULL, it's true // - when exp is NULL, it's NULL From 
5030242c05b9718283502ab85b9d956b1cf3bb86 Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Fri, 27 Feb 2026 14:53:18 -0800 Subject: [PATCH 11/12] Implement regexp optimization for .*foo.* patterns --- Cargo.lock | 1 + datafusion/optimizer/Cargo.toml | 1 + .../simplify_expressions/expr_simplifier.rs | 14 ++++++ .../src/simplify_expressions/regex.rs | 48 ++++++++++++++++++ .../sqllogictest/test_files/simplify_expr.slt | 49 +++++++++++++++++++ 5 files changed, 113 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index 5092a860e3c13..b992b12a62750 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2367,6 +2367,7 @@ dependencies = [ "datafusion-common", "datafusion-expr", "datafusion-expr-common", + "datafusion-functions", "datafusion-functions-aggregate", "datafusion-functions-window", "datafusion-functions-window-common", diff --git a/datafusion/optimizer/Cargo.toml b/datafusion/optimizer/Cargo.toml index 76d3f73f68767..5e2026f05ac2c 100644 --- a/datafusion/optimizer/Cargo.toml +++ b/datafusion/optimizer/Cargo.toml @@ -49,6 +49,7 @@ chrono = { workspace = true } datafusion-common = { workspace = true, default-features = true } datafusion-expr = { workspace = true } datafusion-expr-common = { workspace = true } +datafusion-functions = { workspace = true } datafusion-physical-expr = { workspace = true } indexmap = { workspace = true } itertools = { workspace = true } diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index fe2e1a3b0408a..9adb1d9921f37 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -2366,6 +2366,7 @@ mod tests { interval_arithmetic::Interval, *, }; + use datafusion_functions::expr_fn::contains as contains_fn; use datafusion_functions_window_common::field::WindowUDFFieldArgs; use datafusion_functions_window_common::partition::PartitionEvaluatorArgs; use 
datafusion_physical_expr::PhysicalExpr; @@ -3381,6 +3382,19 @@ mod tests { col("c1").like(lit("%foo%")), ); + // regular expression that matches a substring + assert_change( + regex_match(col("c1"), lit(".*foo.*")), + contains_fn(col("c1"), lit("foo")), + ); + + assert_change( + regex_not_match(col("c1"), lit(".*foo.*")), + Expr::Not(Box::new(contains_fn(col("c1"), lit("foo")))), + ); + + assert_change(regex_match(col("c1"), lit(".*.*")), col("c1").is_not_null()); + // regular expressions that match an exact literal assert_change(regex_match(col("c1"), lit("^$")), col("c1").eq(lit(""))); assert_change( diff --git a/datafusion/optimizer/src/simplify_expressions/regex.rs b/datafusion/optimizer/src/simplify_expressions/regex.rs index 6c2492d05404d..496fd1e51d26f 100644 --- a/datafusion/optimizer/src/simplify_expressions/regex.rs +++ b/datafusion/optimizer/src/simplify_expressions/regex.rs @@ -18,6 +18,7 @@ use datafusion_common::tree_node::Transformed; use datafusion_common::{DataFusionError, Result}; use datafusion_expr::{BinaryExpr, Expr, Like, Operator, lit}; +use datafusion_functions::expr_fn::contains; use regex_syntax::hir::{Capture, Hir, HirKind, Literal, Look}; use crate::simplify_expressions::expr_simplifier::StringScalar; @@ -34,6 +35,8 @@ const ANY_CHAR_REGEX_PATTERN: &str = ".*"; /// /// Typical cases this function can simplify: /// - empty regex pattern to `LIKE '%'` +/// - `EQ .*foo.*` to `contains(left, "foo")` +/// - `NE .*foo.*` to `NOT contains(left, "foo")` /// - literal regex patterns to `LIKE '%foo%'` /// - full anchored regex patterns (e.g. `^foo$`) to `= 'foo'` /// - partial anchored regex patterns (e.g. 
`^foo`) to `LIKE 'foo%'` @@ -82,6 +85,32 @@ pub fn simplify_regex_expr( return Ok(Transformed::yes(new_expr)); } + // Convert patterns of the form ".*foo.*" to `contains(left, "foo")` + if !mode.i + && let Some(inner) = pattern + // If pattern starts and ends with ".*" + .strip_prefix(ANY_CHAR_REGEX_PATTERN) + .and_then(|rest| rest.strip_suffix(ANY_CHAR_REGEX_PATTERN)) + // If inner is all non-special characters + && inner.chars().all(|x| !is_special_character(x)) + { + let new_expr = match (mode.not, inner.is_empty()) { + // contains(left, inner) + (false, false) => contains(*left, lit(inner)), + (false, true) => left.is_not_null(), + // not (contains(left, inner)) + (true, false) => Expr::Not(Box::new(contains(*left, lit(inner)))), + (true, true) => { + return Ok(Transformed::no(Expr::BinaryExpr(BinaryExpr { + left, + op, + right, + }))); + } + }; + return Ok(Transformed::yes(new_expr)); + } + match regex_syntax::Parser::new().parse(pattern) { Ok(hir) => { let kind = hir.kind(); @@ -202,6 +231,25 @@ fn is_safe_for_like(c: char) -> bool { (c != '%') && (c != '_') } +fn is_special_character(c: char) -> bool { + matches!( + c, + '.' | '*' + | '+' + | '?' 
+ | '|' + | '(' + | ')' + | '[' + | ']' + | '{' + | '}' + | '^' + | '$' + | '\\' + ) +} + /// Returns true if the elements in a `Concat` pattern are: /// - `[Look::Start, Look::End]` /// - `[Look::Start, Literal(_), Look::End]` diff --git a/datafusion/sqllogictest/test_files/simplify_expr.slt b/datafusion/sqllogictest/test_files/simplify_expr.slt index f8c219e052f80..d3e80a023744e 100644 --- a/datafusion/sqllogictest/test_files/simplify_expr.slt +++ b/datafusion/sqllogictest/test_files/simplify_expr.slt @@ -58,7 +58,56 @@ c query T select b from t where b !~ '.*' + +# test regex .*literal.* simplifies to contains() +query TT +explain select b from t where b ~ '.*a.*' +---- +logical_plan +01)Filter: contains(t.b, Utf8("a")) +02)--TableScan: t projection=[b] +physical_plan +01)FilterExec: contains(b@0, a) +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query T +select b from t where b ~ '.*a.*' ---- +a + +query TT +explain select b from t where b !~ '.*a.*' +---- +logical_plan +01)Filter: NOT contains(t.b, Utf8("a")) +02)--TableScan: t projection=[b] +physical_plan +01)FilterExec: NOT contains(b@0, a) +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query T +select b from t where b !~ '.*a.*' +---- +c + +query TT +explain select b from t where b ~ '.*.*' +---- +logical_plan +01)Filter: t.b IS NOT NULL +02)--TableScan: t projection=[b] +physical_plan +01)FilterExec: b@0 IS NOT NULL +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query T +select b from t where b ~ '.*.*' +---- +a +c + +query T +select b from t where b !~ '.*.*' query TT explain select * from t where a = a; From fbb9e8efab309562fc7a429178c490f8122b3736 Mon Sep 17 00:00:00 2001 From: Peter Nguyen Date: Fri, 27 Feb 2026 15:51:02 -0800 Subject: [PATCH 12/12] Optimize !~ 
'.*' and '.*.*' cases to False instead of Eq empty str condition (fixes old behavior) --- .../src/simplify_expressions/regex.rs | 19 ++++--------------- .../simplify_expressions/simplify_exprs.rs | 16 ++++++++-------- .../sqllogictest/test_files/simplify_expr.slt | 10 ++++------ 3 files changed, 16 insertions(+), 29 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/regex.rs b/datafusion/optimizer/src/simplify_expressions/regex.rs index 496fd1e51d26f..8d19f18330392 100644 --- a/datafusion/optimizer/src/simplify_expressions/regex.rs +++ b/datafusion/optimizer/src/simplify_expressions/regex.rs @@ -42,7 +42,7 @@ const ANY_CHAR_REGEX_PATTERN: &str = ".*"; /// - partial anchored regex patterns (e.g. `^foo`) to `LIKE 'foo%'` /// - combinations (alternatives) of the above, will be concatenated with `OR` or `AND` /// - `EQ .*` to NotNull -/// - `NE .*` means IS EMPTY +/// - `NE .*` to false (.* matches any string, and NULL !~ results in NULL so NOT match can never be true) /// /// Dev note: unit tests of this function are in `expr_simplifier.rs`, case `test_simplify_regex`. pub fn simplify_regex_expr( @@ -71,13 +71,8 @@ pub fn simplify_regex_expr( // Handle the special case for ".*" pattern if pattern == ANY_CHAR_REGEX_PATTERN { let new_expr = if mode.not { - // not empty - let empty_lit = Box::new(string_scalar.to_expr("")); - Expr::BinaryExpr(BinaryExpr { - left, - op: Operator::Eq, - right: empty_lit, - }) + // Always false. + lit(false) } else { // not null left.is_not_null() @@ -100,13 +95,7 @@ pub fn simplify_regex_expr( (false, true) => left.is_not_null(), // not (contains(left, inner)) (true, false) => Expr::Not(Box::new(contains(*left, lit(inner)))), - (true, true) => { - return Ok(Transformed::no(Expr::BinaryExpr(BinaryExpr { - left, - op, - right, - }))); - } + (true, true) => lit(false), // "!~ '.*.*'" is always false. 
}; return Ok(Transformed::yes(new_expr)); } diff --git a/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs b/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs index f7f100015004a..fb970ad0af996 100644 --- a/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs +++ b/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs @@ -883,17 +883,17 @@ mod tests { " )?; - // Test `!= ".*"` transforms to checking if the column is empty + // Test `!~ ".*"` transforms to false (.* matches any string, so NOT match is always false) let plan = LogicalPlanBuilder::from(table_scan.clone()) .filter(binary_expr(col("a"), Operator::RegexNotMatch, lit(".*")))? .build()?; assert_optimized_plan_equal!( plan, - @ r#" - Filter: test.a = Utf8("") + @ r" + Filter: Boolean(false) TableScan: test - "# + " )?; // Test case-insensitive versions @@ -911,17 +911,17 @@ mod tests { " )?; - // Test `!~ ".*"` (case-insensitive) transforms to checking if the column is empty + // Test `!~* ".*"` (case-insensitive) transforms to false (.* matches any string, so NOT match is always false) let plan = LogicalPlanBuilder::from(table_scan.clone()) .filter(binary_expr(col("a"), Operator::RegexNotIMatch, lit(".*")))? 
.build()?; assert_optimized_plan_equal!( plan, - @ r#" - Filter: test.a = Utf8("") + @ r" + Filter: Boolean(false) TableScan: test - "# + " ) } diff --git a/datafusion/sqllogictest/test_files/simplify_expr.slt b/datafusion/sqllogictest/test_files/simplify_expr.slt index d3e80a023744e..4f7967bbfa8d6 100644 --- a/datafusion/sqllogictest/test_files/simplify_expr.slt +++ b/datafusion/sqllogictest/test_files/simplify_expr.slt @@ -43,12 +43,8 @@ physical_plan query TT explain select b from t where b !~ '.*' ---- -logical_plan -01)Filter: t.b = Utf8View("") -02)--TableScan: t projection=[b] -physical_plan -01)FilterExec: b@0 = -02)--DataSourceExec: partitions=1, partition_sizes=[1] +logical_plan EmptyRelation: rows=0 +physical_plan EmptyExec query T select b from t where b ~ '.*' @@ -58,6 +54,7 @@ c query T select b from t where b !~ '.*' +---- # test regex .*literal.* simplifies to contains() query TT @@ -108,6 +105,7 @@ c query T select b from t where b !~ '.*.*' +---- query TT explain select * from t where a = a;