Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -1660,7 +1660,7 @@ impl TreeNodeRewriter for Simplifier<'_> {
left,
op: op @ (RegexMatch | RegexNotMatch | RegexIMatch | RegexNotIMatch),
right,
}) => Transformed::yes(simplify_regex_expr(left, op, right)?),
}) => simplify_regex_expr(left, op, right)?,

// Rules for Like
Expr::Like(like) => {
Expand Down Expand Up @@ -2126,7 +2126,7 @@ fn is_literal_or_literal_cast(expr: &Expr) -> bool {
}
}

fn as_string_scalar(expr: &Expr) -> Option<(DataType, &Option<String>)> {
pub fn as_string_scalar(expr: &Expr) -> Option<(DataType, &Option<String>)> {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we are going to make this a pub API, we should at least document them, and maybe we can clean it up bit?

Maybe the simplest thing would be to just make it pub (crate) rather than pub

What do you think about something like

pub enum StringScalar<'a> {
  Utf8(&'a ScalarValue, &'str),
  LargeUtf8(&'a ScalarValue, &'str),
  Utf8View(&'a ScalarValue, &'str),
}

impl StringScalar {
  fn try_from_scalar(scalar: &ScalarValue) -> Self { 
...
  }
 
  fn to_scalar(&self, val: &str) -> Expr {
}

That would:

  1. Avoid creating DataTypes
  2. Put some better documentation in place
  3. Encapsulate the logic a bit more

match expr {
Expr::Literal(ScalarValue::Utf8(s), _) => Some((DataType::Utf8, s)),
Expr::Literal(ScalarValue::LargeUtf8(s), _) => Some((DataType::LargeUtf8, s)),
Expand All @@ -2135,7 +2135,7 @@ fn as_string_scalar(expr: &Expr) -> Option<(DataType, &Option<String>)> {
}
}

fn to_string_scalar(data_type: &DataType, value: Option<String>) -> Expr {
pub fn to_string_scalar(data_type: &DataType, value: Option<String>) -> Expr {
match data_type {
DataType::Utf8 => Expr::Literal(ScalarValue::Utf8(value), None),
DataType::LargeUtf8 => Expr::Literal(ScalarValue::LargeUtf8(value), None),
Expand Down
111 changes: 68 additions & 43 deletions datafusion/optimizer/src/simplify_expressions/regex.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,14 @@
// specific language governing permissions and limitations
// under the License.

use datafusion_common::{DataFusionError, Result, ScalarValue};
use arrow::datatypes::DataType;
use datafusion_common::tree_node::Transformed;
use datafusion_common::{DataFusionError, Result};
use datafusion_expr::{BinaryExpr, Expr, Like, Operator, lit};
use regex_syntax::hir::{Capture, Hir, HirKind, Literal, Look};

use crate::simplify_expressions::expr_simplifier::{as_string_scalar, to_string_scalar};

/// Maximum number of regex alternations (`foo|bar|...`) that will be expanded into multiple `LIKE` expressions.
const MAX_REGEX_ALTERNATIONS_EXPANSION: usize = 4;

Expand All @@ -43,52 +47,71 @@ pub fn simplify_regex_expr(
left: Box<Expr>,
op: Operator,
right: Box<Expr>,
) -> Result<Expr> {
let mode = OperatorMode::new(&op);

if let Expr::Literal(ScalarValue::Utf8(Some(pattern)), _) = right.as_ref() {
// Handle the special case for ".*" pattern
if pattern == ANY_CHAR_REGEX_PATTERN {
let new_expr = if mode.not {
// not empty
let empty_lit = Box::new(lit(""));
Expr::BinaryExpr(BinaryExpr {
left,
op: Operator::Eq,
right: empty_lit,
})
} else {
// not null
left.is_not_null()
};
return Ok(new_expr);
}
) -> Result<Transformed<Expr>> {
// Check if the right operand is a string literal
let Some((datatype, pattern_opt)) = as_string_scalar(&right) else {
return Ok(Transformed::no(Expr::BinaryExpr(BinaryExpr {
left,
op,
right,
})));
};
let Some(pattern_owned) = pattern_opt.as_ref() else {
return Ok(Transformed::no(Expr::BinaryExpr(BinaryExpr {
left,
op,
right,
})));
};
let pattern = pattern_owned.as_str();

let mode = OperatorMode::new(&op, datatype.clone());

// Handle the special case for ".*" pattern
if pattern == ANY_CHAR_REGEX_PATTERN {
let new_expr = if mode.not {
// not empty
let empty_lit = Box::new(to_string_scalar(&datatype, Some("".to_string())));
Expr::BinaryExpr(BinaryExpr {
left,
op: Operator::Eq,
right: empty_lit,
})
} else {
// not null
left.is_not_null()
};
return Ok(Transformed::yes(new_expr));
}

match regex_syntax::Parser::new().parse(pattern) {
Ok(hir) => {
let kind = hir.kind();
if let HirKind::Alternation(alts) = kind {
if alts.len() <= MAX_REGEX_ALTERNATIONS_EXPANSION
&& let Some(expr) = lower_alt(&mode, &left, alts)
{
return Ok(expr);
}
} else if let Some(expr) = lower_simple(&mode, &left, &hir) {
return Ok(expr);
match regex_syntax::Parser::new().parse(pattern) {
Ok(hir) => {
let kind = hir.kind();
if let HirKind::Alternation(alts) = kind {
if alts.len() <= MAX_REGEX_ALTERNATIONS_EXPANSION
&& let Some(expr) = lower_alt(&mode, &left, alts)
{
return Ok(Transformed::yes(expr));
}
} else if let Some(expr) = lower_simple(&mode, &left, &hir) {
return Ok(Transformed::yes(expr));
}
Err(e) => {
// error out early since the execution may fail anyways
return Err(DataFusionError::Context(
"Invalid regex".to_owned(),
Box::new(DataFusionError::External(Box::new(e))),
));
}
}
Err(e) => {
// error out early since the execution may fail anyways
return Err(DataFusionError::Context(
"Invalid regex".to_owned(),
Box::new(DataFusionError::External(Box::new(e))),
));
}
}

// Leave untouched if optimization didn't work
Ok(Expr::BinaryExpr(BinaryExpr { left, op, right }))
Ok(Transformed::no(Expr::BinaryExpr(BinaryExpr {
left,
op,
right,
})))
}

#[derive(Debug)]
Expand All @@ -97,10 +120,12 @@ struct OperatorMode {
not: bool,
/// Ignore case (`true` for case-insensitive).
i: bool,
/// Data type of the pattern (e.g. Utf8, Utf8View, LargeUtf8)
datatype: DataType,
}

impl OperatorMode {
fn new(op: &Operator) -> Self {
fn new(op: &Operator, datatype: DataType) -> Self {
let not = match op {
Operator::RegexMatch | Operator::RegexIMatch => false,
Operator::RegexNotMatch | Operator::RegexNotIMatch => true,
Expand All @@ -113,15 +138,15 @@ impl OperatorMode {
_ => unreachable!(),
};

Self { not, i }
Self { not, i, datatype }
}

/// Creates an [`LIKE`](Expr::Like) from the given `LIKE` pattern.
fn expr(&self, expr: Box<Expr>, pattern: String) -> Expr {
let like = Like {
negated: self.not,
expr,
pattern: Box::new(Expr::Literal(ScalarValue::from(pattern), None)),
pattern: Box::new(to_string_scalar(&self.datatype, Some(pattern))),
escape_char: None,
case_insensitive: self.i,
};
Expand Down
8 changes: 4 additions & 4 deletions datafusion/sqllogictest/test_files/simplify_expr.slt
Original file line number Diff line number Diff line change
Expand Up @@ -34,20 +34,20 @@ query TT
explain select b from t where b ~ '.*'
----
logical_plan
01)Filter: t.b ~ Utf8View(".*")
01)Filter: t.b IS NOT NULL
02)--TableScan: t projection=[b]
physical_plan
01)FilterExec: b@0 ~ .*
01)FilterExec: b@0 IS NOT NULL
02)--DataSourceExec: partitions=1, partition_sizes=[1]

query TT
explain select b from t where b !~ '.*'
----
logical_plan
01)Filter: t.b !~ Utf8View(".*")
01)Filter: t.b = Utf8View("")
02)--TableScan: t projection=[b]
physical_plan
01)FilterExec: b@0 !~ .*
01)FilterExec: b@0 =
02)--DataSourceExec: partitions=1, partition_sizes=[1]

query T
Expand Down
2 changes: 1 addition & 1 deletion datafusion/sqllogictest/test_files/string/string_view.slt
Original file line number Diff line number Diff line change
Expand Up @@ -1100,7 +1100,7 @@ EXPLAIN SELECT
FROM test;
----
logical_plan
01)Projection: test.column1_utf8view ~ Utf8View("an") AS c1
01)Projection: test.column1_utf8view LIKE Utf8View("%an%") AS c1
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is certainly a better plan

02)--TableScan: test projection=[column1_utf8view]

# `~*` operator (regex match case-insensitive)
Expand Down
Loading