diff --git a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/search/SearchLexer.g4 b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/search/SearchLexer.g4 index 7767e66ac7dec7..97d59925cf8d03 100644 --- a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/search/SearchLexer.g4 +++ b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/search/SearchLexer.g4 @@ -45,6 +45,8 @@ AND : 'AND' ; OR : 'OR' ; NOT : 'NOT' | '!' ; +PLUS : '+' ; +MINUS : '-' ; LPAREN : '(' ; RPAREN : ')' ; COLON : ':' ; diff --git a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/search/SearchParser.g4 b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/search/SearchParser.g4 index 3ff445ea1d6e8b..13e3fc9cd55bf4 100644 --- a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/search/SearchParser.g4 +++ b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/search/SearchParser.g4 @@ -24,7 +24,7 @@ clause : orClause ; orClause : andClause (OR andClause)* ; // AND is optional - space-separated terms use default_operator andClause : notClause (AND? notClause)* ; -notClause : NOT atomClause | atomClause ; +notClause : NOT atomClause | MINUS atomClause | PLUS atomClause | atomClause ; // Note: fieldGroupQuery is listed before fieldQuery so ANTLR prioritizes field:(group) over field:value. // fieldQuery is listed before bareQuery so ANTLR prioritizes field:value over bare value. // This ensures "field:term" is parsed as fieldQuery, not bareQuery with "field" as term. diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SearchDslParser.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SearchDslParser.java index 2dc5c52fb1e51a..6a86513d2f22bc 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SearchDslParser.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SearchDslParser.java @@ -765,7 +765,8 @@ public QsNode visitAndClause(SearchParser.AndClauseContext ctx) { @Override public QsNode visitNotClause(SearchParser.NotClauseContext ctx) { - if (ctx.NOT() != null) { + if (ctx.NOT() != null || ctx.MINUS() != null) { + // NOT or - prefix: negate the operand QsNode child = visit(ctx.atomClause()); if (child == null) { throw new RuntimeException("Invalid NOT clause: missing operand"); @@ -1517,7 +1518,14 @@ public static QsNode expandBestFields(QsNode root, List fields) { private static QsNode expandNodeCrossFields(QsNode node, List fields, boolean luceneMode) { // MATCH_ALL_DOCS matches all documents regardless of field - don't expand if (node.getType() == QsClauseType.MATCH_ALL_DOCS) { - return new QsNode(QsClauseType.MATCH_ALL_DOCS, (List) null); + QsNode result = new QsNode(QsClauseType.MATCH_ALL_DOCS, (List) null); + // Preserve occur attribute (e.g., SHOULD from "X OR *" queries) + // Without this, occur defaults to null which BE interprets as MUST, + // changing "X OR *" from matching all docs to matching only X. + if (node.getOccur() != null) { + result.setOccur(node.getOccur()); + } + return result; } // Check if this is a leaf node (no children) @@ -1598,7 +1606,11 @@ private static boolean isLeafNode(QsNode node) { private static QsNode deepCopyWithField(QsNode node, String field, List fields) { // MATCH_ALL_DOCS matches all documents regardless of field - don't set field if (node.getType() == QsClauseType.MATCH_ALL_DOCS) { - return new QsNode(QsClauseType.MATCH_ALL_DOCS, (List) null); + QsNode result = new QsNode(QsClauseType.MATCH_ALL_DOCS, (List) null); + if (node.getOccur() != null) { + result.setOccur(node.getOccur()); + } + return result; } if (isLeafNode(node)) { // If the user explicitly wrote "field:term" syntax, preserve original field @@ -1645,7 +1657,11 @@ private static QsNode deepCopyWithField(QsNode node, String field, List private static QsNode setFieldOnLeaves(QsNode node, String field, List fields) { // MATCH_ALL_DOCS matches all documents regardless of field - don't set field if (node.getType() == QsClauseType.MATCH_ALL_DOCS) { - return new QsNode(QsClauseType.MATCH_ALL_DOCS, (List) null); + QsNode result = new QsNode(QsClauseType.MATCH_ALL_DOCS, (List) null); + if (node.getOccur() != null) { + result.setOccur(node.getOccur()); + } + return result; } if (isLeafNode(node)) { // If the user explicitly wrote "field:term" syntax, preserve original field @@ -2193,7 +2209,8 @@ private boolean hasExplicitAndBefore(SearchParser.AndClauseContext ctx, private void collectTermsFromNotClause(SearchParser.NotClauseContext ctx, List terms, QsOccur defaultOccur, boolean introducedByOr, boolean introducedByAnd) { - boolean isNegated = ctx.NOT() != null; + boolean isNegated = ctx.NOT() != null || ctx.MINUS() != null; + boolean isRequired = ctx.PLUS() != null; SearchParser.AtomClauseContext atomCtx = ctx.atomClause(); QsNode node; @@ -2223,6 +2240,7 @@ private void collectTermsFromNotClause(SearchParser.NotClauseContext ctx, List terms) { for (int i = 0; i < terms.size(); i++) { TermWithOccur current = terms.get(i); - if (current.isNegated) { - // NOT modifier - mark as MUST_NOT + if (current.isRequired) { + // + prefix: force MUST regardless of context + current.occur = QsOccur.MUST; + if (current.introducedByOr && i > 0 && useAnd) { + TermWithOccur prev = terms.get(i - 1); + if (prev.occur != QsOccur.MUST_NOT) { + prev.occur = QsOccur.SHOULD; + } + } + } else if (current.isNegated) { + // NOT or - prefix: mark as MUST_NOT current.occur = QsOccur.MUST_NOT; if (current.introducedByAnd && i > 0) { @@ -2335,7 +2362,8 @@ public QsNode visitAndClause(SearchParser.AndClauseContext ctx) { @Override public QsNode visitNotClause(SearchParser.NotClauseContext ctx) { - if (ctx.NOT() != null) { + if (ctx.NOT() != null || ctx.MINUS() != null) { + // NOT or - prefix: negate the operand QsNode child = visit(ctx.atomClause()); if (child == null) { throw new RuntimeException("Invalid NOT clause: missing operand"); @@ -2590,6 +2618,7 @@ private static class TermWithOccur { boolean introducedByOr = false; boolean introducedByAnd = false; boolean isNegated = false; + boolean isRequired = false; TermWithOccur(QsNode node, QsOccur occur) { this.node = node; diff --git a/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SearchDslParserTest.java b/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SearchDslParserTest.java index 8e562401e146bd..d20a6434d99db0 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SearchDslParserTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SearchDslParserTest.java @@ -781,6 +781,59 @@ public void testLuceneModeNotOperator() { Assertions.assertEquals(QsOccur.MUST_NOT, termNode.getOccur()); } + @Test + public void testLuceneModeMinusPrefixNot() { + // Test: "-a" should be treated as NOT a (MUST_NOT with MATCH_ALL_DOCS injection) + String dsl = "-field:a"; + String options = "{\"mode\":\"lucene\"}"; + QsPlan plan = SearchDslParser.parseDsl(dsl, options); + + Assertions.assertNotNull(plan); + Assertions.assertEquals(QsClauseType.OCCUR_BOOLEAN, plan.getRoot().getType()); + Assertions.assertEquals(2, plan.getRoot().getChildren().size()); + Assertions.assertEquals(QsClauseType.MATCH_ALL_DOCS, plan.getRoot().getChildren().get(0).getType()); + Assertions.assertEquals(QsOccur.MUST_NOT, plan.getRoot().getChildren().get(1).getOccur()); + } + + @Test + public void testLuceneModeMinusPrefixMultiple() { + // Test: "-a -b" with default_operator=and + String dsl = "-field:a -field:b"; + String options = "{\"mode\":\"lucene\",\"default_operator\":\"and\"}"; + QsPlan plan = SearchDslParser.parseDsl(dsl, options); + + Assertions.assertNotNull(plan); + Assertions.assertEquals(QsClauseType.OCCUR_BOOLEAN, plan.getRoot().getType()); + // Both are MUST_NOT + long mustNotCount = plan.getRoot().getChildren().stream() + .filter(c -> c.getOccur() == QsOccur.MUST_NOT).count(); + Assertions.assertTrue(mustNotCount >= 2, "Should have at least 2 MUST_NOT children"); + } + + @Test + public void testLuceneModePlusPrefixRequired() { + // Test: "+a b" - plus prefix forces MUST + String dsl = "+field:a field:b"; + String options = "{\"mode\":\"lucene\"}"; + QsPlan plan = SearchDslParser.parseDsl(dsl, options); + + Assertions.assertNotNull(plan); + Assertions.assertEquals(QsClauseType.OCCUR_BOOLEAN, plan.getRoot().getType()); + Assertions.assertEquals(2, plan.getRoot().getChildren().size()); + Assertions.assertEquals(QsOccur.MUST, plan.getRoot().getChildren().get(0).getOccur()); + } + + @Test + public void testStandardModeMinusPrefix() { + // Test: "-a" in standard mode should parse as NOT + String dsl = "-field:a"; + QsPlan plan = SearchDslParser.parseDsl(dsl); + + Assertions.assertNotNull(plan); + // Standard mode: NOT wrapping + Assertions.assertEquals(QsClauseType.NOT, plan.getRoot().getType()); + } + @Test public void testLuceneModeMinimumShouldMatchExplicit() { // Test: explicit minimum_should_match=1 keeps SHOULD clauses @@ -2473,4 +2526,59 @@ public void testSingleFieldMatchAllDocsLuceneMode() { Assertions.assertEquals(1, plan.getFieldBindings().size()); Assertions.assertEquals("title", plan.getFieldBindings().get(0).getFieldName()); } + + @Test + public void testMultiFieldMatchAllDocsPreservesOccurInOrQuery() { + // Test: '"Lauren Boebert" OR *' with multi-field + lucene mode + best_fields + // Bug: expandCrossFields was dropping the SHOULD occur on MATCH_ALL_DOCS nodes, + // causing BE to default to MUST, which changed the semantics from + // "phrase OR match_all" (= all docs) to "phrase AND match_all" (= only phrase matches). + String dsl = "\"Lauren Boebert\" OR *"; + String options = "{\"fields\":[\"title\",\"content\"],\"type\":\"best_fields\"," + + "\"default_operator\":\"AND\",\"mode\":\"lucene\",\"minimum_should_match\":0}"; + + QsPlan plan = SearchDslParser.parseDsl(dsl, options); + Assertions.assertNotNull(plan); + + // Root should be OCCUR_BOOLEAN + Assertions.assertEquals(QsClauseType.OCCUR_BOOLEAN, plan.getRoot().getType()); + + // Find the MATCH_ALL_DOCS child - it MUST have occur=SHOULD + boolean foundMatchAllWithShould = false; + for (QsNode child : plan.getRoot().getChildren()) { + if (child.getType() == QsClauseType.MATCH_ALL_DOCS) { + Assertions.assertEquals(QsOccur.SHOULD, child.getOccur(), + "MATCH_ALL_DOCS must preserve SHOULD occur after multi-field expansion"); + foundMatchAllWithShould = true; + } + } + Assertions.assertTrue(foundMatchAllWithShould, + "Should contain MATCH_ALL_DOCS node with SHOULD occur"); + } + + @Test + public void testMultiFieldMatchAllDocsPreservesOccurWithAndOperator() { + // Test: 'Dollar AND *' with multi-field + lucene mode + // MATCH_ALL_DOCS should have occur=MUST (from AND operator) + String dsl = "Dollar AND *"; + String options = "{\"fields\":[\"title\",\"content\"],\"type\":\"best_fields\"," + + "\"default_operator\":\"OR\",\"mode\":\"lucene\",\"minimum_should_match\":0}"; + + QsPlan plan = SearchDslParser.parseDsl(dsl, options); + Assertions.assertNotNull(plan); + + Assertions.assertEquals(QsClauseType.OCCUR_BOOLEAN, plan.getRoot().getType()); + + // Find the MATCH_ALL_DOCS child - it MUST have occur=MUST (from AND operator) + boolean foundMatchAllWithMust = false; + for (QsNode child : plan.getRoot().getChildren()) { + if (child.getType() == QsClauseType.MATCH_ALL_DOCS) { + Assertions.assertEquals(QsOccur.MUST, child.getOccur(), + "MATCH_ALL_DOCS must preserve MUST occur after multi-field expansion"); + foundMatchAllWithMust = true; + } + } + Assertions.assertTrue(foundMatchAllWithMust, + "Should contain MATCH_ALL_DOCS node with MUST occur"); + } }