Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
import org.apache.doris.nereids.rules.expression.rules.NestedCaseWhenCondToLiteral;
import org.apache.doris.nereids.rules.expression.rules.NullSafeEqualToEqual;
import org.apache.doris.nereids.rules.expression.rules.PushIntoCaseWhenBranch;
import org.apache.doris.nereids.rules.expression.rules.RegexpFunctionRewrite;
import org.apache.doris.nereids.rules.expression.rules.SimplifyComparisonPredicate;
import org.apache.doris.nereids.rules.expression.rules.SimplifyConflictCompound;
import org.apache.doris.nereids.rules.expression.rules.SimplifyInPredicate;
Expand Down Expand Up @@ -70,6 +71,7 @@ public class ExpressionOptimization extends ExpressionRewrite {
PushIntoCaseWhenBranch.INSTANCE,
NullSafeEqualToEqual.INSTANCE,
LikeToEqualRewrite.INSTANCE,
RegexpFunctionRewrite.INSTANCE,
BetweenToEqual.INSTANCE,
StringEmptyToLengthRule.INSTANCE
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ public enum ExpressionRuleType {
NULL_SAFE_EQUAL_TO_EQUAL,
PUSH_INTO_CASE_WHEN_BRANCH,
REPLACE_VARIABLE_BY_LITERAL,
REGEXP_FUNCTION_REWRITE,
SIMPLIFY_ARITHMETIC_COMPARISON,
SIMPLIFY_ARITHMETIC,
SIMPLIFY_CAST,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,217 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

package org.apache.doris.nereids.rules.expression.rules;

import org.apache.doris.nereids.rules.expression.ExpressionPatternMatcher;
import org.apache.doris.nereids.rules.expression.ExpressionPatternRuleFactory;
import org.apache.doris.nereids.rules.expression.ExpressionRuleType;
import org.apache.doris.nereids.trees.expressions.Expression;
import org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpExtract;
import org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpReplace;
import org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpReplaceOne;
import org.apache.doris.nereids.trees.expressions.literal.IntegerLikeLiteral;
import org.apache.doris.nereids.trees.expressions.literal.Literal;
import org.apache.doris.nereids.trees.expressions.literal.VarcharLiteral;

import com.google.common.collect.ImmutableList;

import java.util.List;

/**
* Rewrites regexp functions to cheaper equivalent forms when the regexp shape proves the rewrite is safe.
*/
public class RegexpFunctionRewrite implements ExpressionPatternRuleFactory {
public static final RegexpFunctionRewrite INSTANCE = new RegexpFunctionRewrite();

/**
 * Registers one rule per supported regexp function; both are reported under
 * {@link ExpressionRuleType#REGEXP_FUNCTION_REWRITE}.
 */
@Override
public List<ExpressionPatternMatcher<? extends Expression>> buildRules() {
    ExpressionPatternMatcher<? extends Expression> replaceRule = matchesType(RegexpReplace.class)
            .then(RegexpFunctionRewrite::rewriteRegexpReplace)
            .toRule(ExpressionRuleType.REGEXP_FUNCTION_REWRITE);
    ExpressionPatternMatcher<? extends Expression> extractRule = matchesType(RegexpExtract.class)
            .then(RegexpFunctionRewrite::rewriteRegexpExtract)
            .toRule(ExpressionRuleType.REGEXP_FUNCTION_REWRITE);
    return ImmutableList.of(replaceRule, extractRule);
}

private static Expression rewriteRegexpReplace(RegexpReplace regexpReplace) {
String pattern = getStringLiteral(regexpReplace.child(1));
if (pattern == null || pattern.isEmpty()) {
return regexpReplace;
}
if (!startsWithUnescapedCaret(pattern) && !endsWithUnescapedDollar(pattern)) {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This treats a trailing $ as proving that regexp_replace can match at most once, but inline regex flags can make $ match per line. For example, regexp_replace('a\na', '(?m)a$', 'x') is valid RE2 syntax; the original regexp_replace uses RE2::GlobalReplace and replaces both line-ending matches (x\nx), while the rewritten regexp_replace_one uses RE2::Replace and only replaces the first (x\na). Please skip this rewrite when the pattern can enable multiline mode, or otherwise prove the anchor is still single-match under inline flags.

return regexpReplace;
}
if (hasUnescapedAlternation(pattern) || hasInlineRegexpFlag(pattern, 'm')) {
return regexpReplace;
}

if (regexpReplace.arity() == 3) {
return new RegexpReplaceOne(regexpReplace.child(0), regexpReplace.child(1), regexpReplace.child(2));
}
return new RegexpReplaceOne(regexpReplace.child(0), regexpReplace.child(1), regexpReplace.child(2),
regexpReplace.child(3));
}

private static Expression rewriteRegexpExtract(RegexpExtract regexpExtract) {
String pattern = getStringLiteral(regexpExtract.child(1));
if (pattern == null || pattern.isEmpty() || !isPositiveGroupIndex(regexpExtract.child(2))
|| !hasCapturingGroup(pattern) || hasUnescapedAlternation(pattern)
|| hasInlineRegexpFlag(pattern, 's')) {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This guard still lets the suffix trim run for patterns that force BE's regexp_extract into the extended-regex Boost fallback when enable_extended_regex=true. That path is distinct from the RE2 inline-flag cases already discussed: RegexpExtractEngine::compile first tries RE2 with dot_nl=true, but unsupported constructs like look-around fall back to boost::regex::normal, where . does not match newlines by default. For example, with extended regex enabled, regexp_extract(concat('fooa', char(10), 'tail'), '(?<=foo)(a).*$', 1) should not match because the trailing .*$ cannot consume through the newline in Boost; after this rewrite the pattern becomes (?<=foo)(a) and returns a. Please either skip this optimization for patterns that may require the Boost fallback, or otherwise prove the runtime engine keeps the same dot/newline semantics before dropping .*$.

return regexpExtract;
}

String trimmedPattern = trimExtractPattern(pattern);
if (trimmedPattern.equals(pattern)) {
return regexpExtract;
}
return new RegexpExtract(regexpExtract.child(0), new VarcharLiteral(trimmedPattern), regexpExtract.child(2));
}

private static String trimExtractPattern(String pattern) {
String trimmed = pattern;
if (endsWithUnescapedDotStarDollar(trimmed)) {
trimmed = trimmed.substring(0, trimmed.length() - 3);
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Trimming a trailing .*$ is not semantics-preserving when inline flags disable dot-newline matching. Doris sets RE2 dot_nl by default, but RE2 inline flags can override it; for example regexp_extract(concat('a', char(10), 'b'), '(?-s)^(a).*$', 1) does not match because . cannot consume the newline, whereas the rewritten pattern (?-s)^(a) matches and returns a. Please guard against inline dot-mode changes, or avoid this rewrite for patterns containing inline flag groups that can affect ./$.

}
return trimmed;
}

/**
 * Returns the string value of {@code expression} when it is a literal of a string-like type,
 * otherwise {@code null}.
 */
private static String getStringLiteral(Expression expression) {
    boolean isStringLiteral = expression instanceof Literal && expression.getDataType().isStringLikeType();
    return isStringLiteral ? ((Literal) expression).getStringValue() : null;
}

/** Returns true when {@code expression} is an integer-like literal whose value is at least 1. */
private static boolean isPositiveGroupIndex(Expression expression) {
    if (!(expression instanceof IntegerLikeLiteral)) {
        return false;
    }
    long groupIndex = ((IntegerLikeLiteral) expression).getLongValue();
    return groupIndex > 0;
}

/**
 * Returns true when the very first character of {@code pattern} is {@code ^}.
 * A character at index 0 has nothing before it, so it can never be escaped.
 */
private static boolean startsWithUnescapedCaret(String pattern) {
    return pattern.startsWith("^");
}

/**
 * Returns true when {@code pattern} ends with a {@code $} that is neither backslash-escaped nor
 * located inside a character class.
 */
private static boolean endsWithUnescapedDollar(String pattern) {
    if (!pattern.endsWith("$")) {
        return false;
    }
    int lastIndex = pattern.length() - 1;
    return isUnescaped(pattern, lastIndex) && !isInCharClass(pattern, lastIndex);
}

/**
 * Returns true when {@code pattern} ends with {@code .*$} and the dot of that suffix is neither
 * backslash-escaped nor located inside a character class.
 */
private static boolean endsWithUnescapedDotStarDollar(String pattern) {
    final String suffix = ".*$";
    // endsWith already implies the pattern is at least as long as the suffix.
    if (!pattern.endsWith(suffix)) {
        return false;
    }
    int dotIndex = pattern.length() - suffix.length();
    return isUnescaped(pattern, dotIndex) && !isInCharClass(pattern, dotIndex);
}

/**
 * Returns true when the character at {@code pos} is not escaped, i.e. it is preceded by an even
 * number (possibly zero) of consecutive backslashes.
 */
private static boolean isUnescaped(String pattern, int pos) {
    int cursor = pos - 1;
    while (cursor >= 0 && pattern.charAt(cursor) == '\\') {
        cursor--;
    }
    // Backslashes occupy the indexes (cursor, pos); their count is pos - 1 - cursor.
    int precedingBackslashes = pos - 1 - cursor;
    return (precedingBackslashes & 1) == 0;
}

/**
 * Returns true when {@code pattern} contains a {@code |} that is not backslash-escaped and not
 * inside a {@code [...]} character class. Alternations nested in groups are reported as well.
 */
private static boolean hasUnescapedAlternation(String pattern) {
    boolean insideBrackets = false;
    int length = pattern.length();
    int index = 0;
    while (index < length) {
        if (isUnescaped(pattern, index)) {
            switch (pattern.charAt(index)) {
                case '[':
                    insideBrackets = true;
                    break;
                case ']':
                    // Closing a class that was never opened leaves the state unchanged.
                    insideBrackets = false;
                    break;
                case '|':
                    if (!insideBrackets) {
                        return true;
                    }
                    break;
                default:
                    break;
            }
        }
        index++;
    }
    return false;
}

/**
 * Returns true when {@code pattern} contains an unescaped {@code (} outside a character class that
 * is not immediately followed by {@code ?}. Every {@code (?...} form is treated as non-capturing.
 */
private static boolean hasCapturingGroup(String pattern) {
    boolean insideBrackets = false;
    int length = pattern.length();
    int index = 0;
    while (index < length) {
        if (isUnescaped(pattern, index)) {
            switch (pattern.charAt(index)) {
                case '[':
                    insideBrackets = true;
                    break;
                case ']':
                    insideBrackets = false;
                    break;
                case '(':
                    boolean isLastChar = index + 1 >= length;
                    if (!insideBrackets && (isLastChar || pattern.charAt(index + 1) != '?')) {
                        return true;
                    }
                    break;
                default:
                    break;
            }
        }
        index++;
    }
    return false;
}

/**
 * Returns true when {@code pattern} contains an inline flag group such as {@code (?m)},
 * {@code (?-s)} or {@code (?i-s:...)} whose flag list mentions {@code targetFlag}. Both enabling and
 * disabling occurrences are reported.
 *
 * @param pattern the regexp pattern text
 * @param targetFlag the flag letter to look for
 */
private static boolean hasInlineRegexpFlag(String pattern, char targetFlag) {
    int length = pattern.length();
    boolean insideBrackets = false;
    // A flag group needs at least "(?x", so the last two positions cannot start one.
    for (int open = 0; open + 2 < length; open++) {
        if (!isUnescaped(pattern, open)) {
            continue;
        }
        char current = pattern.charAt(open);
        if (current == '[') {
            insideBrackets = true;
            continue;
        }
        if (current == ']' && insideBrackets) {
            insideBrackets = false;
            continue;
        }
        boolean opensFlagGroup = current == '(' && !insideBrackets
                && pattern.charAt(open + 1) == '?' && isInlineFlagChar(pattern.charAt(open + 2));
        if (!opensFlagGroup) {
            continue;
        }
        // Walk the flag list until it is terminated by ':' / ')' or by a non-flag character.
        int cursor = open + 2;
        while (cursor < length) {
            char flag = pattern.charAt(cursor);
            if (flag == ':' || flag == ')') {
                break;
            }
            if (flag == targetFlag) {
                return true;
            }
            if (!isInlineFlagChar(flag)) {
                break;
            }
            cursor++;
        }
    }
    return false;
}

/** Returns true for the characters accepted in an inline flag list: {@code i m s U} and {@code -}. */
private static boolean isInlineFlagChar(char ch) {
    switch (ch) {
        case 'i':
        case 'm':
        case 's':
        case 'U':
        case '-':
            return true;
        default:
            return false;
    }
}

/**
 * Returns true when position {@code pos} lies inside a {@code [...]} character class, judged by the
 * unescaped brackets that appear strictly before {@code pos}.
 */
private static boolean isInCharClass(String pattern, int pos) {
    boolean insideBrackets = false;
    int index = 0;
    while (index < pos) {
        if (isUnescaped(pattern, index)) {
            char current = pattern.charAt(index);
            if (current == '[') {
                insideBrackets = true;
            } else if (current == ']') {
                // A stray ']' while outside a class keeps the state at false.
                insideBrackets = false;
            }
        }
        index++;
    }
    return insideBrackets;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

package org.apache.doris.nereids.rules.expression.rules;

import org.apache.doris.nereids.rules.expression.ExpressionRewriteTestHelper;
import org.apache.doris.nereids.rules.expression.ExpressionRuleExecutor;
import org.apache.doris.nereids.trees.expressions.Expression;
import org.apache.doris.nereids.trees.expressions.SlotReference;
import org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpExtract;
import org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpReplace;
import org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpReplaceOne;
import org.apache.doris.nereids.trees.expressions.literal.BigIntLiteral;
import org.apache.doris.nereids.trees.expressions.literal.VarcharLiteral;
import org.apache.doris.nereids.types.StringType;

import com.google.common.collect.ImmutableList;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;

/**
 * Unit tests for {@link RegexpFunctionRewrite}: each case builds a regexp call on a string slot and
 * checks whether the rule rewrites it or leaves it untouched.
 */
public class RegexpFunctionRewriteTest extends ExpressionRewriteTestHelper {
    private SlotReference str;

    @BeforeEach
    public void setup() {
        executor = new ExpressionRuleExecutor(ImmutableList.of(
                bottomUp(RegexpFunctionRewrite.INSTANCE)
        ));
        str = new SlotReference("str", StringType.INSTANCE, true);
    }

    @Test
    public void testRewriteAnchoredRegexpReplace() {
        String urlPattern = "^https?://(?:www\\.)?([^/]+)/.*$";
        assertRuleRewrite(
                new RegexpReplace(str, text(urlPattern), text("\\1")),
                new RegexpReplaceOne(str, text(urlPattern), text("\\1")));
    }

    @Test
    public void testRewriteAnchoredRegexpReplaceWithOptions() {
        assertRuleRewrite(
                new RegexpReplace(str, text("^abc"), text("x"), text("ignore_invalid_escape")),
                new RegexpReplaceOne(str, text("^abc"), text("x"), text("ignore_invalid_escape")));
    }

    @Test
    public void testDoNotRewriteRegexpReplaceWithAlternation() {
        assertRuleNoRewrite(replaceWithX("^a|b"));
    }

    @Test
    public void testDoNotRewriteRegexpReplaceWithEscapedDollar() {
        assertRuleNoRewrite(replaceWithX("a\\$"));
    }

    @Test
    public void testDoNotRewriteRegexpReplaceWithInlineMultilineFlag() {
        assertRuleNoRewrite(replaceWithX("(?m)a$"));
    }

    @Test
    public void testRewriteRegexpExtractTrimSuffix() {
        assertRuleRewrite(extract("^.*(abc).*$", 1), extract("^.*(abc)", 1));
    }

    @Test
    public void testRewriteRegexpExtractTrimSuffixOnly() {
        assertRuleRewrite(extract("^([a-z]+).*$", 1), extract("^([a-z]+)", 1));
    }

    @Test
    public void testDoNotRewriteRegexpExtractGroupZero() {
        assertRuleNoRewrite(extract("^.*(abc).*$", 0));
    }

    @Test
    public void testDoNotRewriteRegexpExtractWithoutCapture() {
        assertRuleNoRewrite(extract("^.*abc.*$", 1));
    }

    @Test
    public void testRewriteRegexpExtractLazyPrefixSuffixOnly() {
        assertRuleRewrite(extract("^.*?(abc).*$", 1), extract("^.*?(abc)", 1));
    }

    @Test
    public void testDoNotRewriteRegexpExtractEscapedDotSuffix() {
        assertRuleNoRewrite(extract("^(a)\\.*$", 1));
    }

    @Test
    public void testDoNotRewriteRegexpExtractWithAlternation() {
        assertRuleNoRewrite(extract("(a)|(b).*$", 1));
    }

    @Test
    public void testDoNotRewriteRegexpExtractWithInlineDotFlag() {
        assertRuleNoRewrite(extract("(?-s)^(a).*$", 1));
    }

    /** Wraps {@code value} in a varchar literal. */
    private static VarcharLiteral text(String value) {
        return new VarcharLiteral(value);
    }

    /** Builds {@code regexp_replace(str, pattern, 'x')}. */
    private RegexpReplace replaceWithX(String pattern) {
        return new RegexpReplace(str, text(pattern), text("x"));
    }

    /** Builds {@code regexp_extract(str, pattern, groupIndex)} with a bigint group index. */
    private RegexpExtract extract(String pattern, int groupIndex) {
        return new RegexpExtract(str, text(pattern), new BigIntLiteral(groupIndex));
    }

    /** Asserts that the rule turns {@code before} into {@code expected}. */
    private void assertRuleRewrite(Expression before, Expression expected) {
        Expression actual = executor.rewrite(before, context);
        Assertions.assertEquals(expected, actual);
    }

    /** Asserts that the rule leaves {@code before} unchanged. */
    private void assertRuleNoRewrite(Expression before) {
        Expression actual = executor.rewrite(before, context);
        Assertions.assertEquals(before, actual);
    }
}
Loading
Loading