Re: Another regexp performance improvement: skip useless paren-captures - Mailing list pgsql-hackers

From Tom Lane
Subject Re: Another regexp performance improvement: skip useless paren-captures
Date
Msg-id 3534632.1628536485@sss.pgh.pa.us
Whole thread Raw
In response to Re: Another regexp performance improvement: skip useless paren-captures  (Mark Dilger <mark.dilger@enterprisedb.com>)
Responses Re: Another regexp performance improvement: skip useless paren-captures  (Mark Dilger <mark.dilger@enterprisedb.com>)
Re: Another regexp performance improvement: skip useless paren-captures  (Mark Dilger <mark.dilger@enterprisedb.com>)
List pgsql-hackers
Mark Dilger <mark.dilger@enterprisedb.com> writes:
> The patch looks ready to commit.  I don't expect to test it any further unless you have something in particular you'd
likeme to focus on. 

Pushed, but while re-reading it before commit I noticed that there's
some more fairly low-hanging fruit in regexp_replace().  As I had it
in that patch, it never used REG_NOSUB because of the possibility
that the replacement string uses "\N".  However, we're already
pre-checking the replacement string to see if it has backslashes
at all, so while we're at it we can check for \N to discover if we
actually need any subexpression match data or not.  We do need to
refactor a little to postpone calling pg_regcomp until after we
know that, but I think that makes replace_text_regexp's API less
ugly not more so.

While I was at it, I changed the search-for-backslash loops to
use memchr rather than handwritten looping.  Their use of
pg_mblen was pretty unnecessary given we only need to find
backslashes, and we can assume the backend encoding is ASCII-safe.

Using a bunch of random cases generated by your little perl
script, I see maybe 10-15% speedup on test cases that don't
use \N in the replacement string, while it's about a wash
on cases that do.  (If I'd been using a multibyte encoding,
maybe the memchr change would have made a difference, but
I didn't try that.)

            regards, tom lane

diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c
index 268cee1cbe..3b7a607437 100644
--- a/src/backend/utils/adt/regexp.c
+++ b/src/backend/utils/adt/regexp.c
@@ -630,11 +630,10 @@ textregexreplace_noopt(PG_FUNCTION_ARGS)
     text       *s = PG_GETARG_TEXT_PP(0);
     text       *p = PG_GETARG_TEXT_PP(1);
     text       *r = PG_GETARG_TEXT_PP(2);
-    regex_t    *re;
-
-    re = RE_compile_and_cache(p, REG_ADVANCED, PG_GET_COLLATION());

-    PG_RETURN_TEXT_P(replace_text_regexp(s, (void *) re, r, 0, 1));
+    PG_RETURN_TEXT_P(replace_text_regexp(s, p, r,
+                                         REG_ADVANCED, PG_GET_COLLATION(),
+                                         0, 1));
 }

 /*
@@ -648,7 +647,6 @@ textregexreplace(PG_FUNCTION_ARGS)
     text       *p = PG_GETARG_TEXT_PP(1);
     text       *r = PG_GETARG_TEXT_PP(2);
     text       *opt = PG_GETARG_TEXT_PP(3);
-    regex_t    *re;
     pg_re_flags flags;

     /*
@@ -672,10 +670,9 @@ textregexreplace(PG_FUNCTION_ARGS)

     parse_re_flags(&flags, opt);

-    re = RE_compile_and_cache(p, flags.cflags, PG_GET_COLLATION());
-
-    PG_RETURN_TEXT_P(replace_text_regexp(s, (void *) re, r, 0,
-                                         flags.glob ? 0 : 1));
+    PG_RETURN_TEXT_P(replace_text_regexp(s, p, r,
+                                         flags.cflags, PG_GET_COLLATION(),
+                                         0, flags.glob ? 0 : 1));
 }

 /*
@@ -694,7 +691,6 @@ textregexreplace_extended(PG_FUNCTION_ARGS)
     int            n = 1;
     text       *flags = PG_GETARG_TEXT_PP_IF_EXISTS(5);
     pg_re_flags re_flags;
-    regex_t    *re;

     /* Collect optional parameters */
     if (PG_NARGS() > 3)
@@ -723,11 +719,10 @@ textregexreplace_extended(PG_FUNCTION_ARGS)
     if (PG_NARGS() <= 4)
         n = re_flags.glob ? 0 : 1;

-    /* Compile the regular expression */
-    re = RE_compile_and_cache(p, re_flags.cflags, PG_GET_COLLATION());
-
     /* Do the replacement(s) */
-    PG_RETURN_TEXT_P(replace_text_regexp(s, (void *) re, r, start - 1, n));
+    PG_RETURN_TEXT_P(replace_text_regexp(s, p, r,
+                                         re_flags.cflags, PG_GET_COLLATION(),
+                                         start - 1, n));
 }

 /* This is separate to keep the opr_sanity regression test from complaining */
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index 348b5566de..acb8741734 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -4359,34 +4359,36 @@ replace_text(PG_FUNCTION_ARGS)
 }

 /*
- * check_replace_text_has_escape_char
+ * check_replace_text_has_escape
  *
- * check whether replace_text contains escape char.
+ * Returns 0 if text contains no backslashes that need processing.
+ * Returns 1 if text contains backslashes, but not regexp submatch specifiers.
+ * Returns 2 if text contains regexp submatch specifiers (\1 .. \9).
  */
-static bool
-check_replace_text_has_escape_char(const text *replace_text)
+static int
+check_replace_text_has_escape(const text *replace_text)
 {
+    int            result = 0;
     const char *p = VARDATA_ANY(replace_text);
     const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);

-    if (pg_database_encoding_max_length() == 1)
-    {
-        for (; p < p_end; p++)
-        {
-            if (*p == '\\')
-                return true;
-        }
-    }
-    else
+    while (p < p_end)
     {
-        for (; p < p_end; p += pg_mblen(p))
+        /* Find next escape char, if any. */
+        p = memchr(p, '\\', p_end - p);
+        if (p == NULL)
+            break;
+        p++;
+        /* Note: a backslash at the end doesn't require extra processing. */
+        if (p < p_end)
         {
-            if (*p == '\\')
-                return true;
+            if (*p >= '1' && *p <= '9')
+                return 2;        /* Found a submatch specifier, so done */
+            result = 1;            /* Found some other sequence, keep looking */
+            p++;
         }
     }
-
-    return false;
+    return result;
 }

 /*
@@ -4403,25 +4405,17 @@ appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
 {
     const char *p = VARDATA_ANY(replace_text);
     const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
-    int            eml = pg_database_encoding_max_length();

-    for (;;)
+    while (p < p_end)
     {
         const char *chunk_start = p;
         int            so;
         int            eo;

-        /* Find next escape char. */
-        if (eml == 1)
-        {
-            for (; p < p_end && *p != '\\'; p++)
-                 /* nothing */ ;
-        }
-        else
-        {
-            for (; p < p_end && *p != '\\'; p += pg_mblen(p))
-                 /* nothing */ ;
-        }
+        /* Find next escape char, if any. */
+        p = memchr(p, '\\', p_end - p);
+        if (p == NULL)
+            p = p_end;

         /* Copy the text we just scanned over, if any. */
         if (p > chunk_start)
@@ -4473,7 +4467,7 @@ appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
             continue;
         }

-        if (so != -1 && eo != -1)
+        if (so >= 0 && eo >= 0)
         {
             /*
              * Copy the text that is back reference of regexp.  Note so and eo
@@ -4491,36 +4485,37 @@ appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
     }
 }

-#define REGEXP_REPLACE_BACKREF_CNT        10
-
 /*
  * replace_text_regexp
  *
- * replace substring(s) in src_text that match regexp with replace_text.
+ * replace substring(s) in src_text that match pattern with replace_text.
+ * The replace_text can contain backslash markers to substitute
+ * (parts of) the matched text.
  *
+ * cflags: regexp compile flags.
+ * collation: collation to use.
  * search_start: the character (not byte) offset in src_text at which to
  * begin searching.
  * n: if 0, replace all matches; if > 0, replace only the N'th match.
- *
- * Note: to avoid having to include regex.h in builtins.h, we declare
- * the regexp argument as void *, but really it's regex_t *.
  */
 text *
-replace_text_regexp(text *src_text, void *regexp,
+replace_text_regexp(text *src_text, text *pattern_text,
                     text *replace_text,
+                    int cflags, Oid collation,
                     int search_start, int n)
 {
     text       *ret_text;
-    regex_t    *re = (regex_t *) regexp;
+    regex_t    *re;
     int            src_text_len = VARSIZE_ANY_EXHDR(src_text);
     int            nmatches = 0;
     StringInfoData buf;
-    regmatch_t    pmatch[REGEXP_REPLACE_BACKREF_CNT];
+    regmatch_t    pmatch[10];        /* main match, plus \1 to \9 */
+    int            nmatch = lengthof(pmatch);
     pg_wchar   *data;
     size_t        data_len;
     int            data_pos;
     char       *start_ptr;
-    bool        have_escape;
+    int            escape_status;

     initStringInfo(&buf);

@@ -4528,8 +4523,19 @@ replace_text_regexp(text *src_text, void *regexp,
     data = (pg_wchar *) palloc((src_text_len + 1) * sizeof(pg_wchar));
     data_len = pg_mb2wchar_with_len(VARDATA_ANY(src_text), data, src_text_len);

-    /* Check whether replace_text has escape char. */
-    have_escape = check_replace_text_has_escape_char(replace_text);
+    /* Check whether replace_text has escapes, especially regexp submatches. */
+    escape_status = check_replace_text_has_escape(replace_text);
+
+    /* If no regexp submatches, we can use REG_NOSUB. */
+    if (escape_status < 2)
+    {
+        cflags |= REG_NOSUB;
+        /* Also tell pg_regexec we only want the whole-match location. */
+        nmatch = 1;
+    }
+
+    /* Prepare the regexp. */
+    re = RE_compile_and_cache(pattern_text, cflags, collation);

     /* start_ptr points to the data_pos'th character of src_text */
     start_ptr = (char *) VARDATA_ANY(src_text);
@@ -4546,7 +4552,7 @@ replace_text_regexp(text *src_text, void *regexp,
                                     data_len,
                                     search_start,
                                     NULL,    /* no details */
-                                    REGEXP_REPLACE_BACKREF_CNT,
+                                    nmatch,
                                     pmatch,
                                     0);

@@ -4602,10 +4608,9 @@ replace_text_regexp(text *src_text, void *regexp,
         }

         /*
-         * Copy the replace_text. Process back references when the
-         * replace_text has escape characters.
+         * Copy the replace_text, processing escapes if any are present.
          */
-        if (have_escape)
+        if (escape_status > 0)
             appendStringInfoRegexpSubstr(&buf, replace_text, pmatch,
                                          start_ptr, data_pos);
         else
diff --git a/src/include/utils/varlena.h b/src/include/utils/varlena.h
index 6645e2af13..cd12252ed9 100644
--- a/src/include/utils/varlena.h
+++ b/src/include/utils/varlena.h
@@ -33,8 +33,9 @@ extern bool SplitDirectoriesString(char *rawstring, char separator,
                                    List **namelist);
 extern bool SplitGUCList(char *rawstring, char separator,
                          List **namelist);
-extern text *replace_text_regexp(text *src_text, void *regexp,
+extern text *replace_text_regexp(text *src_text, text *pattern_text,
                                  text *replace_text,
+                                 int cflags, Oid collation,
                                  int search_start, int n);

 #endif

pgsql-hackers by date:

Previous
From: Robert Haas
Date:
Subject: using an end-of-recovery record in all cases
Next
From: Mark Dilger
Date:
Subject: Re: Use extended statistics to estimate (Var op Var) clauses