Re: Another regexp performance improvement: skip useless paren-captures - Mailing list pgsql-hackers

From Tom Lane
Subject Re: Another regexp performance improvement: skip useless paren-captures
Date
Msg-id 3173384.1628442278@sss.pgh.pa.us
Whole thread Raw
In response to Re: Another regexp performance improvement: skip useless paren-captures  (Mark Dilger <mark.dilger@enterprisedb.com>)
Responses Re: Another regexp performance improvement: skip useless paren-captures  (Mark Dilger <mark.dilger@enterprisedb.com>)
List pgsql-hackers
Mark Dilger <mark.dilger@enterprisedb.com> writes:
> The patch triggers an assertion that master does not:

> +select 'azrlfkjbjgidgryryiglcabkgqluflu' !~ '(.(.)((.)))((?:(\3)))';

On looking into this, it's pretty simple: regexec.c has an assertion
that a pure-capture subre node ought to be doing some capturing.

        case '(':                /* no-op capture node */
            assert(t->child != NULL);
            assert(t->capno > 0);

That's fine as of HEAD, but with the proposed patch, we may notice
that the node isn't actually referenced by any backref, and remove
its capture marker, allowing this assertion to fire.  Nothing's
really wrong though.

There seem to be three things we could do about that:

1. Extend removecaptures() so that it can actually remove no-op
capture nodes if it's removed their capture markings.  This would
substantially complicate that function, and I judge that it's not
worth the trouble.  We'll only have such nodes in cases of
capturing parentheses immediately surrounding capturing parentheses,
which doesn't seem like a case worth expending sweat for.

2. Just drop the "t->capno > 0" assertion in regexec.c.

3. Weaken said assertion, perhaps by also checking the BRUSE flag bit.

Not sure that I see any point to #3, so I just dropped the
assertion in the attached.

I've also rebased over the bug fixes from the other thread,
and added a couple more test cases.

            regards, tom lane

diff --git a/contrib/pg_trgm/trgm_regexp.c b/contrib/pg_trgm/trgm_regexp.c
index bf1dea6352..64816dd370 100644
--- a/contrib/pg_trgm/trgm_regexp.c
+++ b/contrib/pg_trgm/trgm_regexp.c
@@ -541,9 +541,11 @@ createTrgmNFA(text *text_re, Oid collation,
      * Stage 1: Compile the regexp into a NFA, using the regexp library.
      */
 #ifdef IGNORECASE
-    RE_compile(®ex, text_re, REG_ADVANCED | REG_ICASE, collation);
+    RE_compile(®ex, text_re,
+               REG_ADVANCED | REG_NOSUB | REG_ICASE, collation);
 #else
-    RE_compile(®ex, text_re, REG_ADVANCED, collation);
+    RE_compile(®ex, text_re,
+               REG_ADVANCED | REG_NOSUB, collation);
 #endif

     /*
diff --git a/src/backend/regex/regc_lex.c b/src/backend/regex/regc_lex.c
index 7673dab76f..45727ffa01 100644
--- a/src/backend/regex/regc_lex.c
+++ b/src/backend/regex/regc_lex.c
@@ -528,10 +528,7 @@ next(struct vars *v)
                 }
                 assert(NOTREACHED);
             }
-            if (v->cflags & REG_NOSUB)
-                RETV('(', 0);    /* all parens non-capturing */
-            else
-                RETV('(', 1);
+            RETV('(', 1);
             break;
         case CHR(')'):
             if (LASTTYPE('('))
diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c
index 60a220c57a..ae3a7b6a38 100644
--- a/src/backend/regex/regcomp.c
+++ b/src/backend/regex/regcomp.c
@@ -65,7 +65,7 @@ static struct subre *subre(struct vars *, int, int, struct state *, struct state
 static void freesubre(struct vars *, struct subre *);
 static void freesubreandsiblings(struct vars *, struct subre *);
 static void freesrnode(struct vars *, struct subre *);
-static void optst(struct vars *, struct subre *);
+static void removecaptures(struct vars *, struct subre *);
 static int    numst(struct subre *, int);
 static void markst(struct subre *);
 static void cleanst(struct vars *);
@@ -431,7 +431,8 @@ pg_regcomp(regex_t *re,
         dumpst(v->tree, debug, 1);
     }
 #endif
-    optst(v, v->tree);
+    if (v->cflags & REG_NOSUB)
+        removecaptures(v, v->tree);
     v->ntree = numst(v->tree, 1);
     markst(v->tree);
     cleanst(v);
@@ -1013,14 +1014,16 @@ parseqatom(struct vars *v,
             break;
         case BACKREF:            /* the Feature From The Black Lagoon */
             INSIST(type != LACON, REG_ESUBREG);
-            INSIST(v->nextvalue < v->nsubs, REG_ESUBREG);
-            INSIST(v->subs[v->nextvalue] != NULL, REG_ESUBREG);
+            subno = v->nextvalue;
+            assert(subno > 0);
+            INSIST(subno < v->nsubs, REG_ESUBREG);
+            NOERRN();
+            INSIST(v->subs[subno] != NULL, REG_ESUBREG);
             NOERRN();
-            assert(v->nextvalue > 0);
             atom = subre(v, 'b', BACKR, lp, rp);
             NOERRN();
-            subno = v->nextvalue;
             atom->backno = subno;
+            v->subs[subno]->flags |= BRUSE;
             EMPTYARC(lp, rp);    /* temporarily, so there's something */
             NEXT();
             break;
@@ -2141,19 +2144,54 @@ freesrnode(struct vars *v,        /* might be NULL */
 }

 /*
- * optst - optimize a subRE subtree
+ * removecaptures - remove unnecessary capture subREs
+ *
+ * If the caller said that it doesn't care about subexpression match data,
+ * we may delete the "capture" markers on subREs that are not referenced
+ * by any backrefs, and then simplify anything that's become non-messy.
+ * Call this only if REG_NOSUB flag is set.
  */
 static void
-optst(struct vars *v,
-      struct subre *t)
+removecaptures(struct vars *v,
+               struct subre *t)
 {
+    struct subre *t2;
+
+    assert(t != NULL);
+
+    /*
+     * If this isn't itself a backref target, clear capno and tentatively
+     * clear CAP flag.
+     */
+    if (!(t->flags & BRUSE))
+    {
+        t->capno = 0;
+        t->flags &= ~CAP;
+    }
+
+    /* Now recurse to children */
+    for (t2 = t->child; t2 != NULL; t2 = t2->sibling)
+    {
+        removecaptures(v, t2);
+        /* Propagate child CAP flag back up, if it's still set */
+        if (t2->flags & CAP)
+            t->flags |= CAP;
+    }
+
     /*
-     * DGP (2007-11-13): I assume it was the programmer's intent to eventually
-     * come back and add code to optimize subRE trees, but the routine coded
-     * just spends effort traversing the tree and doing nothing. We can do
-     * nothing with less effort.
+     * If t now contains neither captures nor backrefs, there's no longer any
+     * need to care where its sub-match boundaries are, so we can reduce it to
+     * a simple DFA node.  (Note in particular that MIXED child greediness is
+     * not a hindrance here, so we don't use the MESSY() macro.)
      */
-    return;
+    if ((t->flags & (CAP | BACKR)) == 0)
+    {
+        if (t->child)
+            freesubreandsiblings(v, t->child);
+        t->child = NULL;
+        t->op = '=';
+        t->flags &= ~MIXED;
+    }
 }

 /*
@@ -2501,6 +2539,8 @@ stdump(struct subre *t,
         fprintf(f, " hascapture");
     if (t->flags & BACKR)
         fprintf(f, " hasbackref");
+    if (t->flags & BRUSE)
+        fprintf(f, " isreferenced");
     if (!(t->flags & INUSE))
         fprintf(f, " UNUSED");
     if (t->latype != (char) -1)
diff --git a/src/backend/regex/regexec.c b/src/backend/regex/regexec.c
index 5b9a087820..3800e74ef7 100644
--- a/src/backend/regex/regexec.c
+++ b/src/backend/regex/regexec.c
@@ -213,12 +213,10 @@ pg_regexec(regex_t *re,
         return REG_NOMATCH;
     backref = (v->g->info & REG_UBACKREF) ? 1 : 0;
     v->eflags = flags;
-    if (v->g->cflags & REG_NOSUB)
-        nmatch = 0;                /* override client */
     v->nmatch = nmatch;
-    if (backref)
+    if (backref && nmatch <= v->g->nsub)
     {
-        /* need work area */
+        /* need larger work area */
         if (v->g->nsub + 1 <= LOCALMAT)
             v->pmatch = mat;
         else
@@ -229,7 +227,13 @@ pg_regexec(regex_t *re,
         v->nmatch = v->g->nsub + 1;
     }
     else
+    {
+        /* we can store results directly in caller's array */
         v->pmatch = pmatch;
+        /* ensure any extra entries in caller's array are filled with -1 */
+        if (nmatch > 0)
+            zapallsubs(pmatch, nmatch);
+    }
     v->details = details;
     v->start = (chr *) string;
     v->search_start = (chr *) string + search_start;
@@ -290,12 +294,20 @@ pg_regexec(regex_t *re,
     else
         st = find(v, &v->g->tree->cnfa, &v->g->cmap);

-    /* copy (portion of) match vector over if necessary */
-    if (st == REG_OKAY && v->pmatch != pmatch && nmatch > 0)
+    /* on success, ensure caller's match vector is filled correctly */
+    if (st == REG_OKAY && nmatch > 0)
     {
-        zapallsubs(pmatch, nmatch);
-        n = (nmatch < v->nmatch) ? nmatch : v->nmatch;
-        memcpy(VS(pmatch), VS(v->pmatch), n * sizeof(regmatch_t));
+        if (v->g->cflags & REG_NOSUB)
+        {
+            /* ensure we don't return partial results */
+            zapallsubs(pmatch, nmatch);
+        }
+        else if (v->pmatch != pmatch)
+        {
+            /* copy portion of match vector over from (larger) work area */
+            assert(nmatch <= v->nmatch);
+            memcpy(VS(pmatch), VS(v->pmatch), nmatch * sizeof(regmatch_t));
+        }
     }

     /* clean up */
@@ -752,7 +764,6 @@ cdissect(struct vars *v,
             break;
         case '(':                /* no-op capture node */
             assert(t->child != NULL);
-            assert(t->capno > 0);
             er = cdissect(v, t->child, begin, end);
             break;
         default:
diff --git a/src/backend/utils/adt/jsonpath_gram.y b/src/backend/utils/adt/jsonpath_gram.y
index de3d97931e..bd5d4488a0 100644
--- a/src/backend/utils/adt/jsonpath_gram.y
+++ b/src/backend/utils/adt/jsonpath_gram.y
@@ -583,6 +583,14 @@ jspConvertRegexFlags(uint32 xflags)
                      errmsg("XQuery \"x\" flag (expanded regular expressions) is not implemented")));
     }

+    /*
+     * We'll never need sub-match details at execution.  While
+     * RE_compile_and_execute would set this flag anyway, force it on here to
+     * ensure that the regex cache entries created by makeItemLikeRegex are
+     * useful.
+     */
+    cflags |= REG_NOSUB;
+
     return cflags;
 }

diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c
index 484d4265fd..268cee1cbe 100644
--- a/src/backend/utils/adt/regexp.c
+++ b/src/backend/utils/adt/regexp.c
@@ -347,6 +347,10 @@ RE_compile_and_execute(text *text_re, char *dat, int dat_len,
 {
     regex_t    *re;

+    /* Use REG_NOSUB if caller does not want sub-match details */
+    if (nmatch < 2)
+        cflags |= REG_NOSUB;
+
     /* Compile RE */
     re = RE_compile_and_cache(text_re, cflags, collation);

@@ -1412,6 +1416,7 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
     int            orig_len;
     pg_wchar   *wide_str;
     int            wide_len;
+    int            cflags;
     regex_t    *cpattern;
     regmatch_t *pmatch;
     int            pmatch_len;
@@ -1430,7 +1435,10 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
     wide_len = pg_mb2wchar_with_len(VARDATA_ANY(orig_str), wide_str, orig_len);

     /* set up the compiled pattern */
-    cpattern = RE_compile_and_cache(pattern, re_flags->cflags, collation);
+    cflags = re_flags->cflags;
+    if (!use_subpatterns)
+        cflags |= REG_NOSUB;
+    cpattern = RE_compile_and_cache(pattern, cflags, collation);

     /* do we want to remember subpatterns? */
     if (use_subpatterns && cpattern->re_nsub > 0)
@@ -1952,7 +1960,7 @@ regexp_fixed_prefix(text *text_re, bool case_insensitive, Oid collation,
     if (case_insensitive)
         cflags |= REG_ICASE;

-    re = RE_compile_and_cache(text_re, cflags, collation);
+    re = RE_compile_and_cache(text_re, cflags | REG_NOSUB, collation);

     /* Examine it to see if there's a fixed prefix */
     re_result = pg_regprefix(re, &str, &slen);
diff --git a/src/include/regex/regex.h b/src/include/regex/regex.h
index 2b48a19fb7..0455ae8069 100644
--- a/src/include/regex/regex.h
+++ b/src/include/regex/regex.h
@@ -106,7 +106,7 @@ typedef struct
 #define REG_QUOTE    000004        /* no special characters, none */
 #define REG_NOSPEC    REG_QUOTE    /* historical synonym */
 #define REG_ICASE    000010        /* ignore case */
-#define REG_NOSUB    000020        /* don't care about subexpressions */
+#define REG_NOSUB    000020        /* caller doesn't need subexpr match data */
 #define REG_EXPANDED    000040    /* expanded format, white space & comments */
 #define REG_NLSTOP    000100        /* \n doesn't match . or [^ ] */
 #define REG_NLANCH    000200        /* ^ matches after \n, $ before */
diff --git a/src/include/regex/regguts.h b/src/include/regex/regguts.h
index 8db631c83b..91a52840c4 100644
--- a/src/include/regex/regguts.h
+++ b/src/include/regex/regguts.h
@@ -477,13 +477,14 @@ struct subre
 #define  MIXED     04                /* mixed preference below */
 #define  CAP     010            /* capturing parens here or below */
 #define  BACKR     020            /* back reference here or below */
+#define  BRUSE     040            /* is referenced by a back reference */
 #define  INUSE     0100            /* in use in final tree */
-#define  NOPROP  03                /* bits which may not propagate up */
+#define  UPPROP  (MIXED|CAP|BACKR)    /* flags which should propagate up */
 #define  LMIX(f) ((f)<<2)        /* LONGER -> MIXED */
 #define  SMIX(f) ((f)<<1)        /* SHORTER -> MIXED */
-#define  UP(f)     (((f)&~NOPROP) | (LMIX(f) & SMIX(f) & MIXED))
+#define  UP(f)     (((f)&UPPROP) | (LMIX(f) & SMIX(f) & MIXED))
 #define  MESSY(f)     ((f)&(MIXED|CAP|BACKR))
-#define  PREF(f) ((f)&NOPROP)
+#define  PREF(f)     ((f)&(LONGER|SHORTER))
 #define  PREF2(f1, f2)     ((PREF(f1) != 0) ? PREF(f1) : PREF(f2))
 #define  COMBINE(f1, f2) (UP((f1)|(f2)) | PREF2(f1, f2))
     char        latype;            /* LATYPE code, if lookaround constraint */
diff --git a/src/test/modules/test_regex/expected/test_regex.out b/src/test/modules/test_regex/expected/test_regex.out
index 44da7d2019..6f47ddde40 100644
--- a/src/test/modules/test_regex/expected/test_regex.out
+++ b/src/test/modules/test_regex/expected/test_regex.out
@@ -146,12 +146,12 @@ select * from test_regex('(a)e', 'ae', '-');
  {ae,a}
 (2 rows)

--- expectMatch    4.2  o        (a)e        ae
-select * from test_regex('(a)e', 'ae', 'o');
- test_regex
-------------
- {0}
- {NULL}
+-- expectMatch    4.2  oPR    (.)\1e        abeaae    aae    {}
+select * from test_regex('(.)\1e', 'abeaae', 'oPR');
+           test_regex
+--------------------------------
+ {1,REG_UBACKREF,REG_UNONPOSIX}
+ {aae,NULL}
 (2 rows)

 -- expectMatch    4.3  b        {\(a\)b}    ab    ab    a
@@ -2658,6 +2658,14 @@ select * from test_regex('(\w+(?= )).*\1', 'abc abcd abd', 'HLRP');
  {"abc abc",abc}
 (2 rows)

+-- exercise oversize-regmatch_t-array paths in regexec()
+-- (that case is not reachable via test_regex, sadly)
+select substring('fffoooooooooooooooooooooooooooooooo',
'^(.)\1(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)');
+ substring
+-----------
+ f
+(1 row)
+
 -- doing 15 "octal escapes vs back references"
 -- # initial zero is always octal
 -- expectMatch    15.1  MP    "a\\010b"    "a\bb"    "a\bb"
@@ -3476,6 +3484,22 @@ select * from test_regex('((.))(\2){0}', 'xy', 'RPQ');
  {x,x,x,NULL}
 (2 rows)

+-- expectMatch    21.37 RP    ((.))(\2)    xyy    yy    y    y    y
+select * from test_regex('((.))(\2)', 'xyy', 'RP');
+           test_regex
+--------------------------------
+ {3,REG_UBACKREF,REG_UNONPOSIX}
+ {yy,y,y,y}
+(2 rows)
+
+-- expectMatch    21.38 oRP    ((.))(\2)    xyy    yy    {}    {}    {}
+select * from test_regex('((.))(\2)', 'xyy', 'oRP');
+           test_regex
+--------------------------------
+ {3,REG_UBACKREF,REG_UNONPOSIX}
+ {yy,NULL,NULL,NULL}
+(2 rows)
+
 -- doing 22 "multicharacter collating elements"
 -- # again ugh
 -- MCCEs are not implemented in Postgres, so we skip all these tests
diff --git a/src/test/modules/test_regex/sql/test_regex.sql b/src/test/modules/test_regex/sql/test_regex.sql
index 9224fdfdd3..2861708932 100644
--- a/src/test/modules/test_regex/sql/test_regex.sql
+++ b/src/test/modules/test_regex/sql/test_regex.sql
@@ -63,8 +63,8 @@ select * from test_regex('ab', 'ab', 'b');

 -- expectMatch    4.1  -        (a)e        ae    ae    a
 select * from test_regex('(a)e', 'ae', '-');
--- expectMatch    4.2  o        (a)e        ae
-select * from test_regex('(a)e', 'ae', 'o');
+-- expectMatch    4.2  oPR    (.)\1e        abeaae    aae    {}
+select * from test_regex('(.)\1e', 'abeaae', 'oPR');
 -- expectMatch    4.3  b        {\(a\)b}    ab    ab    a
 select * from test_regex('\(a\)b', 'ab', 'b');
 -- expectMatch    4.4  -        a((b)c)        abc    abc    bc    b
@@ -775,6 +775,10 @@ select * from test_regex('(^\w+).*\1', 'abc abc abc', 'LRP');
 select * from test_regex('(^\w+\M).*\1', 'abc abcd abd', 'LRP');
 select * from test_regex('(\w+(?= )).*\1', 'abc abcd abd', 'HLRP');

+-- exercise oversize-regmatch_t-array paths in regexec()
+-- (that case is not reachable via test_regex, sadly)
+select substring('fffoooooooooooooooooooooooooooooooo',
'^(.)\1(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)');
+
 -- doing 15 "octal escapes vs back references"

 -- # initial zero is always octal
@@ -1011,6 +1015,10 @@ select * from test_regex('(a*)*', 'bc', 'N');
 select * from test_regex(' TO (([a-z0-9._]+|"([^"]+|"")+")+)', 'asd TO foo', 'M');
 -- expectMatch    21.36 RPQ    ((.))(\2){0}    xy    x    x    x    {}
 select * from test_regex('((.))(\2){0}', 'xy', 'RPQ');
+-- expectMatch    21.37 RP    ((.))(\2)    xyy    yy    y    y    y
+select * from test_regex('((.))(\2)', 'xyy', 'RP');
+-- expectMatch    21.38 oRP    ((.))(\2)    xyy    yy    {}    {}    {}
+select * from test_regex('((.))(\2)', 'xyy', 'oRP');

 -- doing 22 "multicharacter collating elements"
 -- # again ugh

pgsql-hackers by date:

Previous
From: Mark Dilger
Date:
Subject: Re: Assert triggered during RE_compile_and_cache
Next
From: Mark Dilger
Date:
Subject: Re: Assert triggered during RE_compile_and_cache