Re: Another regexp performance improvement: skip useless paren-captures - Mailing list pgsql-hackers

From Tom Lane
Subject Re: Another regexp performance improvement: skip useless paren-captures
Date
Msg-id 3643732.1628551901@sss.pgh.pa.us
Whole thread Raw
In response to Re: Another regexp performance improvement: skip useless paren-captures  (Mark Dilger <mark.dilger@enterprisedb.com>)
Responses Re: Another regexp performance improvement: skip useless paren-captures  (Mark Dilger <mark.dilger@enterprisedb.com>)
Re: Another regexp performance improvement: skip useless paren-captures  (Mark Dilger <mark.dilger@enterprisedb.com>)
List pgsql-hackers
Mark Dilger <mark.dilger@enterprisedb.com> writes:
> +select regexp_split_to_array('', '(?:((?:q+))){0}(\1){0,0}?*[^]');
> +server closed the connection unexpectedly

Here's a quick draft patch for this.  Basically it moves the
responsibility for clearing v->subs[] pointers into the freesubre()
recursion, so that it will happen for contained capturing parens
not only the top level.

There is a potentially interesting definitional question:
what exactly ought this regexp do?

        ((.)){0}\2

Because the capturing paren sets are zero-quantified, they will
never be matched to any characters, so the backref can never
have any defined referent.  I suspect that study of the POSIX
spec would lead to the conclusion that this is a legal regexp
but it will never match anything.  Implementing that would be
tedious though, and what's more it seems very unlikely that
the user wanted any such behavior.  So I think throwing an
error is an appropriate response.  The existing code will
throw such an error for

        ((.)){0}\1

so I guess Spencer did think about this to some extent -- he
just forgot about the possibility of nested parens.

This patch should work OK in HEAD and v14, but it will need
a bit of fooling-about for older branches I think, given that
they fill v->subs[] a little differently.

            regards, tom lane

diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c
index ae3a7b6a38..ae1ff670f9 100644
--- a/src/backend/regex/regcomp.c
+++ b/src/backend/regex/regcomp.c
@@ -550,8 +550,6 @@ freev(struct vars *v,
 {
     if (v->re != NULL)
         rfree(v->re);
-    if (v->subs != v->sub10)
-        FREE(v->subs);
     if (v->nfa != NULL)
         freenfa(v->nfa);
     if (v->tree != NULL)
@@ -562,6 +560,8 @@ freev(struct vars *v,
         freecvec(v->cv);
     if (v->cv2 != NULL)
         freecvec(v->cv2);
+    if (v->subs != v->sub10)
+        FREE(v->subs);
     if (v->lacons != NULL)
         freelacons(v->lacons, v->nlacons);
     ERR(err);                    /* nop if err==0 */
@@ -1091,8 +1091,8 @@ parseqatom(struct vars *v,
     {
         if (atom != NULL)
             freesubre(v, atom);
-        if (atomtype == '(')
-            v->subs[subno] = NULL;
+        /* freesubre() will clean up any v->subs[] pointers into the atom */
+        assert(atomtype != '(' || v->subs[subno] == NULL);
         delsub(v->nfa, lp, rp);
         EMPTYARC(lp, rp);
         return top;
@@ -2127,9 +2127,24 @@ freesrnode(struct vars *v,        /* might be NULL */
     if (sr == NULL)
         return;

+    /*
+     * If the node is referenced in v->subs[], clear that to avoid leaving a
+     * dangling pointer.  This should only ever matter when parseqatom() is
+     * throwing away a {0,0}-quantified atom that contains capturing parens.
+     * Doing this will cause later backrefs to such parens to fail, which is
+     * maybe not strictly POSIX but seems more helpful than allowing a backref
+     * that can never have a defined referent.
+     */
+    if (sr->capno > 0 && v != NULL)
+    {
+        assert(v->subs[sr->capno] == sr);
+        v->subs[sr->capno] = NULL;
+    }
+
     if (!NULLCNFA(sr->cnfa))
         freecnfa(&sr->cnfa);
     sr->flags = 0;                /* in particular, not INUSE */
+    sr->capno = 0;
     sr->child = sr->sibling = NULL;
     sr->begin = sr->end = NULL;

diff --git a/src/test/modules/test_regex/expected/test_regex.out b/src/test/modules/test_regex/expected/test_regex.out
index 5a6cdf47c2..850e0c8ef5 100644
--- a/src/test/modules/test_regex/expected/test_regex.out
+++ b/src/test/modules/test_regex/expected/test_regex.out
@@ -3506,6 +3506,10 @@ select * from test_regex('((.))(\2)', 'xyy', 'oRP');
  {yy,NULL,NULL,NULL}
 (2 rows)

+-- # throwing an error for this is arguably not per POSIX
+-- expectError    21.39 -        {((.)){0}(\2){0}}    ESUBREG
+select * from test_regex('((.)){0}(\2){0}', '', '-');
+ERROR:  invalid regular expression: invalid backreference number
 -- doing 22 "multicharacter collating elements"
 -- # again ugh
 -- MCCEs are not implemented in Postgres, so we skip all these tests
diff --git a/src/test/modules/test_regex/sql/test_regex.sql b/src/test/modules/test_regex/sql/test_regex.sql
index 3419564203..378f22b67b 100644
--- a/src/test/modules/test_regex/sql/test_regex.sql
+++ b/src/test/modules/test_regex/sql/test_regex.sql
@@ -1020,6 +1020,9 @@ select * from test_regex('((.))(\2){0}', 'xy', 'RPQ');
 select * from test_regex('((.))(\2)', 'xyy', 'RP');
 -- expectMatch    21.38 oRP    ((.))(\2)    xyy    yy    {}    {}    {}
 select * from test_regex('((.))(\2)', 'xyy', 'oRP');
+-- # throwing an error for this is arguably not per POSIX
+-- expectError    21.39 -        {((.)){0}(\2){0}}    ESUBREG
+select * from test_regex('((.)){0}(\2){0}', '', '-');

 -- doing 22 "multicharacter collating elements"
 -- # again ugh

pgsql-hackers by date:

Previous
From: Peter Geoghegan
Date:
Subject: Re: ECPG bug fix: DECALRE STATEMENT and DEALLOCATE, DESCRIBE
Next
From: Mark Dilger
Date:
Subject: Re: Another regexp performance improvement: skip useless paren-captures