Regexp: observable bug from careless usage of zaptreesubs - Mailing list pgsql-hackers

From Tom Lane
Subject Regexp: observable bug from careless usage of zaptreesubs
Date
Msg-id 151435.1629733387@sss.pgh.pa.us
Whole thread Raw
List pgsql-hackers
While looking at the regexp code, I started to get uncomfortable
about the implications of commit 0c3405cf1: it means that not
only the cdissect() phase, but also the preceding DFA-check phase
(longest()/shortest()) rely on saved subexpression match positions
to be valid for the match we're currently considering.  It seemed
to me that the cdissect() recursion wasn't being careful to reset
the match pointers for an abandoned submatch before moving on to
the next attempt, meaning that dfa_backref() could conceivably get
applied using a stale match pointer.

Upon poking into it, I failed to find any bug of that exact ilk,
but what I did find was a pre-existing bug of not resetting an
abandoned match pointer at all.  That allows these fun things:

regression=# select 'abcdef' ~ '^(.)\1|\1.';
 ?column? 
----------
 t
(1 row)

regression=# select 'abadef' ~ '^((.)\2|..)\2';
 ?column? 
----------
 t
(1 row)

In both of these examples, the (.) capture is in an alternation
branch that subsequently fails; therefore, the later backref
should never be able to match.  But it does, because we forgot
to clear the capture's match data on the way out.

It turns out that this can be fixed using fewer, not more, zaptreesubs
calls, if we are careful to define the recursion rules precisely.
See attached.

This bug is ancient.  I verified it as far back as PG 7.4, and
it can also be reproduced in Tcl, so it's likely aboriginal to
Spencer's library.  It's not that surprising that no one has
reported it, because regexps that have this sort of possibly-invalid
backref are most likely incorrect.  In most use-cases the existing
code will fail to match, as expected, so users would probably notice
that and fix their regexps without observing that there are cases
where the match incorrectly succeeds.

            regards, tom lane

diff --git a/src/backend/regex/regexec.c b/src/backend/regex/regexec.c
index aa5ba85514..2411e6561d 100644
--- a/src/backend/regex/regexec.c
+++ b/src/backend/regex/regexec.c
@@ -216,14 +216,14 @@ pg_regexec(regex_t *re,
     if (backref && nmatch <= v->g->nsub)
     {
         /* need larger work area */
-        if (v->g->nsub + 1 <= LOCALMAT)
+        v->nmatch = v->g->nsub + 1;
+        if (v->nmatch <= LOCALMAT)
             v->pmatch = mat;
         else
-            v->pmatch = (regmatch_t *) MALLOC((v->g->nsub + 1) *
-                                              sizeof(regmatch_t));
+            v->pmatch = (regmatch_t *) MALLOC(v->nmatch * sizeof(regmatch_t));
         if (v->pmatch == NULL)
             return REG_ESPACE;
-        v->nmatch = v->g->nsub + 1;
+        zapallsubs(v->pmatch, v->nmatch);
     }
     else
     {
@@ -488,7 +488,6 @@ find(struct vars *v,
         return REG_OKAY;

     /* find submatches */
-    zapallsubs(v->pmatch, v->nmatch);
     return cdissect(v, v->g->tree, begin, end);
 }

@@ -599,7 +598,6 @@ cfindloop(struct vars *v,
                     break;        /* no match with this begin point, try next */
                 MDEBUG(("tentative end %ld\n", LOFF(end)));
                 /* Dissect the potential match to see if it really matches */
-                zapallsubs(v->pmatch, v->nmatch);
                 er = cdissect(v, v->g->tree, begin, end);
                 if (er == REG_OKAY)
                 {
@@ -647,6 +645,8 @@ cfindloop(struct vars *v,

 /*
  * zapallsubs - initialize all subexpression matches to "no match"
+ *
+ * Note that p[0], the overall-match location, is not touched.
  */
 static void
 zapallsubs(regmatch_t *p,
@@ -716,8 +716,30 @@ subset(struct vars *v,
  * DFA and found that the proposed substring satisfies the DFA.  (We make
  * the caller do that because in concatenation and iteration nodes, it's
  * much faster to check all the substrings against the child DFAs before we
- * recurse.)  Also, caller must have cleared subexpression match data via
- * zaptreesubs (or zapallsubs at the top level).
+ * recurse.)
+ *
+ * A side-effect of a successful match is to save match locations for
+ * capturing subexpressions in v->pmatch[].  This is a little bit tricky,
+ * so we make the following rules:
+ * 1. Before initial entry to cdissect, all match data must have been
+ *    cleared (this is seen to by zapallsubs).
+ * 2. Before any recursive entry to cdissect, the match data for that
+ *    subexpression tree must be guaranteed clear (see zaptreesubs).
+ * 3. When returning REG_OKAY, each level of cdissect will have saved
+ *    any relevant match locations.
+ * 4. When returning REG_NOMATCH, each level of cdissect will guarantee
+ *    that its subexpression match locations are again clear.
+ * 5. No guarantees are made for error cases (i.e., other result codes).
+ * 6. When a level of cdissect abandons a successful sub-match, it will
+ *    clear that subtree's match locations with zaptreesubs before trying
+ *    any new DFA match or cdissect call for that subtree or any subtree
+ *    to its right (that is, any subtree that could have a backref into the
+ *    abandoned match).
+ * This may seem overly complicated, but it's difficult to simplify it
+ * because of the provision that match locations must be reset before
+ * any fresh DFA match (a rule that is needed to make dfa_backref safe).
+ * That means it won't work to just reset relevant match locations at the
+ * start of each cdissect level.
  */
 static int                        /* regexec return code */
 cdissect(struct vars *v,
@@ -841,6 +863,8 @@ ccondissect(struct vars *v,
                     MDEBUG(("%d: successful\n", t->id));
                     return REG_OKAY;
                 }
+                /* Reset left's matches (right should have done so itself) */
+                zaptreesubs(v, left);
             }
             if (er != REG_NOMATCH)
                 return er;
@@ -863,8 +887,6 @@ ccondissect(struct vars *v,
             return REG_NOMATCH;
         }
         MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid)));
-        zaptreesubs(v, left);
-        zaptreesubs(v, right);
     }

     /* can't get here */
@@ -922,6 +944,8 @@ crevcondissect(struct vars *v,
                     MDEBUG(("%d: successful\n", t->id));
                     return REG_OKAY;
                 }
+                /* Reset left's matches (right should have done so itself) */
+                zaptreesubs(v, left);
             }
             if (er != REG_NOMATCH)
                 return er;
@@ -944,8 +968,6 @@ crevcondissect(struct vars *v,
             return REG_NOMATCH;
         }
         MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid)));
-        zaptreesubs(v, left);
-        zaptreesubs(v, right);
     }

     /* can't get here */
@@ -1214,6 +1236,7 @@ citerdissect(struct vars *v,

         for (i = nverified + 1; i <= k; i++)
         {
+            /* zap any match data from a non-last iteration */
             zaptreesubs(v, t->child);
             er = cdissect(v, t->child, endpts[i - 1], endpts[i]);
             if (er == REG_OKAY)
@@ -1426,6 +1449,7 @@ creviterdissect(struct vars *v,

         for (i = nverified + 1; i <= k; i++)
         {
+            /* zap any match data from a non-last iteration */
             zaptreesubs(v, t->child);
             er = cdissect(v, t->child, endpts[i - 1], endpts[i]);
             if (er == REG_OKAY)
diff --git a/src/test/modules/test_regex/expected/test_regex.out b/src/test/modules/test_regex/expected/test_regex.out
index 6242d0baa9..4886858d66 100644
--- a/src/test/modules/test_regex/expected/test_regex.out
+++ b/src/test/modules/test_regex/expected/test_regex.out
@@ -2664,6 +2664,20 @@ select * from test_regex('^(.+)( \1)+$', 'abc abc abd', 'RP');
  {2,REG_UBACKREF,REG_UNONPOSIX}
 (1 row)

+-- expectNomatch    14.30 RP    {^(.)\1|\1.}    {abcdef}
+select * from test_regex('^(.)\1|\1.', 'abcdef', 'RP');
+           test_regex
+--------------------------------
+ {1,REG_UBACKREF,REG_UNONPOSIX}
+(1 row)
+
+-- expectNomatch    14.31 RP    {^((.)\2|..)\2}    {abadef}
+select * from test_regex('^((.)\2|..)\2', 'abadef', 'RP');
+           test_regex
+--------------------------------
+ {2,REG_UBACKREF,REG_UNONPOSIX}
+(1 row)
+
 -- back reference only matches the string, not any constraints
 select * from test_regex('(^\w+).*\1', 'abc abc abc', 'LRP');
                  test_regex
diff --git a/src/test/modules/test_regex/sql/test_regex.sql b/src/test/modules/test_regex/sql/test_regex.sql
index 389b8b61b3..418527da3d 100644
--- a/src/test/modules/test_regex/sql/test_regex.sql
+++ b/src/test/modules/test_regex/sql/test_regex.sql
@@ -775,6 +775,10 @@ select * from test_regex('^(.+)( \1)+$', 'abc abc abc', 'RP');
 select * from test_regex('^(.+)( \1)+$', 'abc abd abc', 'RP');
 -- expectNomatch    14.29 RP    {^(.+)( \1)+$}    {abc abc abd}
 select * from test_regex('^(.+)( \1)+$', 'abc abc abd', 'RP');
+-- expectNomatch    14.30 RP    {^(.)\1|\1.}    {abcdef}
+select * from test_regex('^(.)\1|\1.', 'abcdef', 'RP');
+-- expectNomatch    14.31 RP    {^((.)\2|..)\2}    {abadef}
+select * from test_regex('^((.)\2|..)\2', 'abadef', 'RP');

 -- back reference only matches the string, not any constraints
 select * from test_regex('(^\w+).*\1', 'abc abc abc', 'LRP');
diff --git a/src/test/regress/expected/regex.out b/src/test/regress/expected/regex.out
index 86477cc506..cbe2cfc3ea 100644
--- a/src/test/regress/expected/regex.out
+++ b/src/test/regress/expected/regex.out
@@ -567,6 +567,19 @@ select 'a' ~ '()+\1';
  t
 (1 row)

+-- Test ancient oversight in when to apply zaptreesubs
+select 'abcdef' ~ '^(.)\1|\1.' as f;
+ f
+---
+ f
+(1 row)
+
+select 'abadef' ~ '^((.)\2|..)\2' as f;
+ f
+---
+ f
+(1 row)
+
 -- Add coverage for some cases in checkmatchall
 select regexp_match('xy', '.|...');
  regexp_match
diff --git a/src/test/regress/sql/regex.sql b/src/test/regress/sql/regex.sql
index b03a8d9ac2..c6974a43d1 100644
--- a/src/test/regress/sql/regex.sql
+++ b/src/test/regress/sql/regex.sql
@@ -135,6 +135,10 @@ select 'a' ~ '.. ()|\1';
 select 'a' ~ '()*\1';
 select 'a' ~ '()+\1';

+-- Test ancient oversight in when to apply zaptreesubs
+select 'abcdef' ~ '^(.)\1|\1.' as f;
+select 'abadef' ~ '^((.)\2|..)\2' as f;
+
 -- Add coverage for some cases in checkmatchall
 select regexp_match('xy', '.|...');
 select regexp_match('xyz', '.|...');

pgsql-hackers by date:

Previous
From: Alvaro Herrera
Date:
Subject: Re: Mark all GUC variable as PGDLLIMPORT
Next
From: "alvherre@alvh.no-ip.org"
Date:
Subject: Re: archive status ".ready" files may be created too early