Re: Assert triggered during RE_compile_and_cache - Mailing list pgsql-hackers

From Tom Lane
Subject Re: Assert triggered during RE_compile_and_cache
Date
Msg-id 2860330.1628384606@sss.pgh.pa.us
Whole thread Raw
In response to Re: Assert triggered during RE_compile_and_cache  (Tom Lane <tgl@sss.pgh.pa.us>)
Responses Re: Assert triggered during RE_compile_and_cache
Re: Assert triggered during RE_compile_and_cache
List pgsql-hackers
I wrote:
> ... What surprises me about this is not that
> crash, but that we didn't see fundamental breakage of backref-using
> patterns all over the place.  It evidently breaks only in corner
> cases, but I'm not quite sure why the effects are so limited.

Ah-hah, I've sussed it.  I'd supposed that the issue was that parseqatom
could replace the subre atom's endpoints in the bit where it starts
to "prepare a general-purpose state skeleton".  However, it seems that
that's not what makes it go off the rails.  Rather, in the specific
combination we've got in the test case, the next outer recursion level
of parseqatom() decides that it can free the subre that was returned
by the inner level, *which is the same subre that v->subs[n] is pointing
to*.  So this is really a use-after-free problem.  freesrnode() won't
actually free() the node, just stick it back onto the chain for possible
recycling --- and it doesn't clear the state pointer fields when it
does that.  So there's no use-after-free that ordinary tools could detect.
We only see a problem manifest if the subre node is reused later, which
explains why '(())(\2){0}' fails while '(())\2{0}' does not.

So the problem boils down to parseqatom deciding it can free child
subre nodes that it knows little about the provenance of.  It would
be safer to make that optimization by instead freeing the "top"
node passed in from parsebranch, which we know has got no other
interesting linkages.  That requires tweaking the API of parseqatom,
which why I'd not done it like that to begin with --- but that's not
a hard or complicated change, just a mite tedious.

Hence, the attached patch.

While this is sufficient to make the problem go away, I'm
inclined to apply both changesets.  Even if it accidentally
works right now to have later backrefs consult the outer s/s2
state pair rather than the original pair, that seems far too
convoluted and bug-prone.  The outer states should be strictly
the concern of the iteration setup logic in the outer invocation
of parseqatom.

(I'm sort of wondering now whether the outer s/s2 states are
even really necessary anymore ... maybe Spencer put those in
as a way of preventing some prehistoric version of this bug.
But I'm not excited about messing with that right now.)

            regards, tom lane

diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c
index 9f71177d31..3d7f11af8c 100644
--- a/src/backend/regex/regcomp.c
+++ b/src/backend/regex/regcomp.c
@@ -43,7 +43,7 @@ static int    freev(struct vars *, int);
 static void makesearch(struct vars *, struct nfa *);
 static struct subre *parse(struct vars *, int, int, struct state *, struct state *);
 static struct subre *parsebranch(struct vars *, int, int, struct state *, struct state *, int);
-static void parseqatom(struct vars *, int, int, struct state *, struct state *, struct subre *);
+static struct subre *parseqatom(struct vars *, int, int, struct state *, struct state *, struct subre *);
 static void nonword(struct vars *, int, struct state *, struct state *);
 static void word(struct vars *, int, struct state *, struct state *);
 static void charclass(struct vars *, enum char_classes,
@@ -756,7 +756,7 @@ parsebranch(struct vars *v,
         seencontent = 1;

         /* NB, recursion in parseqatom() may swallow rest of branch */
-        parseqatom(v, stopper, type, lp, right, t);
+        t = parseqatom(v, stopper, type, lp, right, t);
         NOERRN();
     }

@@ -777,8 +777,12 @@ parsebranch(struct vars *v,
  * The bookkeeping near the end cooperates very closely with parsebranch();
  * in particular, it contains a recursion that can involve parsing the rest
  * of the branch, making this function's name somewhat inaccurate.
+ *
+ * Usually, the return value is just "top", but in some cases where we
+ * have parsed the rest of the branch, we may deem "top" redundant and
+ * free it, returning some child subre instead.
  */
-static void
+static struct subre *
 parseqatom(struct vars *v,
            int stopper,            /* EOS or ')' */
            int type,            /* LACON (lookaround subRE) or PLAIN */
@@ -818,84 +822,84 @@ parseqatom(struct vars *v,
             if (v->cflags & REG_NLANCH)
                 ARCV(BEHIND, v->nlcolor);
             NEXT();
-            return;
+            return top;
             break;
         case '$':
             ARCV('$', 1);
             if (v->cflags & REG_NLANCH)
                 ARCV(AHEAD, v->nlcolor);
             NEXT();
-            return;
+            return top;
             break;
         case SBEGIN:
             ARCV('^', 1);        /* BOL */
             ARCV('^', 0);        /* or BOS */
             NEXT();
-            return;
+            return top;
             break;
         case SEND:
             ARCV('$', 1);        /* EOL */
             ARCV('$', 0);        /* or EOS */
             NEXT();
-            return;
+            return top;
             break;
         case '<':
             wordchrs(v);
             s = newstate(v->nfa);
-            NOERR();
+            NOERRN();
             nonword(v, BEHIND, lp, s);
             word(v, AHEAD, s, rp);
             NEXT();
-            return;
+            return top;
             break;
         case '>':
             wordchrs(v);
             s = newstate(v->nfa);
-            NOERR();
+            NOERRN();
             word(v, BEHIND, lp, s);
             nonword(v, AHEAD, s, rp);
             NEXT();
-            return;
+            return top;
             break;
         case WBDRY:
             wordchrs(v);
             s = newstate(v->nfa);
-            NOERR();
+            NOERRN();
             nonword(v, BEHIND, lp, s);
             word(v, AHEAD, s, rp);
             s = newstate(v->nfa);
-            NOERR();
+            NOERRN();
             word(v, BEHIND, lp, s);
             nonword(v, AHEAD, s, rp);
             NEXT();
-            return;
+            return top;
             break;
         case NWBDRY:
             wordchrs(v);
             s = newstate(v->nfa);
-            NOERR();
+            NOERRN();
             word(v, BEHIND, lp, s);
             word(v, AHEAD, s, rp);
             s = newstate(v->nfa);
-            NOERR();
+            NOERRN();
             nonword(v, BEHIND, lp, s);
             nonword(v, AHEAD, s, rp);
             NEXT();
-            return;
+            return top;
             break;
         case LACON:                /* lookaround constraint */
             latype = v->nextvalue;
             NEXT();
             s = newstate(v->nfa);
             s2 = newstate(v->nfa);
-            NOERR();
+            NOERRN();
             t = parse(v, ')', LACON, s, s2);
             freesubre(v, t);    /* internal structure irrelevant */
-            NOERR();
+            NOERRN();
             assert(SEE(')'));
             NEXT();
             processlacon(v, s, s2, latype, lp, rp);
-            return;
+            return top;
             break;
             /* then errors, to get them out of the way */
         case '*':
@@ -903,18 +907,18 @@ parseqatom(struct vars *v,
         case '?':
         case '{':
             ERR(REG_BADRPT);
-            return;
+            return top;
             break;
         default:
             ERR(REG_ASSERT);
-            return;
+            return top;
             break;
             /* then plain characters, and minor variants on that theme */
         case ')':                /* unbalanced paren */
             if ((v->cflags & REG_ADVANCED) != REG_EXTENDED)
             {
                 ERR(REG_EPAREN);
-                return;
+                return top;
             }
             /* legal in EREs due to specification botch */
             NOTE(REG_UPBOTCH);
@@ -923,7 +927,7 @@ parseqatom(struct vars *v,
         case PLAIN:
             onechr(v, v->nextvalue, lp, rp);
             okcolors(v->nfa, v->cm);
-            NOERR();
+            NOERRN();
             NEXT();
             break;
         case '[':
@@ -972,14 +976,14 @@ parseqatom(struct vars *v,
              */
             s = newstate(v->nfa);
             s2 = newstate(v->nfa);
-            NOERR();
+            NOERRN();
             EMPTYARC(lp, s);
             EMPTYARC(s2, rp);
-            NOERR();
+            NOERRN();
             atom = parse(v, ')', type, s, s2);
             assert(SEE(')') || ISERR());
             NEXT();
-            NOERR();
+            NOERRN();
             if (cap)
             {
                 assert(v->subs[subno] == NULL);
@@ -994,7 +998,7 @@ parseqatom(struct vars *v,
                 {
                     /* generate no-op wrapper node to handle "((x))" */
                     t = subre(v, '(', atom->flags | CAP, lp, rp);
-                    NOERR();
+                    NOERRN();
                     t->capno = subno;
                     t->child = atom;
                     atom = t;
@@ -1006,10 +1010,10 @@ parseqatom(struct vars *v,
             INSIST(type != LACON, REG_ESUBREG);
             INSIST(v->nextvalue < v->nsubs, REG_ESUBREG);
             INSIST(v->subs[v->nextvalue] != NULL, REG_ESUBREG);
-            NOERR();
+            NOERRN();
             assert(v->nextvalue > 0);
             atom = subre(v, 'b', BACKR, lp, rp);
-            NOERR();
+            NOERRN();
             subno = v->nextvalue;
             atom->backno = subno;
             EMPTYARC(lp, rp);    /* temporarily, so there's something */
@@ -1050,7 +1054,7 @@ parseqatom(struct vars *v,
                 if (m > n)
                 {
                     ERR(REG_BADBR);
-                    return;
+                    return top;
                 }
                 /* {m,n} exercises preference, even if it's {m,m} */
                 qprefer = (v->nextvalue) ? LONGER : SHORTER;
@@ -1064,7 +1068,7 @@ parseqatom(struct vars *v,
             if (!SEE('}'))
             {                    /* catches errors too */
                 ERR(REG_BADBR);
-                return;
+                return top;
             }
             NEXT();
             break;
@@ -1083,7 +1087,7 @@ parseqatom(struct vars *v,
             v->subs[subno] = NULL;
         delsub(v->nfa, lp, rp);
         EMPTYARC(lp, rp);
-        return;
+        return top;
     }

     /* if not a messy case, avoid hard part */
@@ -1096,7 +1100,7 @@ parseqatom(struct vars *v,
         if (atom != NULL)
             freesubre(v, atom);
         top->flags = f;
-        return;
+        return top;
     }

     /*
@@ -1110,7 +1114,7 @@ parseqatom(struct vars *v,
     if (atom == NULL)
     {
         atom = subre(v, '=', 0, lp, rp);
-        NOERR();
+        NOERRN();
     }

     /*----------
@@ -1131,20 +1135,20 @@ parseqatom(struct vars *v,
      */
     s = newstate(v->nfa);        /* first, new endpoints for the atom */
     s2 = newstate(v->nfa);
-    NOERR();
+    NOERRN();
     moveouts(v->nfa, lp, s);
     moveins(v->nfa, rp, s2);
-    NOERR();
+    NOERRN();
     atom->begin = s;
     atom->end = s2;
     s = newstate(v->nfa);        /* set up starting state */
-    NOERR();
+    NOERRN();
     EMPTYARC(lp, s);
-    NOERR();
+    NOERRN();

     /* break remaining subRE into x{...} and what follows */
     t = subre(v, '.', COMBINE(qprefer, atom->flags), lp, rp);
-    NOERR();
+    NOERRN();
     t->child = atom;
     atomp = &t->child;

@@ -1163,7 +1167,7 @@ parseqatom(struct vars *v,
      */
     assert(top->op == '=' && top->child == NULL);
     top->child = subre(v, '=', top->flags, top->begin, lp);
-    NOERR();
+    NOERRN();
     top->op = '.';
     top->child->sibling = t;
     /* top->flags will get updated later */
@@ -1182,11 +1186,11 @@ parseqatom(struct vars *v,
          */
         dupnfa(v->nfa, v->subs[subno]->begin, v->subs[subno]->end,
                atom->begin, atom->end);
-        NOERR();
+        NOERRN();

         /* The backref node's NFA should not enforce any constraints */
         removeconstraints(v->nfa, atom->begin, atom->end);
-        NOERR();
+        NOERRN();
     }

     /*
@@ -1226,7 +1230,7 @@ parseqatom(struct vars *v,
         repeat(v, atom->begin, atom->end, m, n);
         f = COMBINE(qprefer, atom->flags);
         t = subre(v, '=', f, atom->begin, atom->end);
-        NOERR();
+        NOERRN();
         freesubre(v, atom);
         *atomp = t;
         /* rest of branch can be strung starting from t->end */
@@ -1247,9 +1251,9 @@ parseqatom(struct vars *v,
         repeat(v, s, atom->begin, m - 1, (n == DUPINF) ? n : n - 1);
         f = COMBINE(qprefer, atom->flags);
         t = subre(v, '.', f, s, atom->end); /* prefix and atom */
-        NOERR();
+        NOERRN();
         t->child = subre(v, '=', PREF(f), s, atom->begin);
-        NOERR();
+        NOERRN();
         t->child->sibling = atom;
         *atomp = t;
         /* rest of branch can be strung starting from atom->end */
@@ -1259,14 +1263,14 @@ parseqatom(struct vars *v,
     {
         /* general case: need an iteration node */
         s2 = newstate(v->nfa);
-        NOERR();
+        NOERRN();
         moveouts(v->nfa, atom->end, s2);
-        NOERR();
+        NOERRN();
         dupnfa(v->nfa, atom->begin, atom->end, s, s2);
         repeat(v, s, s2, m, n);
         f = COMBINE(qprefer, atom->flags);
         t = subre(v, '*', f, s, s2);
-        NOERR();
+        NOERRN();
         t->min = (short) m;
         t->max = (short) n;
         t->child = atom;
@@ -1280,7 +1284,7 @@ parseqatom(struct vars *v,
     {
         /* parse all the rest of the branch, and insert in t->child->sibling */
         t->child->sibling = parsebranch(v, stopper, type, s2, rp, 1);
-        NOERR();
+        NOERRN();
         assert(SEE('|') || SEE(stopper) || SEE(EOS));

         /* here's the promised update of the flags */
@@ -1299,9 +1303,7 @@ parseqatom(struct vars *v,
          *
          * If the messy atom was the first thing in the branch, then
          * top->child is vacuous and we can get rid of one level of
-         * concatenation.  Since the caller is holding a pointer to the top
-         * node, we can't remove that node; but we're allowed to change its
-         * properties.
+         * concatenation.
          */
         assert(top->child->op == '=');
         if (top->child->begin == top->child->end)
@@ -1351,21 +1353,13 @@ parseqatom(struct vars *v,
         {
             assert(!MESSY(top->child->flags));
             t = top->child->sibling;
-            freesubre(v, top->child);
-            top->op = t->op;
-            top->flags = t->flags;
-            top->latype = t->latype;
-            top->id = t->id;
-            top->capno = t->capno;
-            top->backno = t->backno;
-            top->min = t->min;
-            top->max = t->max;
-            top->child = t->child;
-            top->begin = t->begin;
-            top->end = t->end;
-            freesrnode(v, t);
+            top->child->sibling = NULL;
+            freesubre(v, top);
+            top = t;
         }
     }
+
+    return top;
 }

 /*
@@ -2109,7 +2103,9 @@ freesrnode(struct vars *v,        /* might be NULL */

     if (!NULLCNFA(sr->cnfa))
         freecnfa(&sr->cnfa);
-    sr->flags = 0;
+    sr->flags = 0;                /* in particular, not INUSE */
+    sr->child = sr->sibling = NULL;
+    sr->begin = sr->end = NULL;

     if (v != NULL && v->treechain != NULL)
     {
diff --git a/src/test/modules/test_regex/expected/test_regex.out b/src/test/modules/test_regex/expected/test_regex.out
index 01d50ec1e3..44da7d2019 100644
--- a/src/test/modules/test_regex/expected/test_regex.out
+++ b/src/test/modules/test_regex/expected/test_regex.out
@@ -3468,6 +3468,14 @@ select * from test_regex(' TO (([a-z0-9._]+|"([^"]+|"")+")+)', 'asd TO foo', 'M'
  {" TO foo",foo,o,NULL}
 (2 rows)

+-- expectMatch    21.36 RPQ    ((.))(\2){0}    xy    x    x    x    {}
+select * from test_regex('((.))(\2){0}', 'xy', 'RPQ');
+                 test_regex
+--------------------------------------------
+ {3,REG_UBACKREF,REG_UBOUNDS,REG_UNONPOSIX}
+ {x,x,x,NULL}
+(2 rows)
+
 -- doing 22 "multicharacter collating elements"
 -- # again ugh
 -- MCCEs are not implemented in Postgres, so we skip all these tests
diff --git a/src/test/modules/test_regex/sql/test_regex.sql b/src/test/modules/test_regex/sql/test_regex.sql
index 7f5bc6e418..9224fdfdd3 100644
--- a/src/test/modules/test_regex/sql/test_regex.sql
+++ b/src/test/modules/test_regex/sql/test_regex.sql
@@ -1009,6 +1009,8 @@ select * from test_regex('(.*).*', 'abc', 'N');
 select * from test_regex('(a*)*', 'bc', 'N');
 -- expectMatch    21.35 M        { TO (([a-z0-9._]+|"([^"]+|"")+")+)}    {asd TO foo}    { TO foo} foo o {}
 select * from test_regex(' TO (([a-z0-9._]+|"([^"]+|"")+")+)', 'asd TO foo', 'M');
+-- expectMatch    21.36 RPQ    ((.))(\2){0}    xy    x    x    x    {}
+select * from test_regex('((.))(\2){0}', 'xy', 'RPQ');

 -- doing 22 "multicharacter collating elements"
 -- # again ugh

pgsql-hackers by date:

Previous
From: Andres Freund
Date:
Subject: elog.c query_id support vs shutdown
Next
From: Peter Smith
Date:
Subject: Re: Small documentation improvement for ALTER SUBSCRIPTION