Re: BUG #18708: regex problem: (?:[^\d\D]){0} asserts with "lp->nouts == 0 && rp->nins == 0" - Mailing list pgsql-bugs

From Tom Lane
Subject Re: BUG #18708: regex problem: (?:[^\d\D]){0} asserts with "lp->nouts == 0 && rp->nins == 0"
Date
Msg-id 2344149.1731636867@sss.pgh.pa.us
Whole thread Raw
In response to Re: BUG #18708: regex problem: (?:[^\d\D]){0} asserts with "lp->nouts == 0 && rp->nins == 0"  (Tom Lane <tgl@sss.pgh.pa.us>)
Responses Re: BUG #18708: regex problem: (?:[^\d\D]){0} asserts with "lp->nouts == 0 && rp->nins == 0"
List pgsql-bugs
I wrote:
> So I think the right thing to do is to fix colorcomplement() so
> that it still produces some arc between its "from" and "to"
> states, keeping the graph connected until we finish parsing the
> regex.  It has to be a dummy arc that can't match anything, of
> course.  We can throw away such an arc once we get to regex
> optimization, because we don't need delsub() to work anymore.

> In the attached draft patch I represented the dummy arc as a PLAIN
> arc with a new fake color BLACK.  Another way to do it could be to
> introduce a new arc "type" value, but that seemed like it would
> involve touching more code.

After thinking awhile longer, I decided to try it that way (with
a new arc type), and I believe I like the result better after all.
It's just a few lines more code, and I think it's more robust.
Up to now PLAIN arcs always matched exactly one character,
so the first version was putting a major dent in their semantics.
We got rid of the bogus arcs before doing anything that really leans
hard on arc semantics, but still it seems messy.  Hence v2 attached.

I'm not especially in love with the CANTMATCH arc type name, but
other possibilities such as DUMMY or NOMATCH felt too generic.
Better ideas anyone?

            regards, tom lane

diff --git a/src/backend/regex/regc_color.c b/src/backend/regex/regc_color.c
index 30bda0e5ad..8ae788f519 100644
--- a/src/backend/regex/regc_color.c
+++ b/src/backend/regex/regc_color.c
@@ -1075,9 +1075,19 @@ colorcomplement(struct nfa *nfa,

     assert(of != from);

-    /* A RAINBOW arc matches all colors, making the complement empty */
+    /*
+     * A RAINBOW arc matches all colors, making the complement empty.  But we
+     * can't just return without making any arcs, because that would leave the
+     * NFA disconnected which would break any future delsub().  Instead, make
+     * a CANTMATCH arc.  Also set the HASCANTMATCH flag so we know we need to
+     * clean that up at the start of NFA optimization.
+     */
     if (findarc(of, PLAIN, RAINBOW) != NULL)
+    {
+        newarc(nfa, CANTMATCH, 0, from, to);
+        nfa->flags |= HASCANTMATCH;
         return;
+    }

     /* Otherwise, transiently mark the colors that appear in of's out-arcs */
     for (a = of->outs; a != NULL; a = a->outchain)
@@ -1089,6 +1099,12 @@ colorcomplement(struct nfa *nfa,
             assert(!UNUSEDCOLOR(cd));
             cd->flags |= COLMARK;
         }
+
+        /*
+         * There's no syntax for re-complementing a color set, so we cannot
+         * see CANTMATCH arcs here.
+         */
+        assert(a->type != CANTMATCH);
     }

     /* Scan colors, clear transient marks, add arcs for unmarked colors */
diff --git a/src/backend/regex/regc_nfa.c b/src/backend/regex/regc_nfa.c
index f1819a24f6..acd2286def 100644
--- a/src/backend/regex/regc_nfa.c
+++ b/src/backend/regex/regc_nfa.c
@@ -1462,6 +1462,7 @@ removetraverse(struct nfa *nfa,
         {
             case PLAIN:
             case EMPTY:
+            case CANTMATCH:
                 /* nothing to do */
                 break;
             case AHEAD:
@@ -1599,6 +1600,12 @@ optimize(struct nfa *nfa,
     if (verbose)
         fprintf(f, "\ninitial cleanup:\n");
 #endif
+    /* If we have any CANTMATCH arcs, drop them; but this is uncommon */
+    if (nfa->flags & HASCANTMATCH)
+    {
+        removecantmatch(nfa);
+        nfa->flags &= ~HASCANTMATCH;
+    }
     cleanup(nfa);                /* may simplify situation */
 #ifdef REG_DEBUG
     if (verbose)
@@ -2922,6 +2929,34 @@ clonesuccessorstates(struct nfa *nfa,
     }
 }

+/*
+ * removecantmatch - remove CANTMATCH arcs, which are no longer useful
+ * once we are done with the parsing phase.  (We need them only to
+ * preserve connectedness of NFA subgraphs during parsing.)
+ */
+static void
+removecantmatch(struct nfa *nfa)
+{
+    struct state *s;
+
+    for (s = nfa->states; s != NULL; s = s->next)
+    {
+        struct arc *a;
+        struct arc *nexta;
+
+        for (a = s->outs; a != NULL; a = nexta)
+        {
+            nexta = a->outchain;
+            if (a->type == CANTMATCH)
+            {
+                freearc(nfa, a);
+                if (NISERR())
+                    return;
+            }
+        }
+    }
+}
+
 /*
  * cleanup - clean up NFA after optimizations
  */
@@ -3627,6 +3662,8 @@ dumpnfa(struct nfa *nfa,
         fprintf(f, ", eol [%ld]", (long) nfa->eos[1]);
     if (nfa->flags & HASLACONS)
         fprintf(f, ", haslacons");
+    if (nfa->flags & HASCANTMATCH)
+        fprintf(f, ", hascantmatch");
     if (nfa->flags & MATCHALL)
     {
         fprintf(f, ", minmatchall %d", nfa->minmatchall);
@@ -3749,6 +3786,9 @@ dumparc(struct arc *a,
             break;
         case EMPTY:
             break;
+        case CANTMATCH:
+            fprintf(f, "X");
+            break;
         default:
             fprintf(f, "0x%x/0%lo", a->type, (long) a->co);
             break;
diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c
index 8a6cfb2973..15b264e50f 100644
--- a/src/backend/regex/regcomp.c
+++ b/src/backend/regex/regcomp.c
@@ -215,6 +215,7 @@ static void clonesuccessorstates(struct nfa *nfa, struct state *ssource,
                                  struct state *spredecessor,
                                  struct arc *refarc, char *curdonemap,
                                  char *outerdonemap, int nstates);
+static void removecantmatch(struct nfa *nfa);
 static void cleanup(struct nfa *nfa);
 static void markreachable(struct nfa *nfa, struct state *s,
                           struct state *okay, struct state *mark);
@@ -342,6 +343,7 @@ struct vars
 #define BEHIND    'r'                /* color-lookbehind arc */
 #define WBDRY    'w'                /* word boundary constraint */
 #define NWBDRY    'W'                /* non-word-boundary constraint */
+#define CANTMATCH 'x'            /* arc that cannot match anything */
 #define SBEGIN    'A'                /* beginning of string (even if not BOL) */
 #define SEND    'Z'                /* end of string (even if not EOL) */

@@ -2368,6 +2370,7 @@ nfanode(struct vars *v,
     nfa = newnfa(v, v->cm, v->nfa);
     NOERRZ();
     dupnfa(nfa, t->begin, t->end, nfa->init, nfa->final);
+    nfa->flags = v->nfa->flags;
     if (!ISERR())
         specialcolors(nfa);
     if (!ISERR())
diff --git a/src/include/regex/regguts.h b/src/include/regex/regguts.h
index 3ca3647e11..fd69299a16 100644
--- a/src/include/regex/regguts.h
+++ b/src/include/regex/regguts.h
@@ -410,6 +410,8 @@ struct cnfa
     int            flags;            /* bitmask of the following flags: */
 #define  HASLACONS    01            /* uses lookaround constraints */
 #define  MATCHALL    02            /* matches all strings of a range of lengths */
+#define  HASCANTMATCH 04        /* contains CANTMATCH arcs */
+    /* Note: HASCANTMATCH appears in nfa structs' flags, but never in cnfas */
     int            pre;            /* setup state number */
     int            post;            /* teardown state number */
     color        bos[2];            /* colors, if any, assigned to BOS and BOL */
diff --git a/src/test/modules/test_regex/expected/test_regex.out b/src/test/modules/test_regex/expected/test_regex.out
index 731ba506d3..c44c717edf 100644
--- a/src/test/modules/test_regex/expected/test_regex.out
+++ b/src/test/modules/test_regex/expected/test_regex.out
@@ -2071,6 +2071,20 @@ select * from test_regex('[\s\S]*', '012  3456789abc_*', 'LNPE');
  {"012  3456789abc_*"}
 (2 rows)

+-- bug #18708:
+select * from test_regex('(?:[^\d\D]){0}', '0123456789abc*', 'LNPQE');
+                             test_regex
+--------------------------------------------------------------------
+ {0,REG_UBOUNDS,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE,REG_UEMPTYMATCH}
+ {""}
+(2 rows)
+
+select * from test_regex('[^\d\D]', '0123456789abc*', 'ILPE');
+                       test_regex
+--------------------------------------------------------
+ {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE,REG_UIMPOSSIBLE}
+(1 row)
+
 -- check char classes' handling of newlines
 select * from test_regex('\s+', E'abc  \n  def', 'LP');
           test_regex
diff --git a/src/test/modules/test_regex/sql/test_regex.sql b/src/test/modules/test_regex/sql/test_regex.sql
index 478fa2c547..b2a847577e 100644
--- a/src/test/modules/test_regex/sql/test_regex.sql
+++ b/src/test/modules/test_regex/sql/test_regex.sql
@@ -619,6 +619,9 @@ select * from test_regex('[^1\D0]', 'abc0123456789*', 'LPE');
 select * from test_regex('\W', '0123456789abc_*', 'LP');
 select * from test_regex('[\W]', '0123456789abc_*', 'LPE');
 select * from test_regex('[\s\S]*', '012  3456789abc_*', 'LNPE');
+-- bug #18708:
+select * from test_regex('(?:[^\d\D]){0}', '0123456789abc*', 'LNPQE');
+select * from test_regex('[^\d\D]', '0123456789abc*', 'ILPE');

 -- check char classes' handling of newlines
 select * from test_regex('\s+', E'abc  \n  def', 'LP');

pgsql-bugs by date:

Previous
From: Tom Lane
Date:
Subject: Re: BUG #18708: regex problem: (?:[^\d\D]){0} asserts with "lp->nouts == 0 && rp->nins == 0"
Next
From: PG Bug reporting form
Date:
Subject: BUG #18710: "pg_get_viewdef" triggers assertions in special scenarios