Re: Assert triggered during RE_compile_and_cache - Mailing list pgsql-hackers
From | Tom Lane |
---|---|
Subject | Re: Assert triggered during RE_compile_and_cache |
Date | |
Msg-id | 2822987.1628372679@sss.pgh.pa.us Whole thread Raw |
In response to | Re: Assert triggered during RE_compile_and_cache (Tom Lane <tgl@sss.pgh.pa.us>) |
Responses |
Re: Assert triggered during RE_compile_and_cache
|
List | pgsql-hackers |
I wrote: >> Hm. I thought that this might be an old issue, but I'm not seeing the >> crash in pre-v14 branches. That means it's some bug introduced in >> the performance improvements I did a few months ago. Open item added. > git bisect says the trouble started with > ea1268f6301cc7adce571cc9c5ebe8d9342a2ef4 is the first bad commit Here's a draft fix for this. The issue seems to be that parseqatom records where a previous capture group is by saving a pointer to the "subre" that parse() returned for it. That was okay before said commit because we didn't do anything further to that subre: we immediately wrapped it into a parent subre, and then any further hacking that parseqatom did was done on the parent subre. But after ea1268f63, in most cases we'd hack right on that same subre, thus potentially modifying the portion of the NFA that would be copied by a subsequent back-reference. The particular thing that's asserting when you wrap {0} around such a back-ref is just accidental road kill: it happens to notice that the NFA is no longer consistent, because there's no path leading from the start to the end of the copied sub-NFA, thanks to the copying having been done between a pair of states that weren't actually appropriate to reference. What surprises me about this is not that crash, but that we didn't see fundamental breakage of backref-using patterns all over the place. It evidently breaks only in corner cases, but I'm not quite sure why the effects are so limited. Anyway, the fix seems pretty straightforward. We don't really need anything about the subre as such except for its starting and ending NFA states, so the attached modifies things to record only those state pointers. I'm not done testing this by any means, but it does fix the two cases you showed, and I've been running that perl script for some time with no further crashes. I suspect the assertion you hit with the REG_NOSUB patch was just further fallout from this same basic bug, but I've not yet rebased that patch over this to check. regards, tom lane diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c index 9f71177d31..afa6dd44c6 100644 --- a/src/backend/regex/regcomp.c +++ b/src/backend/regex/regcomp.c @@ -233,6 +233,13 @@ static int cmp(const chr *, const chr *, size_t); static int casecmp(const chr *, const chr *, size_t); +/* info we need during compilation about a known capturing subexpression */ +struct subinfo +{ + struct state *left; /* left end of its sub-NFA */ + struct state *right; /* right end of its sub-NFA */ +}; + /* internal variables, bundled for easy passing around */ struct vars { @@ -245,10 +252,10 @@ struct vars int nexttype; /* type of next token */ chr nextvalue; /* value (if any) of next token */ int lexcon; /* lexical context type (see regc_lex.c) */ - int nsubexp; /* subexpression count */ - struct subre **subs; /* subRE pointer vector */ - size_t nsubs; /* length of vector */ - struct subre *sub10[10]; /* initial vector, enough for most */ + int nsubexp; /* number of known capturing subexpressions */ + struct subinfo *subs; /* info about known capturing subexpressions */ + size_t nsubs; /* allocated length of subs[] vector */ + struct subinfo sub10[10]; /* initial vector, enough for most */ struct nfa *nfa; /* the NFA */ struct colormap *cm; /* character color map */ color nlcolor; /* color of newline */ @@ -368,7 +375,7 @@ pg_regcomp(regex_t *re, v->subs = v->sub10; v->nsubs = 10; for (j = 0; j < v->nsubs; j++) - v->subs[j] = NULL; + v->subs[j].left = v->subs[j].right = NULL; v->nfa = NULL; v->cm = NULL; v->nlcolor = COLORLESS; @@ -504,13 +511,13 @@ pg_regcomp(regex_t *re, } /* - * moresubs - enlarge subRE vector + * moresubs - enlarge capturing-subexpressions vector */ static void moresubs(struct vars *v, int wanted) /* want enough room for this one */ { - struct subre **p; + struct subinfo *p; size_t n; assert(wanted > 0 && (size_t) wanted >= v->nsubs); @@ -518,13 +525,13 @@ moresubs(struct vars *v, if (v->subs == v->sub10) { - p = (struct subre **) MALLOC(n * sizeof(struct subre *)); + p = (struct subinfo *) MALLOC(n * sizeof(struct subinfo)); if (p != NULL) memcpy(VS(p), VS(v->subs), - v->nsubs * sizeof(struct subre *)); + v->nsubs * sizeof(struct subinfo)); } else - p = (struct subre **) REALLOC(v->subs, n * sizeof(struct subre *)); + p = (struct subinfo *) REALLOC(v->subs, n * sizeof(struct subinfo)); if (p == NULL) { ERR(REG_ESPACE); @@ -532,7 +539,7 @@ moresubs(struct vars *v, } v->subs = p; for (p = &v->subs[v->nsubs]; v->nsubs < n; p++, v->nsubs++) - *p = NULL; + p->left = p->right = NULL; assert(v->nsubs == n); assert((size_t) wanted < v->nsubs); } @@ -982,8 +989,10 @@ parseqatom(struct vars *v, NOERR(); if (cap) { - assert(v->subs[subno] == NULL); - v->subs[subno] = atom; + /* save the sub-NFA's endpoints for future backrefs to use */ + assert(v->subs[subno].left == NULL); + v->subs[subno].left = s; + v->subs[subno].right = s2; if (atom->capno == 0) { /* normal case: just mark the atom as capturing */ @@ -1005,7 +1014,7 @@ parseqatom(struct vars *v, case BACKREF: /* the Feature From The Black Lagoon */ INSIST(type != LACON, REG_ESUBREG); INSIST(v->nextvalue < v->nsubs, REG_ESUBREG); - INSIST(v->subs[v->nextvalue] != NULL, REG_ESUBREG); + INSIST(v->subs[v->nextvalue].left != NULL, REG_ESUBREG); NOERR(); assert(v->nextvalue > 0); atom = subre(v, 'b', BACKR, lp, rp); @@ -1080,7 +1089,7 @@ parseqatom(struct vars *v, if (atom != NULL) freesubre(v, atom); if (atomtype == '(') - v->subs[subno] = NULL; + v->subs[subno].left = v->subs[subno].right = NULL; delsub(v->nfa, lp, rp); EMPTYARC(lp, rp); return; @@ -1173,14 +1182,14 @@ parseqatom(struct vars *v, { assert(atom->begin->nouts == 1); /* just the EMPTY */ delsub(v->nfa, atom->begin, atom->end); - assert(v->subs[subno] != NULL); + assert(v->subs[subno].left != NULL); /* * And here's why the recursion got postponed: it must wait until the * skeleton is filled in, because it may hit a backref that wants to * copy the filled-in skeleton. */ - dupnfa(v->nfa, v->subs[subno]->begin, v->subs[subno]->end, + dupnfa(v->nfa, v->subs[subno].left, v->subs[subno].right, atom->begin, atom->end); NOERR(); diff --git a/src/test/modules/test_regex/expected/test_regex.out b/src/test/modules/test_regex/expected/test_regex.out index 01d50ec1e3..44da7d2019 100644 --- a/src/test/modules/test_regex/expected/test_regex.out +++ b/src/test/modules/test_regex/expected/test_regex.out @@ -3468,6 +3468,14 @@ select * from test_regex(' TO (([a-z0-9._]+|"([^"]+|"")+")+)', 'asd TO foo', 'M' {" TO foo",foo,o,NULL} (2 rows) +-- expectMatch 21.36 RPQ ((.))(\2){0} xy x x x {} +select * from test_regex('((.))(\2){0}', 'xy', 'RPQ'); + test_regex +-------------------------------------------- + {3,REG_UBACKREF,REG_UBOUNDS,REG_UNONPOSIX} + {x,x,x,NULL} +(2 rows) + -- doing 22 "multicharacter collating elements" -- # again ugh -- MCCEs are not implemented in Postgres, so we skip all these tests diff --git a/src/test/modules/test_regex/sql/test_regex.sql b/src/test/modules/test_regex/sql/test_regex.sql index 7f5bc6e418..9224fdfdd3 100644 --- a/src/test/modules/test_regex/sql/test_regex.sql +++ b/src/test/modules/test_regex/sql/test_regex.sql @@ -1009,6 +1009,8 @@ select * from test_regex('(.*).*', 'abc', 'N'); select * from test_regex('(a*)*', 'bc', 'N'); -- expectMatch 21.35 M { TO (([a-z0-9._]+|"([^"]+|"")+")+)} {asd TO foo} { TO foo} foo o {} select * from test_regex(' TO (([a-z0-9._]+|"([^"]+|"")+")+)', 'asd TO foo', 'M'); +-- expectMatch 21.36 RPQ ((.))(\2){0} xy x x x {} +select * from test_regex('((.))(\2){0}', 'xy', 'RPQ'); -- doing 22 "multicharacter collating elements" -- # again ugh
pgsql-hackers by date: