Re: Another regexp performance improvement: skip useless paren-captures - Mailing list pgsql-hackers
From | Tom Lane |
---|---|
Subject | Re: Another regexp performance improvement: skip useless paren-captures |
Date | |
Msg-id | 3224306.1628454300@sss.pgh.pa.us Whole thread Raw |
In response to | Re: Another regexp performance improvement: skip useless paren-captures (Mark Dilger <mark.dilger@enterprisedb.com>) |
Responses |
Re: Another regexp performance improvement: skip useless paren-captures
|
List | pgsql-hackers |
Mark Dilger <mark.dilger@enterprisedb.com> writes: > Hmm. This changes the behavior when applied against master (c1132aae336c41cf9d316222e525d8d593c2b5d2): > select regexp_split_to_array('uuuzkodphfbfbfb', '((.))(\1\2)', 'ntw'); > regexp_split_to_array > ----------------------- > - {"",zkodphfbfbfb} > + {uuuzkodphfbfbfb} > (1 row) Ugh. The regex engine is finding the match correctly, but it's failing to tell the caller where it is :-(. I was a little too cute in optimizing the regmatch_t result-vector copying in pg_regexec, and forgot to ensure that the overall match position would be reported. Thanks for the testing! regards, tom lane diff --git a/contrib/pg_trgm/trgm_regexp.c b/contrib/pg_trgm/trgm_regexp.c index bf1dea6352..64816dd370 100644 --- a/contrib/pg_trgm/trgm_regexp.c +++ b/contrib/pg_trgm/trgm_regexp.c @@ -541,9 +541,11 @@ createTrgmNFA(text *text_re, Oid collation, * Stage 1: Compile the regexp into a NFA, using the regexp library. */ #ifdef IGNORECASE - RE_compile(®ex, text_re, REG_ADVANCED | REG_ICASE, collation); + RE_compile(®ex, text_re, + REG_ADVANCED | REG_NOSUB | REG_ICASE, collation); #else - RE_compile(®ex, text_re, REG_ADVANCED, collation); + RE_compile(®ex, text_re, + REG_ADVANCED | REG_NOSUB, collation); #endif /* diff --git a/src/backend/regex/regc_lex.c b/src/backend/regex/regc_lex.c index 7673dab76f..45727ffa01 100644 --- a/src/backend/regex/regc_lex.c +++ b/src/backend/regex/regc_lex.c @@ -528,10 +528,7 @@ next(struct vars *v) } assert(NOTREACHED); } - if (v->cflags & REG_NOSUB) - RETV('(', 0); /* all parens non-capturing */ - else - RETV('(', 1); + RETV('(', 1); break; case CHR(')'): if (LASTTYPE('(')) diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c index 60a220c57a..ae3a7b6a38 100644 --- a/src/backend/regex/regcomp.c +++ b/src/backend/regex/regcomp.c @@ -65,7 +65,7 @@ static struct subre *subre(struct vars *, int, int, struct state *, struct state static void freesubre(struct vars *, struct subre *); static void freesubreandsiblings(struct vars *, struct subre *); static void freesrnode(struct vars *, struct subre *); -static void optst(struct vars *, struct subre *); +static void removecaptures(struct vars *, struct subre *); static int numst(struct subre *, int); static void markst(struct subre *); static void cleanst(struct vars *); @@ -431,7 +431,8 @@ pg_regcomp(regex_t *re, dumpst(v->tree, debug, 1); } #endif - optst(v, v->tree); + if (v->cflags & REG_NOSUB) + removecaptures(v, v->tree); v->ntree = numst(v->tree, 1); markst(v->tree); cleanst(v); @@ -1013,14 +1014,16 @@ parseqatom(struct vars *v, break; case BACKREF: /* the Feature From The Black Lagoon */ INSIST(type != LACON, REG_ESUBREG); - INSIST(v->nextvalue < v->nsubs, REG_ESUBREG); - INSIST(v->subs[v->nextvalue] != NULL, REG_ESUBREG); + subno = v->nextvalue; + assert(subno > 0); + INSIST(subno < v->nsubs, REG_ESUBREG); + NOERRN(); + INSIST(v->subs[subno] != NULL, REG_ESUBREG); NOERRN(); - assert(v->nextvalue > 0); atom = subre(v, 'b', BACKR, lp, rp); NOERRN(); - subno = v->nextvalue; atom->backno = subno; + v->subs[subno]->flags |= BRUSE; EMPTYARC(lp, rp); /* temporarily, so there's something */ NEXT(); break; @@ -2141,19 +2144,54 @@ freesrnode(struct vars *v, /* might be NULL */ } /* - * optst - optimize a subRE subtree + * removecaptures - remove unnecessary capture subREs + * + * If the caller said that it doesn't care about subexpression match data, + * we may delete the "capture" markers on subREs that are not referenced + * by any backrefs, and then simplify anything that's become non-messy. + * Call this only if REG_NOSUB flag is set. */ static void -optst(struct vars *v, - struct subre *t) +removecaptures(struct vars *v, + struct subre *t) { + struct subre *t2; + + assert(t != NULL); + + /* + * If this isn't itself a backref target, clear capno and tentatively + * clear CAP flag. + */ + if (!(t->flags & BRUSE)) + { + t->capno = 0; + t->flags &= ~CAP; + } + + /* Now recurse to children */ + for (t2 = t->child; t2 != NULL; t2 = t2->sibling) + { + removecaptures(v, t2); + /* Propagate child CAP flag back up, if it's still set */ + if (t2->flags & CAP) + t->flags |= CAP; + } + /* - * DGP (2007-11-13): I assume it was the programmer's intent to eventually - * come back and add code to optimize subRE trees, but the routine coded - * just spends effort traversing the tree and doing nothing. We can do - * nothing with less effort. + * If t now contains neither captures nor backrefs, there's no longer any + * need to care where its sub-match boundaries are, so we can reduce it to + * a simple DFA node. (Note in particular that MIXED child greediness is + * not a hindrance here, so we don't use the MESSY() macro.) */ - return; + if ((t->flags & (CAP | BACKR)) == 0) + { + if (t->child) + freesubreandsiblings(v, t->child); + t->child = NULL; + t->op = '='; + t->flags &= ~MIXED; + } } /* @@ -2501,6 +2539,8 @@ stdump(struct subre *t, fprintf(f, " hascapture"); if (t->flags & BACKR) fprintf(f, " hasbackref"); + if (t->flags & BRUSE) + fprintf(f, " isreferenced"); if (!(t->flags & INUSE)) fprintf(f, " UNUSED"); if (t->latype != (char) -1) diff --git a/src/backend/regex/regexec.c b/src/backend/regex/regexec.c index 5b9a087820..c08a224618 100644 --- a/src/backend/regex/regexec.c +++ b/src/backend/regex/regexec.c @@ -213,12 +213,10 @@ pg_regexec(regex_t *re, return REG_NOMATCH; backref = (v->g->info & REG_UBACKREF) ? 1 : 0; v->eflags = flags; - if (v->g->cflags & REG_NOSUB) - nmatch = 0; /* override client */ v->nmatch = nmatch; - if (backref) + if (backref && nmatch <= v->g->nsub) { - /* need work area */ + /* need larger work area */ if (v->g->nsub + 1 <= LOCALMAT) v->pmatch = mat; else @@ -229,7 +227,13 @@ pg_regexec(regex_t *re, v->nmatch = v->g->nsub + 1; } else + { + /* we can store results directly in caller's array */ v->pmatch = pmatch; + /* ensure any extra entries in caller's array are filled with -1 */ + if (nmatch > 0) + zapallsubs(pmatch, nmatch); + } v->details = details; v->start = (chr *) string; v->search_start = (chr *) string + search_start; @@ -290,12 +294,20 @@ pg_regexec(regex_t *re, else st = find(v, &v->g->tree->cnfa, &v->g->cmap); - /* copy (portion of) match vector over if necessary */ - if (st == REG_OKAY && v->pmatch != pmatch && nmatch > 0) + /* on success, ensure caller's match vector is filled correctly */ + if (st == REG_OKAY && nmatch > 0) { - zapallsubs(pmatch, nmatch); - n = (nmatch < v->nmatch) ? nmatch : v->nmatch; - memcpy(VS(pmatch), VS(v->pmatch), n * sizeof(regmatch_t)); + if (v->pmatch != pmatch) + { + /* copy portion of match vector over from (larger) work area */ + assert(nmatch <= v->nmatch); + memcpy(VS(pmatch), VS(v->pmatch), nmatch * sizeof(regmatch_t)); + } + if (v->g->cflags & REG_NOSUB) + { + /* don't expose possibly-partial sub-match results to caller */ + zapallsubs(pmatch, nmatch); + } } /* clean up */ @@ -752,7 +764,6 @@ cdissect(struct vars *v, break; case '(': /* no-op capture node */ assert(t->child != NULL); - assert(t->capno > 0); er = cdissect(v, t->child, begin, end); break; default: diff --git a/src/backend/utils/adt/jsonpath_gram.y b/src/backend/utils/adt/jsonpath_gram.y index de3d97931e..bd5d4488a0 100644 --- a/src/backend/utils/adt/jsonpath_gram.y +++ b/src/backend/utils/adt/jsonpath_gram.y @@ -583,6 +583,14 @@ jspConvertRegexFlags(uint32 xflags) errmsg("XQuery \"x\" flag (expanded regular expressions) is not implemented"))); } + /* + * We'll never need sub-match details at execution. While + * RE_compile_and_execute would set this flag anyway, force it on here to + * ensure that the regex cache entries created by makeItemLikeRegex are + * useful. + */ + cflags |= REG_NOSUB; + return cflags; } diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c index 484d4265fd..268cee1cbe 100644 --- a/src/backend/utils/adt/regexp.c +++ b/src/backend/utils/adt/regexp.c @@ -347,6 +347,10 @@ RE_compile_and_execute(text *text_re, char *dat, int dat_len, { regex_t *re; + /* Use REG_NOSUB if caller does not want sub-match details */ + if (nmatch < 2) + cflags |= REG_NOSUB; + /* Compile RE */ re = RE_compile_and_cache(text_re, cflags, collation); @@ -1412,6 +1416,7 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags, int orig_len; pg_wchar *wide_str; int wide_len; + int cflags; regex_t *cpattern; regmatch_t *pmatch; int pmatch_len; @@ -1430,7 +1435,10 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags, wide_len = pg_mb2wchar_with_len(VARDATA_ANY(orig_str), wide_str, orig_len); /* set up the compiled pattern */ - cpattern = RE_compile_and_cache(pattern, re_flags->cflags, collation); + cflags = re_flags->cflags; + if (!use_subpatterns) + cflags |= REG_NOSUB; + cpattern = RE_compile_and_cache(pattern, cflags, collation); /* do we want to remember subpatterns? */ if (use_subpatterns && cpattern->re_nsub > 0) @@ -1952,7 +1960,7 @@ regexp_fixed_prefix(text *text_re, bool case_insensitive, Oid collation, if (case_insensitive) cflags |= REG_ICASE; - re = RE_compile_and_cache(text_re, cflags, collation); + re = RE_compile_and_cache(text_re, cflags | REG_NOSUB, collation); /* Examine it to see if there's a fixed prefix */ re_result = pg_regprefix(re, &str, &slen); diff --git a/src/include/regex/regex.h b/src/include/regex/regex.h index 2b48a19fb7..0455ae8069 100644 --- a/src/include/regex/regex.h +++ b/src/include/regex/regex.h @@ -106,7 +106,7 @@ typedef struct #define REG_QUOTE 000004 /* no special characters, none */ #define REG_NOSPEC REG_QUOTE /* historical synonym */ #define REG_ICASE 000010 /* ignore case */ -#define REG_NOSUB 000020 /* don't care about subexpressions */ +#define REG_NOSUB 000020 /* caller doesn't need subexpr match data */ #define REG_EXPANDED 000040 /* expanded format, white space & comments */ #define REG_NLSTOP 000100 /* \n doesn't match . or [^ ] */ #define REG_NLANCH 000200 /* ^ matches after \n, $ before */ diff --git a/src/include/regex/regguts.h b/src/include/regex/regguts.h index 8db631c83b..91a52840c4 100644 --- a/src/include/regex/regguts.h +++ b/src/include/regex/regguts.h @@ -477,13 +477,14 @@ struct subre #define MIXED 04 /* mixed preference below */ #define CAP 010 /* capturing parens here or below */ #define BACKR 020 /* back reference here or below */ +#define BRUSE 040 /* is referenced by a back reference */ #define INUSE 0100 /* in use in final tree */ -#define NOPROP 03 /* bits which may not propagate up */ +#define UPPROP (MIXED|CAP|BACKR) /* flags which should propagate up */ #define LMIX(f) ((f)<<2) /* LONGER -> MIXED */ #define SMIX(f) ((f)<<1) /* SHORTER -> MIXED */ -#define UP(f) (((f)&~NOPROP) | (LMIX(f) & SMIX(f) & MIXED)) +#define UP(f) (((f)&UPPROP) | (LMIX(f) & SMIX(f) & MIXED)) #define MESSY(f) ((f)&(MIXED|CAP|BACKR)) -#define PREF(f) ((f)&NOPROP) +#define PREF(f) ((f)&(LONGER|SHORTER)) #define PREF2(f1, f2) ((PREF(f1) != 0) ? PREF(f1) : PREF(f2)) #define COMBINE(f1, f2) (UP((f1)|(f2)) | PREF2(f1, f2)) char latype; /* LATYPE code, if lookaround constraint */ diff --git a/src/test/modules/test_regex/expected/test_regex.out b/src/test/modules/test_regex/expected/test_regex.out index 44da7d2019..5a6cdf47c2 100644 --- a/src/test/modules/test_regex/expected/test_regex.out +++ b/src/test/modules/test_regex/expected/test_regex.out @@ -146,12 +146,12 @@ select * from test_regex('(a)e', 'ae', '-'); {ae,a} (2 rows) --- expectMatch 4.2 o (a)e ae -select * from test_regex('(a)e', 'ae', 'o'); - test_regex ------------- - {0} - {NULL} +-- expectMatch 4.2 oPR (.)\1e abeaae aae {} +select * from test_regex('(.)\1e', 'abeaae', 'oPR'); + test_regex +-------------------------------- + {1,REG_UBACKREF,REG_UNONPOSIX} + {aae,NULL} (2 rows) -- expectMatch 4.3 b {\(a\)b} ab ab a @@ -2658,6 +2658,20 @@ select * from test_regex('(\w+(?= )).*\1', 'abc abcd abd', 'HLRP'); {"abc abc",abc} (2 rows) +-- exercise oversize-regmatch_t-array paths in regexec() +-- (that case is not reachable via test_regex, sadly) +select substring('fffoooooooooooooooooooooooooooooooo', '^(.)\1(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)'); + substring +----------- + f +(1 row) + +select regexp_split_to_array('abcxxxdefyyyghi', '((.))(\1\2)'); + regexp_split_to_array +----------------------- + {abc,def,ghi} +(1 row) + -- doing 15 "octal escapes vs back references" -- # initial zero is always octal -- expectMatch 15.1 MP "a\\010b" "a\bb" "a\bb" @@ -3476,6 +3490,22 @@ select * from test_regex('((.))(\2){0}', 'xy', 'RPQ'); {x,x,x,NULL} (2 rows) +-- expectMatch 21.37 RP ((.))(\2) xyy yy y y y +select * from test_regex('((.))(\2)', 'xyy', 'RP'); + test_regex +-------------------------------- + {3,REG_UBACKREF,REG_UNONPOSIX} + {yy,y,y,y} +(2 rows) + +-- expectMatch 21.38 oRP ((.))(\2) xyy yy {} {} {} +select * from test_regex('((.))(\2)', 'xyy', 'oRP'); + test_regex +-------------------------------- + {3,REG_UBACKREF,REG_UNONPOSIX} + {yy,NULL,NULL,NULL} +(2 rows) + -- doing 22 "multicharacter collating elements" -- # again ugh -- MCCEs are not implemented in Postgres, so we skip all these tests diff --git a/src/test/modules/test_regex/sql/test_regex.sql b/src/test/modules/test_regex/sql/test_regex.sql index 9224fdfdd3..3419564203 100644 --- a/src/test/modules/test_regex/sql/test_regex.sql +++ b/src/test/modules/test_regex/sql/test_regex.sql @@ -63,8 +63,8 @@ select * from test_regex('ab', 'ab', 'b'); -- expectMatch 4.1 - (a)e ae ae a select * from test_regex('(a)e', 'ae', '-'); --- expectMatch 4.2 o (a)e ae -select * from test_regex('(a)e', 'ae', 'o'); +-- expectMatch 4.2 oPR (.)\1e abeaae aae {} +select * from test_regex('(.)\1e', 'abeaae', 'oPR'); -- expectMatch 4.3 b {\(a\)b} ab ab a select * from test_regex('\(a\)b', 'ab', 'b'); -- expectMatch 4.4 - a((b)c) abc abc bc b @@ -775,6 +775,11 @@ select * from test_regex('(^\w+).*\1', 'abc abc abc', 'LRP'); select * from test_regex('(^\w+\M).*\1', 'abc abcd abd', 'LRP'); select * from test_regex('(\w+(?= )).*\1', 'abc abcd abd', 'HLRP'); +-- exercise oversize-regmatch_t-array paths in regexec() +-- (that case is not reachable via test_regex, sadly) +select substring('fffoooooooooooooooooooooooooooooooo', '^(.)\1(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)'); +select regexp_split_to_array('abcxxxdefyyyghi', '((.))(\1\2)'); + -- doing 15 "octal escapes vs back references" -- # initial zero is always octal @@ -1011,6 +1016,10 @@ select * from test_regex('(a*)*', 'bc', 'N'); select * from test_regex(' TO (([a-z0-9._]+|"([^"]+|"")+")+)', 'asd TO foo', 'M'); -- expectMatch 21.36 RPQ ((.))(\2){0} xy x x x {} select * from test_regex('((.))(\2){0}', 'xy', 'RPQ'); +-- expectMatch 21.37 RP ((.))(\2) xyy yy y y y +select * from test_regex('((.))(\2)', 'xyy', 'RP'); +-- expectMatch 21.38 oRP ((.))(\2) xyy yy {} {} {} +select * from test_regex('((.))(\2)', 'xyy', 'oRP'); -- doing 22 "multicharacter collating elements" -- # again ugh
pgsql-hackers by date: