Minor regexp hacking: code coverage, moveins() and friends - Mailing list pgsql-hackers

From Tom Lane
Subject Minor regexp hacking: code coverage, moveins() and friends
Date
Msg-id 810272.1629064063@sss.pgh.pa.us
Whole thread Raw
List pgsql-hackers
While trying to improve the code-coverage report for backend/regex/,
I found that there are portions of copyins() and copyouts() that are
just plain unreachable, because those functions are over-engineered.
In point of fact, the only uses of copyins() and copyouts() are in
places where the target state is brand new and cannot have any
pre-existing in-arcs (resp. out-arcs).  This means that all the
trouble we're taking to de-duplicate the copied arcs is entirely
wasted; we could just copy the source arcs without extra checking.

A fairly significant fraction, though by no means all, of the calls
of moveins() and moveouts() are likewise working with new target
states, and so don't really need to do any de-duplication.

Hence I propose 0001 attached, which creates simplified functions
copyinstonew() and so on, for use when the target state is known not
to have any existing arcs.  I'd thought that this might show a useful
improvement in regexp compilation speed, but it's pretty hard to
measure any noticeable change on typical regexps such as Jacobson's
web corpus.  (I do see maybe a 1% improvement on that, but that's
below the noise threshold so I don't take it too seriously.)  It is
possible to demonstrate noticeable improvement on handpicked regexes,
for example on HEAD:

regression=# SELECT regexp_matches('foo', 'abcdefghijklmnopq((\y|.?)+)+','');
 regexp_matches 
----------------
(0 rows)

Time: 6.297 ms

versus with patch:

regression=# SELECT regexp_matches('foo', 'abcdefghijklmnopq((\y|.?)+)+','');
 regexp_matches 
----------------
(0 rows)

Time: 5.506 ms

So this isn't entirely a waste of time, but it is marginal.  Improving
the code-coverage numbers is probably a better argument.  (0001 also
adds some test cases that exercise nearly everything that's reachable
without OOM conditions or cancels in regc_nfa.c.)

0002 below is some additional test cases to improve code coverage in
regc_locale.c and regc_pg_locale.c.  (regc_pg_locale.c is still not
great, but that's mostly because much of the code is only reachable
for particular choices of database encoding, so any one coverage
run hits just some of it.)

Barring objections, I plan to push this in a day or two; I don't
think it needs much review.

            regards, tom lane

diff --git a/src/backend/regex/regc_nfa.c b/src/backend/regex/regc_nfa.c
index 6d77c59e12..2b5ffcba8f 100644
--- a/src/backend/regex/regc_nfa.c
+++ b/src/backend/regex/regc_nfa.c
@@ -867,9 +867,37 @@ moveins(struct nfa *nfa,
     assert(oldState->ins == NULL);
 }

+/*
+ * moveinstonew - move all in arcs of a state to another state
+ *
+ * The newState must not have any existing in-arcs.
+ */
+static void
+moveinstonew(struct nfa *nfa,
+             struct state *oldState,
+             struct state *newState)
+{
+    struct arc *a;
+
+    assert(oldState != newState);
+    assert(newState->ins == NULL && newState->nins == 0);
+
+    /*
+     * Since there is no risk of creating duplicate arcs (given that none
+     * exist already), we can just use createarc directly and not spend any
+     * time de-duplicating.
+     */
+    while ((a = oldState->ins) != NULL)
+    {
+        createarc(nfa, a->type, a->co, a->from, newState);
+        freearc(nfa, a);
+    }
+}
+
 /*
  * copyins - copy in arcs of a state to another state
  */
+#ifdef NOT_USED                    /* not currently needed */
 static void
 copyins(struct nfa *nfa,
         struct state *oldState,
@@ -945,6 +973,31 @@ copyins(struct nfa *nfa,
         }
     }
 }
+#endif                            /* NOT_USED */
+
+/*
+ * copyinstonew - copy in arcs of a state to another state
+ *
+ * The newState must not have any existing in-arcs.
+ */
+static void
+copyinstonew(struct nfa *nfa,
+             struct state *oldState,
+             struct state *newState)
+{
+    struct arc *a;
+
+    assert(oldState != newState);
+    assert(newState->ins == NULL && newState->nins == 0);
+
+    /*
+     * Since there is no risk of creating duplicate arcs (given that none
+     * exist already), we can just use createarc directly and not spend any
+     * time de-duplicating.
+     */
+    for (a = oldState->ins; a != NULL; a = a->inchain)
+        createarc(nfa, a->type, a->co, a->from, newState);
+}

 /*
  * mergeins - merge a list of inarcs into a state
@@ -1140,9 +1193,37 @@ moveouts(struct nfa *nfa,
     assert(oldState->outs == NULL);
 }

+/*
+ * moveoutstonew - move all out arcs of a state to another state
+ *
+ * The newState must not have any existing out-arcs.
+ */
+static void
+moveoutstonew(struct nfa *nfa,
+              struct state *oldState,
+              struct state *newState)
+{
+    struct arc *a;
+
+    assert(oldState != newState);
+    assert(newState->outs == NULL && newState->nouts == 0);
+
+    /*
+     * Since there is no risk of creating duplicate arcs (given that none
+     * exist already), we can just use createarc directly and not spend any
+     * time de-duplicating.
+     */
+    while ((a = oldState->outs) != NULL)
+    {
+        createarc(nfa, a->type, a->co, newState, a->to);
+        freearc(nfa, a);
+    }
+}
+
 /*
  * copyouts - copy out arcs of a state to another state
  */
+#ifdef NOT_USED                    /* not currently needed */
 static void
 copyouts(struct nfa *nfa,
          struct state *oldState,
@@ -1218,6 +1299,31 @@ copyouts(struct nfa *nfa,
         }
     }
 }
+#endif                            /* NOT_USED */
+
+/*
+ * copyoutstonew - copy out arcs of a state to another state
+ *
+ * The newState must not have any existing out-arcs.
+ */
+static void
+copyoutstonew(struct nfa *nfa,
+              struct state *oldState,
+              struct state *newState)
+{
+    struct arc *a;
+
+    assert(oldState != newState);
+    assert(newState->outs == NULL && newState->nouts == 0);
+
+    /*
+     * Since there is no risk of creating duplicate arcs (given that none
+     * exist already), we can just use createarc directly and not spend any
+     * time de-duplicating.
+     */
+    for (a = oldState->outs; a != NULL; a = a->outchain)
+        createarc(nfa, a->type, a->co, newState, a->to);
+}

 /*
  * cloneouts - copy out arcs of a state to another state pair, modifying type
@@ -1712,7 +1818,7 @@ pull(struct nfa *nfa,
         s = newstate(nfa);
         if (NISERR())
             return 0;
-        copyins(nfa, from, s);    /* duplicate inarcs */
+        copyinstonew(nfa, from, s); /* duplicate inarcs */
         cparc(nfa, con, s, to); /* move constraint arc */
         freearc(nfa, con);
         if (NISERR())
@@ -1883,7 +1989,7 @@ push(struct nfa *nfa,
         s = newstate(nfa);
         if (NISERR())
             return 0;
-        copyouts(nfa, to, s);    /* duplicate outarcs */
+        copyoutstonew(nfa, to, s);    /* duplicate outarcs */
         cparc(nfa, con, from, s);    /* move constraint arc */
         freearc(nfa, con);
         if (NISERR())
@@ -1975,6 +2081,7 @@ combine(struct nfa *nfa,
             else if (a->co == RAINBOW)
             {
                 /* con is incompatible if it's for a pseudocolor */
+                /* (this is hypothetical; we make no such constraints today) */
                 if (nfa->cm->cd[con->co].flags & PSEUDO)
                     return INCOMPATIBLE;
                 /* otherwise, constraint constrains arc to be only its color */
@@ -2001,6 +2108,7 @@ combine(struct nfa *nfa,
             else if (a->co == RAINBOW)
             {
                 /* con is incompatible if it's for a pseudocolor */
+                /* (this is hypothetical; we make no such constraints today) */
                 if (nfa->cm->cd[con->co].flags & PSEUDO)
                     return INCOMPATIBLE;
                 /* otherwise, constraint constrains arc to be only its color */
@@ -3562,6 +3670,7 @@ carc_cmp(const void *a, const void *b)
         return -1;
     if (aa->to > bb->to)
         return +1;
+    /* This is unreached, since there should be no duplicate arcs now: */
     return 0;
 }

diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c
index ae3a7b6a38..979619f9a0 100644
--- a/src/backend/regex/regcomp.c
+++ b/src/backend/regex/regcomp.c
@@ -141,10 +141,12 @@ static int    sortins_cmp(const void *, const void *);
 static void sortouts(struct nfa *, struct state *);
 static int    sortouts_cmp(const void *, const void *);
 static void moveins(struct nfa *, struct state *, struct state *);
-static void copyins(struct nfa *, struct state *, struct state *);
+static void moveinstonew(struct nfa *, struct state *, struct state *);
+static void copyinstonew(struct nfa *, struct state *, struct state *);
 static void mergeins(struct nfa *, struct state *, struct arc **, int);
 static void moveouts(struct nfa *, struct state *, struct state *);
-static void copyouts(struct nfa *, struct state *, struct state *);
+static void moveoutstonew(struct nfa *, struct state *, struct state *);
+static void copyoutstonew(struct nfa *, struct state *, struct state *);
 static void cloneouts(struct nfa *, struct state *, struct state *, struct state *, int);
 static void delsub(struct nfa *, struct state *, struct state *);
 static void deltraverse(struct nfa *, struct state *, struct state *);
@@ -639,7 +641,7 @@ makesearch(struct vars *v,
     {
         s2 = newstate(nfa);
         NOERR();
-        copyouts(nfa, s, s2);
+        copyoutstonew(nfa, s, s2);
         NOERR();
         for (a = s->ins; a != NULL; a = b)
         {
@@ -752,7 +754,7 @@ parsebranch(struct vars *v,
         {                        /* implicit concat operator */
             lp = newstate(v->nfa);
             NOERRN();
-            moveins(v->nfa, right, lp);
+            moveinstonew(v->nfa, right, lp);
         }
         seencontent = 1;

@@ -1137,8 +1139,8 @@ parseqatom(struct vars *v,
         s = newstate(v->nfa);
         s2 = newstate(v->nfa);
         NOERRN();
-        moveouts(v->nfa, lp, s);
-        moveins(v->nfa, rp, s2);
+        moveoutstonew(v->nfa, lp, s);
+        moveinstonew(v->nfa, rp, s2);
         atom->begin = s;
         atom->end = s2;
     }
@@ -1290,7 +1292,7 @@ parseqatom(struct vars *v,
         /* general case: need an iteration node */
         s2 = newstate(v->nfa);
         NOERRN();
-        moveouts(v->nfa, atom->end, s2);
+        moveoutstonew(v->nfa, atom->end, s2);
         NOERRN();
         dupnfa(v->nfa, atom->begin, atom->end, s, s2);
         repeat(v, s, s2, m, n);
@@ -1552,8 +1554,8 @@ repeat(struct vars *v,
         case PAIR(0, INF):        /* loop x around */
             s = newstate(v->nfa);
             NOERR();
-            moveouts(v->nfa, lp, s);
-            moveins(v->nfa, rp, s);
+            moveoutstonew(v->nfa, lp, s);
+            moveinstonew(v->nfa, rp, s);
             EMPTYARC(lp, s);
             EMPTYARC(s, rp);
             break;
@@ -1562,7 +1564,7 @@ repeat(struct vars *v,
         case PAIR(1, SOME):        /* do as x{0,n-1}x = (x{1,n-1}|)x */
             s = newstate(v->nfa);
             NOERR();
-            moveouts(v->nfa, lp, s);
+            moveoutstonew(v->nfa, lp, s);
             dupnfa(v->nfa, s, rp, lp, s);
             NOERR();
             repeat(v, lp, s, 1, n - 1);
@@ -1573,8 +1575,8 @@ repeat(struct vars *v,
             s = newstate(v->nfa);
             s2 = newstate(v->nfa);
             NOERR();
-            moveouts(v->nfa, lp, s);
-            moveins(v->nfa, rp, s2);
+            moveoutstonew(v->nfa, lp, s);
+            moveinstonew(v->nfa, rp, s2);
             EMPTYARC(lp, s);
             EMPTYARC(s2, rp);
             EMPTYARC(s2, s);
@@ -1582,7 +1584,7 @@ repeat(struct vars *v,
         case PAIR(SOME, SOME):    /* do as x{m-1,n-1}x */
             s = newstate(v->nfa);
             NOERR();
-            moveouts(v->nfa, lp, s);
+            moveoutstonew(v->nfa, lp, s);
             dupnfa(v->nfa, s, rp, lp, s);
             NOERR();
             repeat(v, lp, s, m - 1, n - 1);
@@ -1590,7 +1592,7 @@ repeat(struct vars *v,
         case PAIR(SOME, INF):    /* do as x{m-1,}x */
             s = newstate(v->nfa);
             NOERR();
-            moveouts(v->nfa, lp, s);
+            moveoutstonew(v->nfa, lp, s);
             dupnfa(v->nfa, s, rp, lp, s);
             NOERR();
             repeat(v, lp, s, m - 1, n);
diff --git a/src/test/modules/test_regex/expected/test_regex.out b/src/test/modules/test_regex/expected/test_regex.out
index 5a6cdf47c2..611b6c7243 100644
--- a/src/test/modules/test_regex/expected/test_regex.out
+++ b/src/test/modules/test_regex/expected/test_regex.out
@@ -4926,3 +4926,59 @@ select * from test_regex('(\Y)+', 'foo', 'LNP');
  {"",""}
 (2 rows)

+-- and now, tests not from either Spencer or the Tcl project
+-- These cases exercise additional code paths in pushfwd()/push()/combine()
+select * from test_regex('a\Y(?=45)', 'a45', 'HLP');
+                  test_regex
+-----------------------------------------------
+ {0,REG_ULOOKAROUND,REG_UNONPOSIX,REG_ULOCALE}
+ {a}
+(2 rows)
+
+select * from test_regex('a(?=.)c', 'ac', 'HP');
+            test_regex
+-----------------------------------
+ {0,REG_ULOOKAROUND,REG_UNONPOSIX}
+ {ac}
+(2 rows)
+
+select * from test_regex('a(?=.).*(?=3)3*', 'azz33', 'HP');
+            test_regex
+-----------------------------------
+ {0,REG_ULOOKAROUND,REG_UNONPOSIX}
+ {azz33}
+(2 rows)
+
+select * from test_regex('a(?=\w)\w*(?=.).*', 'az3%', 'HLP');
+                  test_regex
+-----------------------------------------------
+ {0,REG_ULOOKAROUND,REG_UNONPOSIX,REG_ULOCALE}
+ {az3%}
+(2 rows)
+
+-- These exercise the bulk-arc-movement paths in moveins() and moveouts();
+-- you may need to make them longer if you change BULK_ARC_OP_USE_SORT()
+select * from test_regex('ABCDEFGHIJKLMNOPQRSTUVWXYZ(?:\w|a|b|c|d|e|f|0|1|2|3|4|5|6|Q)',
+                         'ABCDEFGHIJKLMNOPQRSTUVWXYZ3', 'LP');
+          test_regex
+-------------------------------
+ {0,REG_UNONPOSIX,REG_ULOCALE}
+ {ABCDEFGHIJKLMNOPQRSTUVWXYZ3}
+(2 rows)
+
+select * from test_regex('ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789(\Y\Y)+',
+                         'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789Z', 'LP');
+                test_regex
+-------------------------------------------
+ {1,REG_UNONPOSIX,REG_ULOCALE}
+ {ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789,""}
+(2 rows)
+
+select * from test_regex('((x|xabcdefghijklmnopqrstuvwxyz0123456789)x*|[^y]z)$',
+                         'az', '');
+  test_regex
+--------------
+ {2}
+ {az,az,NULL}
+(2 rows)
+
diff --git a/src/test/modules/test_regex/sql/test_regex.sql b/src/test/modules/test_regex/sql/test_regex.sql
index 3419564203..13a0586272 100644
--- a/src/test/modules/test_regex/sql/test_regex.sql
+++ b/src/test/modules/test_regex/sql/test_regex.sql
@@ -1741,3 +1741,21 @@ select * from test_regex(repeat('x*y*z*', 200), 'x', 'N');
 --     regexp {(\Y)+} foo
 -- } 1
 select * from test_regex('(\Y)+', 'foo', 'LNP');
+
+
+-- and now, tests not from either Spencer or the Tcl project
+
+-- These cases exercise additional code paths in pushfwd()/push()/combine()
+select * from test_regex('a\Y(?=45)', 'a45', 'HLP');
+select * from test_regex('a(?=.)c', 'ac', 'HP');
+select * from test_regex('a(?=.).*(?=3)3*', 'azz33', 'HP');
+select * from test_regex('a(?=\w)\w*(?=.).*', 'az3%', 'HLP');
+
+-- These exercise the bulk-arc-movement paths in moveins() and moveouts();
+-- you may need to make them longer if you change BULK_ARC_OP_USE_SORT()
+select * from test_regex('ABCDEFGHIJKLMNOPQRSTUVWXYZ(?:\w|a|b|c|d|e|f|0|1|2|3|4|5|6|Q)',
+                         'ABCDEFGHIJKLMNOPQRSTUVWXYZ3', 'LP');
+select * from test_regex('ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789(\Y\Y)+',
+                         'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789Z', 'LP');
+select * from test_regex('((x|xabcdefghijklmnopqrstuvwxyz0123456789)x*|[^y]z)$',
+                         'az', '');
diff --git a/src/test/modules/test_regex/expected/test_regex.out b/src/test/modules/test_regex/expected/test_regex.out
index 611b6c7243..6242d0baa9 100644
--- a/src/test/modules/test_regex/expected/test_regex.out
+++ b/src/test/modules/test_regex/expected/test_regex.out
@@ -937,6 +937,34 @@ select * from test_regex('a[[=x=]]', 'az', '+Lb');
  {0,REG_ULOCALE}
 (1 row)

+-- expectMatch    9.9b  &iL    {a[[=Y=]]}    ay    ay
+select * from test_regex('a[[=Y=]]', 'ay', 'iL');
+   test_regex
+-----------------
+ {0,REG_ULOCALE}
+ {ay}
+(2 rows)
+
+select * from test_regex('a[[=Y=]]', 'ay', 'iLb');
+   test_regex
+-----------------
+ {0,REG_ULOCALE}
+ {ay}
+(2 rows)
+
+-- expectNomatch    9.9c  &L    {a[[=Y=]]}    ay
+select * from test_regex('a[[=Y=]]', 'ay', 'L');
+   test_regex
+-----------------
+ {0,REG_ULOCALE}
+(1 row)
+
+select * from test_regex('a[[=Y=]]', 'ay', 'Lb');
+   test_regex
+-----------------
+ {0,REG_ULOCALE}
+(1 row)
+
 -- expectError    9.10 &        {a[0-[=x=]]}    ERANGE
 select * from test_regex('a[0-[=x=]]', '', '');
 ERROR:  invalid regular expression: invalid character range
@@ -2932,6 +2960,34 @@ select * from test_regex('a[^b-d]', 'aC', 'iMb');
  {0,REG_UUNPORT}
 (1 row)

+-- expectMatch    19.6 &iM    {a[B-Z]}    aC    aC
+select * from test_regex('a[B-Z]', 'aC', 'iM');
+   test_regex
+-----------------
+ {0,REG_UUNPORT}
+ {aC}
+(2 rows)
+
+select * from test_regex('a[B-Z]', 'aC', 'iMb');
+   test_regex
+-----------------
+ {0,REG_UUNPORT}
+ {aC}
+(2 rows)
+
+-- expectNomatch    19.7 &iM    {a[^B-Z]}    aC
+select * from test_regex('a[^B-Z]', 'aC', 'iM');
+   test_regex
+-----------------
+ {0,REG_UUNPORT}
+(1 row)
+
+select * from test_regex('a[^B-Z]', 'aC', 'iMb');
+   test_regex
+-----------------
+ {0,REG_UUNPORT}
+(1 row)
+
 -- doing 20 "directors and embedded options"
 -- expectError    20.1  &        ***?        BADPAT
 select * from test_regex('***?', '', '');
@@ -3850,6 +3906,14 @@ select * from test_regex('^([^/]+?)(?:/([^/]+?))(?:/([^/]+?))?$', 'foo/bar/baz',
  {foo/bar/baz,foo,bar,baz}
 (2 rows)

+-- expectMatch    24.14 PRT    {^(.+?)(?:/(.+?))(?:/(.+?)\3)?$}    {foo/bar/baz/quux}    {foo/bar/baz/quux}    {foo}    {bar/baz/quux}    {}
+select * from test_regex('^(.+?)(?:/(.+?))(?:/(.+?)\3)?$', 'foo/bar/baz/quux', 'PRT');
+                  test_regex
+----------------------------------------------
+ {3,REG_UBACKREF,REG_UNONPOSIX,REG_USHORTEST}
+ {foo/bar/baz/quux,foo,bar/baz/quux,NULL}
+(2 rows)
+
 -- doing 25 "mixed quantifiers"
 -- # this is very incomplete as yet
 -- # should include |
diff --git a/src/test/modules/test_regex/expected/test_regex_utf8.out b/src/test/modules/test_regex/expected/test_regex_utf8.out
index 112698ac61..3b56f36c07 100644
--- a/src/test/modules/test_regex/expected/test_regex_utf8.out
+++ b/src/test/modules/test_regex/expected/test_regex_utf8.out
@@ -98,3 +98,109 @@ select * from test_regex('[[:alnum:]]*[[:upper:]]*[\u1000-\u2000]*\u1237',
  {0,REG_UBBS,REG_UNONPOSIX,REG_UUNPORT,REG_ULOCALE}
 (1 row)

+select * from test_regex('[[:alnum:]]*[[:upper:]]*[\u1000-\u2000]*\u1237',
+  E'\u1500\u1237', 'iELMP');
+                     test_regex
+----------------------------------------------------
+ {0,REG_UBBS,REG_UNONPOSIX,REG_UUNPORT,REG_ULOCALE}
+ {ᔀሷ}
+(2 rows)
+
+-- systematically test char classes
+select * from test_regex('[[:alnum:]]+',  E'x\u1500\u1237', 'L');
+   test_regex
+-----------------
+ {0,REG_ULOCALE}
+ {xᔀሷ}
+(2 rows)
+
+select * from test_regex('[[:alpha:]]+',  E'x\u1500\u1237', 'L');
+   test_regex
+-----------------
+ {0,REG_ULOCALE}
+ {xᔀሷ}
+(2 rows)
+
+select * from test_regex('[[:ascii:]]+',  E'x\u1500\u1237', 'L');
+   test_regex
+-----------------
+ {0,REG_ULOCALE}
+ {x}
+(2 rows)
+
+select * from test_regex('[[:blank:]]+',  E'x \t\u1500\u1237', 'L');
+   test_regex
+-----------------
+ {0,REG_ULOCALE}
+ {"      "}
+(2 rows)
+
+select * from test_regex('[[:cntrl:]]+',  E'x\u1500\u1237', 'L');
+   test_regex
+-----------------
+ {0,REG_ULOCALE}
+(1 row)
+
+select * from test_regex('[[:digit:]]+',  E'x9\u1500\u1237', 'L');
+   test_regex
+-----------------
+ {0,REG_ULOCALE}
+ {9}
+(2 rows)
+
+select * from test_regex('[[:graph:]]+',  E'x\u1500\u1237', 'L');
+   test_regex
+-----------------
+ {0,REG_ULOCALE}
+ {xᔀሷ}
+(2 rows)
+
+select * from test_regex('[[:lower:]]+',  E'x\u1500\u1237', 'L');
+   test_regex
+-----------------
+ {0,REG_ULOCALE}
+ {x}
+(2 rows)
+
+select * from test_regex('[[:print:]]+',  E'x\u1500\u1237', 'L');
+   test_regex
+-----------------
+ {0,REG_ULOCALE}
+ {xᔀሷ}
+(2 rows)
+
+select * from test_regex('[[:punct:]]+',  E'x.\u1500\u1237', 'L');
+   test_regex
+-----------------
+ {0,REG_ULOCALE}
+ {.}
+(2 rows)
+
+select * from test_regex('[[:space:]]+',  E'x \t\u1500\u1237', 'L');
+   test_regex
+-----------------
+ {0,REG_ULOCALE}
+ {"      "}
+(2 rows)
+
+select * from test_regex('[[:upper:]]+',  E'xX\u1500\u1237', 'L');
+   test_regex
+-----------------
+ {0,REG_ULOCALE}
+ {X}
+(2 rows)
+
+select * from test_regex('[[:xdigit:]]+',  E'xa9\u1500\u1237', 'L');
+   test_regex
+-----------------
+ {0,REG_ULOCALE}
+ {a9}
+(2 rows)
+
+select * from test_regex('[[:word:]]+',  E'x_\u1500\u1237', 'L');
+   test_regex
+-----------------
+ {0,REG_ULOCALE}
+ {x_ᔀሷ}
+(2 rows)
+
diff --git a/src/test/modules/test_regex/sql/test_regex.sql b/src/test/modules/test_regex/sql/test_regex.sql
index 13a0586272..389b8b61b3 100644
--- a/src/test/modules/test_regex/sql/test_regex.sql
+++ b/src/test/modules/test_regex/sql/test_regex.sql
@@ -304,6 +304,12 @@ select * from test_regex('a[[=x=]]', 'ay', '+Lb');
 -- expectNomatch    9.9  &+L    {a[[=x=]]}    az
 select * from test_regex('a[[=x=]]', 'az', '+L');
 select * from test_regex('a[[=x=]]', 'az', '+Lb');
+-- expectMatch    9.9b  &iL    {a[[=Y=]]}    ay    ay
+select * from test_regex('a[[=Y=]]', 'ay', 'iL');
+select * from test_regex('a[[=Y=]]', 'ay', 'iLb');
+-- expectNomatch    9.9c  &L    {a[[=Y=]]}    ay
+select * from test_regex('a[[=Y=]]', 'ay', 'L');
+select * from test_regex('a[[=Y=]]', 'ay', 'Lb');
 -- expectError    9.10 &        {a[0-[=x=]]}    ERANGE
 select * from test_regex('a[0-[=x=]]', '', '');
 select * from test_regex('a[0-[=x=]]', '', 'b');
@@ -864,6 +870,12 @@ select * from test_regex('a[b-d]', 'aC', 'iMb');
 -- expectNomatch    19.5 &iM    {a[^b-d]}    aC
 select * from test_regex('a[^b-d]', 'aC', 'iM');
 select * from test_regex('a[^b-d]', 'aC', 'iMb');
+-- expectMatch    19.6 &iM    {a[B-Z]}    aC    aC
+select * from test_regex('a[B-Z]', 'aC', 'iM');
+select * from test_regex('a[B-Z]', 'aC', 'iMb');
+-- expectNomatch    19.7 &iM    {a[^B-Z]}    aC
+select * from test_regex('a[^B-Z]', 'aC', 'iM');
+select * from test_regex('a[^B-Z]', 'aC', 'iMb');

 -- doing 20 "directors and embedded options"

@@ -1171,6 +1183,8 @@ select * from test_regex('z*4', '123zzzz456', '-');
 select * from test_regex('z*?4', '123zzzz456', 'PT');
 -- expectMatch    24.13 PT    {^([^/]+?)(?:/([^/]+?))(?:/([^/]+?))?$}    {foo/bar/baz}    {foo/bar/baz} {foo} {bar} {baz}
 select * from test_regex('^([^/]+?)(?:/([^/]+?))(?:/([^/]+?))?$', 'foo/bar/baz', 'PT');
+-- expectMatch    24.14 PRT    {^(.+?)(?:/(.+?))(?:/(.+?)\3)?$}    {foo/bar/baz/quux}    {foo/bar/baz/quux}    {foo}    {bar/baz/quux}    {}
+select * from test_regex('^(.+?)(?:/(.+?))(?:/(.+?)\3)?$', 'foo/bar/baz/quux', 'PRT');

 -- doing 25 "mixed quantifiers"
 -- # this is very incomplete as yet
diff --git a/src/test/modules/test_regex/sql/test_regex_utf8.sql b/src/test/modules/test_regex/sql/test_regex_utf8.sql
index cfd9396194..f23907162e 100644
--- a/src/test/modules/test_regex/sql/test_regex_utf8.sql
+++ b/src/test/modules/test_regex/sql/test_regex_utf8.sql
@@ -58,3 +58,21 @@ select * from test_regex('[[:alnum:]]*[[:upper:]]*[\u1000-\u2000]*\u1237',
   E'\u1500\u1237', 'ELMP');
 select * from test_regex('[[:alnum:]]*[[:upper:]]*[\u1000-\u2000]*\u1237',
   E'A\u1239', 'ELMP');
+select * from test_regex('[[:alnum:]]*[[:upper:]]*[\u1000-\u2000]*\u1237',
+  E'\u1500\u1237', 'iELMP');
+
+-- systematically test char classes
+select * from test_regex('[[:alnum:]]+',  E'x\u1500\u1237', 'L');
+select * from test_regex('[[:alpha:]]+',  E'x\u1500\u1237', 'L');
+select * from test_regex('[[:ascii:]]+',  E'x\u1500\u1237', 'L');
+select * from test_regex('[[:blank:]]+',  E'x \t\u1500\u1237', 'L');
+select * from test_regex('[[:cntrl:]]+',  E'x\u1500\u1237', 'L');
+select * from test_regex('[[:digit:]]+',  E'x9\u1500\u1237', 'L');
+select * from test_regex('[[:graph:]]+',  E'x\u1500\u1237', 'L');
+select * from test_regex('[[:lower:]]+',  E'x\u1500\u1237', 'L');
+select * from test_regex('[[:print:]]+',  E'x\u1500\u1237', 'L');
+select * from test_regex('[[:punct:]]+',  E'x.\u1500\u1237', 'L');
+select * from test_regex('[[:space:]]+',  E'x \t\u1500\u1237', 'L');
+select * from test_regex('[[:upper:]]+',  E'xX\u1500\u1237', 'L');
+select * from test_regex('[[:xdigit:]]+',  E'xa9\u1500\u1237', 'L');
+select * from test_regex('[[:word:]]+',  E'x_\u1500\u1237', 'L');

pgsql-hackers by date:

Previous
From: Masahiko Sawada
Date:
Subject: Re: Added schema level support for publication.
Next
From: David Fetter
Date:
Subject: Re: Default to TIMESTAMP WITH TIME ZONE?