Thread: Fix quadratic performance of regexp match/split functions

Fix quadratic performance of regexp match/split functions

From
Andrew Gierth
Date:
While poking at that xml issue I tried to compare the memory usage of
the xml queries with vaguely comparable regexp_split_to_table queries,
and found that I could not do so; the regexp queries simply did not
complete in any sensible timeframe.

Investigation showed that in UTF8 (or other multibyte encodings), the
performance of regexp_matches and regexp_split_to_* is subject to
quadratic slowdowns thanks to the use of character offsets and calls to
text_substr, which must scan the string from the start each time to
count characters.

This patch changes that by keeping the wide-char copy of the string
available rather than freeing it, and converting the required substrings
back to the db encoding via a pre-allocated conversion buffer.

This gives noticable speedups on even very small strings (all timings
below are with -O2 --disable-cassert):

select count(*)
  from (select 'aaaaa,aaaaa,aaaaa'::text as content
          from generate_series(1,1000000) i offset 0) s,
       regexp_split_to_table(content, ',') r;

-- ~10% faster with patch: 2.8 s -> 2.5 s

but on strings of even modest size (~1kb) the improvement is vast:

select count(*)
  from (select repeat(repeat('a',10) || ',', 100+(i*0))::text
                 as content -- 1100 bytes, 101 matches
          from generate_series(1,100000) i offset 0) s,
       regexp_split_to_table(content, ',') r;

-- over 8 times faster: 51.8 sec -> 6.3 sec

and it only gets bigger:

select count(*)
  from regexp_split_to_table(repeat('aaa,',10000), ',') r;  -- 40KB

-- 270 times faster: 1628ms -> 6ms

This patch passes regression but I haven't yet tested it on complex
multibyte data (or non-UTF8 encodings but that shouldn't matter).

Comments?

-- 
Andrew (irc:RhodiumToad)

diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c
index 5025a449fb..8839fb4ce2 100644
--- a/src/backend/utils/adt/regexp.c
+++ b/src/backend/utils/adt/regexp.c
@@ -61,6 +61,9 @@ typedef struct regexp_matches_ctx
     /* workspace for build_regexp_match_result() */
     Datum       *elems;            /* has npatterns elements */
     bool       *nulls;            /* has npatterns elements */
+    pg_wchar   *wide_str;        /* wide-char version of original string */
+    char       *conv_buf;        /* conversion buffer */
+    size_t        conv_bufsiz;    /* size thereof */
 } regexp_matches_ctx;
 
 /*
@@ -111,7 +114,8 @@ static regexp_matches_ctx *setup_regexp_matches(text *orig_str, text *pattern,
                      pg_re_flags *flags,
                      Oid collation,
                      bool use_subpatterns,
-                     bool ignore_degenerate);
+                     bool ignore_degenerate,
+                     bool fetching_unmatched);
 static void cleanup_regexp_matches(regexp_matches_ctx *matchctx);
 static ArrayType *build_regexp_match_result(regexp_matches_ctx *matchctx);
 static Datum build_regexp_split_result(regexp_matches_ctx *splitctx);
@@ -863,7 +867,7 @@ regexp_match(PG_FUNCTION_ARGS)
                  errhint("Use the regexp_matches function instead.")));
 
     matchctx = setup_regexp_matches(orig_str, pattern, &re_flags,
-                                    PG_GET_COLLATION(), true, false);
+                                    PG_GET_COLLATION(), true, false, false);
 
     if (matchctx->nmatches == 0)
         PG_RETURN_NULL();
@@ -911,7 +915,7 @@ regexp_matches(PG_FUNCTION_ARGS)
         matchctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern,
                                         &re_flags,
                                         PG_GET_COLLATION(),
-                                        true, false);
+                                        true, false, false);
 
         /* Pre-create workspace that build_regexp_match_result needs */
         matchctx->elems = (Datum *) palloc(sizeof(Datum) * matchctx->npatterns);
@@ -954,7 +958,7 @@ regexp_matches_no_flags(PG_FUNCTION_ARGS)
  * all the matching in one swoop.  The returned regexp_matches_ctx contains
  * the locations of all the substrings matching the pattern.
  *
- * The two bool parameters have only two patterns (one for matching, one for
+ * The three bool parameters have only two patterns (one for matching, one for
  * splitting) but it seems clearer to distinguish the functionality this way
  * than to key it all off one "is_split" flag.
  */
@@ -962,9 +966,11 @@ static regexp_matches_ctx *
 setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
                      Oid collation,
                      bool use_subpatterns,
-                     bool ignore_degenerate)
+                     bool ignore_degenerate,
+                     bool fetching_unmatched)
 {
     regexp_matches_ctx *matchctx = palloc0(sizeof(regexp_matches_ctx));
+    int            eml = pg_database_encoding_max_length();
     int            orig_len;
     pg_wchar   *wide_str;
     int            wide_len;
@@ -975,6 +981,7 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
     int            array_idx;
     int            prev_match_end;
     int            start_search;
+    int            maxlen = 0;        /* largest fetch length in characters */
 
     /* save original string --- we'll extract result substrings from it */
     matchctx->orig_str = orig_str;
@@ -1024,7 +1031,7 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
              pmatch[0].rm_eo > prev_match_end))
         {
             /* enlarge output space if needed */
-            while (array_idx + matchctx->npatterns * 2 > array_len)
+            while (array_idx + matchctx->npatterns * 2 + 1 > array_len)
             {
                 array_len *= 2;
                 matchctx->match_locs = (int *) repalloc(matchctx->match_locs,
@@ -1038,16 +1045,29 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
 
                 for (i = 1; i <= matchctx->npatterns; i++)
                 {
-                    matchctx->match_locs[array_idx++] = pmatch[i].rm_so;
-                    matchctx->match_locs[array_idx++] = pmatch[i].rm_eo;
+                    int        so = pmatch[i].rm_so;
+                    int        eo = pmatch[i].rm_eo;
+                    matchctx->match_locs[array_idx++] = so;
+                    matchctx->match_locs[array_idx++] = eo;
+                    if (so >= 0 && eo >= 0 && (eo - so) > maxlen)
+                        maxlen = (eo - so);
                 }
             }
             else
             {
-                matchctx->match_locs[array_idx++] = pmatch[0].rm_so;
-                matchctx->match_locs[array_idx++] = pmatch[0].rm_eo;
+                int        so = pmatch[0].rm_so;
+                int        eo = pmatch[0].rm_eo;
+                matchctx->match_locs[array_idx++] = so;
+                matchctx->match_locs[array_idx++] = eo;
+                if (so >= 0 && eo >= 0 && (eo - so) > maxlen)
+                    maxlen = (eo - so);
             }
             matchctx->nmatches++;
+
+            if (fetching_unmatched &&
+                pmatch[0].rm_so >= 0 &&
+                (pmatch[0].rm_so - prev_match_end) > maxlen)
+                maxlen = (pmatch[0].rm_so - prev_match_end);
         }
         prev_match_end = pmatch[0].rm_eo;
 
@@ -1068,8 +1088,45 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
             break;
     }
 
+    if (fetching_unmatched &&
+        (wide_len - prev_match_end) > maxlen)
+        maxlen = (wide_len - prev_match_end);
+
+    /*
+     * Keep a note of the end position of the string for the benefit of
+     * splitting code.
+     */
+    matchctx->match_locs[array_idx] = wide_len;
+
+    if (eml > 1)
+    {
+        int64        maxsiz = (int64) maxlen * eml;
+        size_t        conv_bufsiz;
+
+        /*
+         * worst case: assume we need the maximum size (maxlen*eml), but take
+         * advantage of the fact that the original string length in bytes is an
+         * upper bound on the byte length of any fetched substring (and we know
+         * that len+1 is safe to allocate because the varlena header is longer
+         * than 1 byte).
+         */
+        if (maxsiz > orig_len)
+            conv_bufsiz = (size_t) orig_len + 1;
+        else
+            conv_bufsiz = (size_t) maxsiz + 1;    /* safe since maxsiz < 2^30 */
+        matchctx->conv_buf = palloc(conv_bufsiz);
+        matchctx->conv_bufsiz = conv_bufsiz;
+        matchctx->wide_str = wide_str;
+    }
+    else
+    {
+        pfree(wide_str);
+        matchctx->wide_str = NULL;
+        matchctx->conv_buf = NULL;
+        matchctx->conv_bufsiz = 0;
+    }
+
     /* Clean up temp storage */
-    pfree(wide_str);
     pfree(pmatch);
 
     return matchctx;
@@ -1082,6 +1139,10 @@ static void
 cleanup_regexp_matches(regexp_matches_ctx *matchctx)
 {
     pfree(matchctx->orig_str);
+    if (matchctx->wide_str)
+        pfree(matchctx->wide_str);
+    if (matchctx->conv_buf)
+        pfree(matchctx->conv_buf);
     pfree(matchctx->match_locs);
     if (matchctx->elems)
         pfree(matchctx->elems);
@@ -1096,6 +1157,7 @@ cleanup_regexp_matches(regexp_matches_ctx *matchctx)
 static ArrayType *
 build_regexp_match_result(regexp_matches_ctx *matchctx)
 {
+    int            eml = pg_database_encoding_max_length();
     Datum       *elems = matchctx->elems;
     bool       *nulls = matchctx->nulls;
     int            dims[1];
@@ -1115,6 +1177,17 @@ build_regexp_match_result(regexp_matches_ctx *matchctx)
             elems[i] = (Datum) 0;
             nulls[i] = true;
         }
+        else if (eml > 1)
+        {
+            size_t     bufsiz PG_USED_FOR_ASSERTS_ONLY = matchctx->conv_bufsiz;
+            char   *buf = matchctx->conv_buf;
+            int        len = pg_wchar2mb_with_len(matchctx->wide_str + so,
+                                               buf,
+                                               eo - so);
+            Assert(len < bufsiz);
+            elems[i] = PointerGetDatum(cstring_to_text_with_len(buf, len));
+            nulls[i] = false;
+        }
         else
         {
             elems[i] = DirectFunctionCall3(text_substr,
@@ -1168,7 +1241,7 @@ regexp_split_to_table(PG_FUNCTION_ARGS)
         splitctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern,
                                         &re_flags,
                                         PG_GET_COLLATION(),
-                                        false, true);
+                                        false, true, true);
 
         MemoryContextSwitchTo(oldcontext);
         funcctx->user_fctx = (void *) splitctx;
@@ -1224,7 +1297,7 @@ regexp_split_to_array(PG_FUNCTION_ARGS)
                                     PG_GETARG_TEXT_PP(1),
                                     &re_flags,
                                     PG_GET_COLLATION(),
-                                    false, true);
+                                    false, true, true);
 
     while (splitctx->next_match <= splitctx->nmatches)
     {
@@ -1261,6 +1334,7 @@ regexp_split_to_array_no_flags(PG_FUNCTION_ARGS)
 static Datum
 build_regexp_split_result(regexp_matches_ctx *splitctx)
 {
+    int            eml = pg_database_encoding_max_length();
     int            startpos;
     int            endpos;
 
@@ -1271,7 +1345,22 @@ build_regexp_split_result(regexp_matches_ctx *splitctx)
     if (startpos < 0)
         elog(ERROR, "invalid match ending position");
 
-    if (splitctx->next_match < splitctx->nmatches)
+    if (eml > 1)
+    {
+        size_t    bufsiz PG_USED_FOR_ASSERTS_ONLY = splitctx->conv_bufsiz;
+        char   *buf = splitctx->conv_buf;
+        int        len;
+
+        endpos = splitctx->match_locs[splitctx->next_match * 2];
+        if (endpos < startpos)
+            elog(ERROR, "invalid match starting position");
+        len = pg_wchar2mb_with_len(splitctx->wide_str + startpos,
+                                   buf,
+                                   endpos-startpos);
+        Assert(len < bufsiz);
+        return PointerGetDatum(cstring_to_text_with_len(buf, len));
+    }
+    else if (splitctx->next_match < splitctx->nmatches)
     {
         endpos = splitctx->match_locs[splitctx->next_match * 2];
         if (endpos < startpos)

Re: Fix quadratic performance of regexp match/split functions

From
Andrew Gierth
Date:
>>>>> "Andrew" == Andrew Gierth <andrew@tao11.riddles.org.uk> writes:

Patch take 2. Changes:

1. Remove cleanup function with retail pfree()s; this was added in
commit ae65ca312 (Aug 2007) to fix an actual memory leak, but obsoleted
by commit ff428cded (Feb 2008); since then, the pfrees were pointless
since all the freed objects were in a memory context that was
immediately destroyed.

2. Use presence of a conversion buffer as a flag rather than call
pg_database_encoding_max_length() everywhere.

3. Increase limit on number of matches to 134 million and provide an
error message when it is reached (rather than an ugly invalid memory
request error). This limit could be removed by using repalloc_huge, but
that should probably be paired with equivalent changes in RE_execute and
done in a separate patch.

4. Disuse size_t in favour of int for sizes that can't overflow an int,
to avoid any chance of signed/unsigned mixups.

5. Remove special-case "substring to end of string" logic in the
single-byte case; adding an end-of-string position to the end of the
matches array makes it unnecessary and there's no performance benefit.

6. Moar commentz.

-- 
Andrew (irc:RhodiumToad)

diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c
index 5025a449fb..982b419c9d 100644
--- a/src/backend/utils/adt/regexp.c
+++ b/src/backend/utils/adt/regexp.c
@@ -61,6 +61,9 @@ typedef struct regexp_matches_ctx
     /* workspace for build_regexp_match_result() */
     Datum       *elems;            /* has npatterns elements */
     bool       *nulls;            /* has npatterns elements */
+    pg_wchar   *wide_str;        /* wide-char version of original string */
+    char       *conv_buf;        /* conversion buffer */
+    int            conv_bufsiz;    /* size thereof */
 } regexp_matches_ctx;
 
 /*
@@ -111,8 +114,8 @@ static regexp_matches_ctx *setup_regexp_matches(text *orig_str, text *pattern,
                      pg_re_flags *flags,
                      Oid collation,
                      bool use_subpatterns,
-                     bool ignore_degenerate);
-static void cleanup_regexp_matches(regexp_matches_ctx *matchctx);
+                     bool ignore_degenerate,
+                     bool fetching_unmatched);
 static ArrayType *build_regexp_match_result(regexp_matches_ctx *matchctx);
 static Datum build_regexp_split_result(regexp_matches_ctx *splitctx);
 
@@ -863,7 +866,7 @@ regexp_match(PG_FUNCTION_ARGS)
                  errhint("Use the regexp_matches function instead.")));
 
     matchctx = setup_regexp_matches(orig_str, pattern, &re_flags,
-                                    PG_GET_COLLATION(), true, false);
+                                    PG_GET_COLLATION(), true, false, false);
 
     if (matchctx->nmatches == 0)
         PG_RETURN_NULL();
@@ -911,7 +914,7 @@ regexp_matches(PG_FUNCTION_ARGS)
         matchctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern,
                                         &re_flags,
                                         PG_GET_COLLATION(),
-                                        true, false);
+                                        true, false, false);
 
         /* Pre-create workspace that build_regexp_match_result needs */
         matchctx->elems = (Datum *) palloc(sizeof(Datum) * matchctx->npatterns);
@@ -933,9 +936,6 @@ regexp_matches(PG_FUNCTION_ARGS)
         SRF_RETURN_NEXT(funcctx, PointerGetDatum(result_ary));
     }
 
-    /* release space in multi-call ctx to avoid intraquery memory leak */
-    cleanup_regexp_matches(matchctx);
-
     SRF_RETURN_DONE(funcctx);
 }
 
@@ -954,17 +954,24 @@ regexp_matches_no_flags(PG_FUNCTION_ARGS)
  * all the matching in one swoop.  The returned regexp_matches_ctx contains
  * the locations of all the substrings matching the pattern.
  *
- * The two bool parameters have only two patterns (one for matching, one for
+ * The three bool parameters have only two patterns (one for matching, one for
  * splitting) but it seems clearer to distinguish the functionality this way
- * than to key it all off one "is_split" flag.
+ * than to key it all off one "is_split" flag. We don't currently assume that
+ * fetching_unmatched is exclusive of fetching the matched text too; if it's
+ * set, the conversion buffer is large enough to fetch any single matched or
+ * unmatched string, but not any larger substring. (In practice, when splitting
+ * the matches are usually small anyway, and it didn't seem worth complicating
+ * the code further.)
  */
 static regexp_matches_ctx *
 setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
                      Oid collation,
                      bool use_subpatterns,
-                     bool ignore_degenerate)
+                     bool ignore_degenerate,
+                     bool fetching_unmatched)
 {
     regexp_matches_ctx *matchctx = palloc0(sizeof(regexp_matches_ctx));
+    int            eml = pg_database_encoding_max_length();
     int            orig_len;
     pg_wchar   *wide_str;
     int            wide_len;
@@ -975,6 +982,7 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
     int            array_idx;
     int            prev_match_end;
     int            start_search;
+    int            maxlen = 0;        /* largest fetch length in characters */
 
     /* save original string --- we'll extract result substrings from it */
     matchctx->orig_str = orig_str;
@@ -1003,8 +1011,13 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
     /* temporary output space for RE package */
     pmatch = palloc(sizeof(regmatch_t) * pmatch_len);
 
-    /* the real output space (grown dynamically if needed) */
-    array_len = re_flags->glob ? 256 : 32;
+    /*
+     * the real output space (grown dynamically if needed)
+     *
+     * use values 2^n-1, not 2^n, so that we hit the limit at 2^28-1 rather
+     * than at 2^27
+     */
+    array_len = re_flags->glob ? 255 : 31;
     matchctx->match_locs = (int *) palloc(sizeof(int) * array_len);
     array_idx = 0;
 
@@ -1024,9 +1037,13 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
              pmatch[0].rm_eo > prev_match_end))
         {
             /* enlarge output space if needed */
-            while (array_idx + matchctx->npatterns * 2 > array_len)
+            while (array_idx + matchctx->npatterns * 2 + 1 > array_len)
             {
-                array_len *= 2;
+                array_len += array_len + 1;        /* 2^n-1 => 2^(n+1)-1 */
+                if (array_len > MaxAllocSize/sizeof(int))
+                    ereport(ERROR,
+                            (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+                             errmsg("too many regular expression matches")));
                 matchctx->match_locs = (int *) repalloc(matchctx->match_locs,
                                                         sizeof(int) * array_len);
             }
@@ -1038,16 +1055,33 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
 
                 for (i = 1; i <= matchctx->npatterns; i++)
                 {
-                    matchctx->match_locs[array_idx++] = pmatch[i].rm_so;
-                    matchctx->match_locs[array_idx++] = pmatch[i].rm_eo;
+                    int        so = pmatch[i].rm_so;
+                    int        eo = pmatch[i].rm_eo;
+                    matchctx->match_locs[array_idx++] = so;
+                    matchctx->match_locs[array_idx++] = eo;
+                    if (so >= 0 && eo >= 0 && (eo - so) > maxlen)
+                        maxlen = (eo - so);
                 }
             }
             else
             {
-                matchctx->match_locs[array_idx++] = pmatch[0].rm_so;
-                matchctx->match_locs[array_idx++] = pmatch[0].rm_eo;
+                int        so = pmatch[0].rm_so;
+                int        eo = pmatch[0].rm_eo;
+                matchctx->match_locs[array_idx++] = so;
+                matchctx->match_locs[array_idx++] = eo;
+                if (so >= 0 && eo >= 0 && (eo - so) > maxlen)
+                    maxlen = (eo - so);
             }
             matchctx->nmatches++;
+
+            /*
+             * check length of unmatched portion between end of previous match
+             * and start of current one
+             */
+            if (fetching_unmatched &&
+                pmatch[0].rm_so >= 0 &&
+                (pmatch[0].rm_so - prev_match_end) > maxlen)
+                maxlen = (pmatch[0].rm_so - prev_match_end);
         }
         prev_match_end = pmatch[0].rm_eo;
 
@@ -1068,34 +1102,67 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
             break;
     }
 
+    /*
+     * check length of unmatched portion between end of last match and end of
+     * input string
+     */
+    if (fetching_unmatched &&
+        (wide_len - prev_match_end) > maxlen)
+        maxlen = (wide_len - prev_match_end);
+
+    /*
+     * Keep a note of the end position of the string for the benefit of
+     * splitting code.
+     */
+    matchctx->match_locs[array_idx] = wide_len;
+
+    if (eml > 1)
+    {
+        int64        maxsiz = eml * (int64) maxlen;
+        int            conv_bufsiz;
+
+        /*
+         * Make the conversion buffer large enough for any substring of
+         * interest.
+         *
+         * Worst case: assume we need the maximum size (maxlen*eml), but take
+         * advantage of the fact that the original string length in bytes is an
+         * upper bound on the byte length of any fetched substring (and we know
+         * that len+1 is safe to allocate because the varlena header is longer
+         * than 1 byte).
+         */
+        if (maxsiz > orig_len)
+            conv_bufsiz = orig_len + 1;
+        else
+            conv_bufsiz = maxsiz + 1;    /* safe since maxsiz < 2^30 */
+
+        matchctx->conv_buf = palloc(conv_bufsiz);
+        matchctx->conv_bufsiz = conv_bufsiz;
+        matchctx->wide_str = wide_str;
+    }
+    else
+    {
+        /* No need to keep the wide string if we're in a single-byte charset. */
+        pfree(wide_str);
+        matchctx->wide_str = NULL;
+        matchctx->conv_buf = NULL;
+        matchctx->conv_bufsiz = 0;
+    }
+
     /* Clean up temp storage */
-    pfree(wide_str);
     pfree(pmatch);
 
     return matchctx;
 }
 
 /*
- * cleanup_regexp_matches - release memory of a regexp_matches_ctx
- */
-static void
-cleanup_regexp_matches(regexp_matches_ctx *matchctx)
-{
-    pfree(matchctx->orig_str);
-    pfree(matchctx->match_locs);
-    if (matchctx->elems)
-        pfree(matchctx->elems);
-    if (matchctx->nulls)
-        pfree(matchctx->nulls);
-    pfree(matchctx);
-}
-
-/*
  * build_regexp_match_result - build output array for current match
  */
 static ArrayType *
 build_regexp_match_result(regexp_matches_ctx *matchctx)
 {
+    char       *buf = matchctx->conv_buf;
+    int            bufsiz PG_USED_FOR_ASSERTS_ONLY = matchctx->conv_bufsiz;
     Datum       *elems = matchctx->elems;
     bool       *nulls = matchctx->nulls;
     int            dims[1];
@@ -1115,6 +1182,15 @@ build_regexp_match_result(regexp_matches_ctx *matchctx)
             elems[i] = (Datum) 0;
             nulls[i] = true;
         }
+        else if (buf)
+        {
+            int        len = pg_wchar2mb_with_len(matchctx->wide_str + so,
+                                               buf,
+                                               eo - so);
+            Assert(len < bufsiz);
+            elems[i] = PointerGetDatum(cstring_to_text_with_len(buf, len));
+            nulls[i] = false;
+        }
         else
         {
             elems[i] = DirectFunctionCall3(text_substr,
@@ -1168,7 +1244,7 @@ regexp_split_to_table(PG_FUNCTION_ARGS)
         splitctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern,
                                         &re_flags,
                                         PG_GET_COLLATION(),
-                                        false, true);
+                                        false, true, true);
 
         MemoryContextSwitchTo(oldcontext);
         funcctx->user_fctx = (void *) splitctx;
@@ -1185,9 +1261,6 @@ regexp_split_to_table(PG_FUNCTION_ARGS)
         SRF_RETURN_NEXT(funcctx, result);
     }
 
-    /* release space in multi-call ctx to avoid intraquery memory leak */
-    cleanup_regexp_matches(splitctx);
-
     SRF_RETURN_DONE(funcctx);
 }
 
@@ -1224,7 +1297,7 @@ regexp_split_to_array(PG_FUNCTION_ARGS)
                                     PG_GETARG_TEXT_PP(1),
                                     &re_flags,
                                     PG_GET_COLLATION(),
-                                    false, true);
+                                    false, true, true);
 
     while (splitctx->next_match <= splitctx->nmatches)
     {
@@ -1236,12 +1309,6 @@ regexp_split_to_array(PG_FUNCTION_ARGS)
         splitctx->next_match++;
     }
 
-    /*
-     * We don't call cleanup_regexp_matches here; it would try to pfree the
-     * input string, which we didn't copy.  The space is not in a long-lived
-     * memory context anyway.
-     */
-
     PG_RETURN_ARRAYTYPE_P(makeArrayResult(astate, CurrentMemoryContext));
 }
 
@@ -1261,6 +1328,7 @@ regexp_split_to_array_no_flags(PG_FUNCTION_ARGS)
 static Datum
 build_regexp_split_result(regexp_matches_ctx *splitctx)
 {
+    char       *buf = splitctx->conv_buf;
     int            startpos;
     int            endpos;
 
@@ -1271,22 +1339,29 @@ build_regexp_split_result(regexp_matches_ctx *splitctx)
     if (startpos < 0)
         elog(ERROR, "invalid match ending position");
 
-    if (splitctx->next_match < splitctx->nmatches)
+    if (buf)
     {
+        int        bufsiz PG_USED_FOR_ASSERTS_ONLY = splitctx->conv_bufsiz;
+        int        len;
+
         endpos = splitctx->match_locs[splitctx->next_match * 2];
         if (endpos < startpos)
             elog(ERROR, "invalid match starting position");
-        return DirectFunctionCall3(text_substr,
-                                   PointerGetDatum(splitctx->orig_str),
-                                   Int32GetDatum(startpos + 1),
-                                   Int32GetDatum(endpos - startpos));
+        len = pg_wchar2mb_with_len(splitctx->wide_str + startpos,
+                                   buf,
+                                   endpos-startpos);
+        Assert(len < bufsiz);
+        return PointerGetDatum(cstring_to_text_with_len(buf, len));
     }
     else
     {
-        /* no more matches, return rest of string */
-        return DirectFunctionCall2(text_substr_no_len,
+        endpos = splitctx->match_locs[splitctx->next_match * 2];
+        if (endpos < startpos)
+            elog(ERROR, "invalid match starting position");
+        return DirectFunctionCall3(text_substr,
                                    PointerGetDatum(splitctx->orig_str),
-                                   Int32GetDatum(startpos + 1));
+                                   Int32GetDatum(startpos + 1),
+                                   Int32GetDatum(endpos - startpos));
     }
 }


Re: Fix quadratic performance of regexp match/split functions

From
Kaiting Chen
Date:
Applied cleanly for me. Here are my performance test results:

  count
---------
 3000000
(1 row)

Time: 3167.836 ms (00:03.168)
  count
----------
 10100000
(1 row)

Time: 6074.369 ms (00:06.074)
 count
-------
 10001
(1 row)

Time: 8.159 ms

The performance improves substantially in case 2 as advertised:

  count
---------
 3000000
(1 row)

Time: 3387.612 ms (00:03.388)
  count
----------
 10100000
(1 row)

Time: 43794.224 ms (00:43.794)
 count
-------
 10001
(1 row)

Time: 1436.587 ms (00:01.437)

I'll do some more testing to determine how this behaves in the presence of multibyte characters in UTF-8.

Re: Fix quadratic performance of regexp match/split functions

From
Andrew Gierth
Date:
>>>>> "Kaiting" == Kaiting Chen <ktchen14@gmail.com> writes:

 Kaiting> I'll do some more testing to determine how this behaves in the
 Kaiting> presence of multibyte characters in UTF-8.

Excellent, thanks!

-- 
Andrew (irc:RhodiumToad)