Re: regexp_replace - Mailing list pgsql-patches

From Bruce Momjian
Subject Re: regexp_replace
Date
Msg-id 200408070104.i7714Yp00734@candle.pha.pa.us
Whole thread Raw
In response to regexp_replace  ("a_ogawa00" <a_ogawa00@yahoo.co.jp>)
List pgsql-patches
This has been saved for the 8.1 release:

    http:/momjian.postgresql.org/cgi-bin/pgpatches2

---------------------------------------------------------------------------

a_ogawa00 wrote:
>
> This patch provides a new function regexp_replace.
> regexp_replace extends a replace function and enables text search
> by the regular expression. And, a back reference can be used within
> a replace string.
> (This patch for PostgreSQL 7.4.3)
>
> Function: regexp_replace(str, pattern, replace_str)
> Retuen Type: text
> Description: Replace all matched string in str.
>              pattern is regular expression pattern.
>              replace_str is replace string that can use '\1' - '\9', and
> '\&'.
>              '\1' - '\9' is back reference to the n'th subexpression.
>              '\&' is matched string.
>
> (example1)
> select regexp_replace('ABC-DEF', '(\\w+)-(\\w+)', '\\2-\\1')
> result: DEF-ABC
>
> (example2)
> update tab1 set col1 = regexp_replace(col1, '[A-Z]', '');
>
> ---
> Atsushi Ogawa
> a_ogawa@hi-ho.ne.jp
>
> --- cut here ---
>
> *** ./src/backend/regex/regexec.c.orig    Tue Jul 20 08:45:39 2004
> --- ./src/backend/regex/regexec.c    Tue Jul 20 08:49:36 2004
> ***************
> *** 110,115 ****
> --- 110,116 ----
>       regmatch_t *pmatch;
>       rm_detail_t *details;
>       chr           *start;            /* start of string */
> +     chr           *search_start;    /* search start of string */
>       chr           *stop;            /* just past end of string */
>       int            err;            /* error code if any (0 none) */
>       regoff_t   *mem;            /* memory vector for backtracking */
> ***************
> *** 168,173 ****
> --- 169,175 ----
>   pg_regexec(regex_t *re,
>              const chr *string,
>              size_t len,
> +            size_t search_start,
>              rm_detail_t *details,
>              size_t nmatch,
>              regmatch_t pmatch[],
> ***************
> *** 219,224 ****
> --- 221,227 ----
>           v->pmatch = pmatch;
>       v->details = details;
>       v->start = (chr *) string;
> +     v->search_start = (chr *) string + search_start;
>       v->stop = (chr *) string + len;
>       v->err = 0;
>       if (backref)
> ***************
> *** 288,294 ****
>       NOERR();
>       MDEBUG(("\nsearch at %ld\n", LOFF(v->start)));
>       cold = NULL;
> !     close = shortest(v, s, v->start, v->start, v->stop, &cold, (int *)
> NULL);
>       freedfa(s);
>       NOERR();
>       if (v->g->cflags & REG_EXPECT)
> --- 291,298 ----
>       NOERR();
>       MDEBUG(("\nsearch at %ld\n", LOFF(v->start)));
>       cold = NULL;
> !     close = shortest(v, s, v->search_start, v->search_start, v->stop,
> !                      &cold, (int *) NULL);
>       freedfa(s);
>       NOERR();
>       if (v->g->cflags & REG_EXPECT)
> ***************
> *** 415,421 ****
>
>       assert(d != NULL && s != NULL);
>       cold = NULL;
> !     close = v->start;
>       do
>       {
>           MDEBUG(("\ncsearch at %ld\n", LOFF(close)));
> --- 419,425 ----
>
>       assert(d != NULL && s != NULL);
>       cold = NULL;
> !     close = v->search_start;
>       do
>       {
>           MDEBUG(("\ncsearch at %ld\n", LOFF(close)));
> *** ./src/backend/utils/adt/regexp.c.orig    Tue Jul 20 08:50:08 2004
> --- ./src/backend/utils/adt/regexp.c    Tue Jul 20 09:00:05 2004
> ***************
> *** 80,116 ****
>
>
>   /*
> !  * RE_compile_and_execute - compile and execute a RE, caching if possible
>    *
> !  * Returns TRUE on match, FALSE on no match
>    *
> !  *    text_re --- the pattern, expressed as an *untoasted* TEXT object
> !  *    dat --- the data to match against (need not be null-terminated)
> !  *    dat_len --- the length of the data string
> !  *    cflags --- compile options for the pattern
> !  *    nmatch, pmatch    --- optional return area for match details
>    *
> !  * Both pattern and data are given in the database encoding.  We
> internally
> !  * convert to array of pg_wchar which is what Spencer's regex package
> wants.
>    */
> ! static bool
> ! RE_compile_and_execute(text *text_re, unsigned char *dat, int dat_len,
> !                        int cflags, int nmatch, regmatch_t *pmatch)
>   {
>       int            text_re_len = VARSIZE(text_re);
> -     pg_wchar   *data;
> -     size_t        data_len;
>       pg_wchar   *pattern;
>       size_t        pattern_len;
>       int            i;
>       int            regcomp_result;
> -     int            regexec_result;
>       cached_re_str re_temp;
>
> -     /* Convert data string to wide characters */
> -     data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar));
> -     data_len = pg_mb2wchar_with_len(dat, data, dat_len);
> -
>       /*
>        * Look for a match among previously compiled REs.    Since the data
>        * structure is self-organizing with most-used entries at the front,
> --- 80,105 ----
>
>
>   /*
> !  * RE_compile_and_cache - compile a RE, caching if possible
>    *
> !  * Returns regex_t
>    *
> !  *  text_re --- the pattern, expressed as an *untoasted* TEXT object
> !  *  cflags --- compile options for the pattern
>    *
> !  * Pattern is given in the database encoding.  We internally convert to
> !  * array of pg_wchar which is what Spencer's regex package wants.
>    */
> ! static regex_t
> ! RE_compile_and_cache(text *text_re, int cflags)
>   {
>       int            text_re_len = VARSIZE(text_re);
>       pg_wchar   *pattern;
>       size_t        pattern_len;
>       int            i;
>       int            regcomp_result;
>       cached_re_str re_temp;
>
>       /*
>        * Look for a match among previously compiled REs.    Since the data
>        * structure is self-organizing with most-used entries at the front,
> ***************
> *** 132,149 ****
>                   re_array[0] = re_temp;
>               }
>
> !             /* Perform RE match and return result */
> !             regexec_result = pg_regexec(&re_array[0].cre_re,
> !                                         data,
> !                                         data_len,
> !                                         NULL,    /* no details */
> !                                         nmatch,
> !                                         pmatch,
> !                                         0);
> !
> !             pfree(data);
> !
> !             return (regexec_result == 0);
>           }
>       }
>
> --- 121,127 ----
>                   re_array[0] = re_temp;
>               }
>
> !             return re_array[0].cre_re;
>           }
>       }
>
> ***************
> *** 210,219 ****
> --- 188,231 ----
>       re_array[0] = re_temp;
>       num_res++;
>
> +     return re_array[0].cre_re;
> + }
> +
> + /*
> +  * RE_compile_and_execute - compile and execute a RE, caching if possible
> +  *
> +  * Returns TRUE on match, FALSE on no match
> +  *
> +  *    text_re --- the pattern, expressed as an *untoasted* TEXT object
> +  *    dat --- the data to match against (need not be null-terminated)
> +  *    dat_len --- the length of the data string
> +  *    cflags --- compile options for the pattern
> +  *    nmatch, pmatch    --- optional return area for match details
> +  *
> +  * Both pattern and data are given in the database encoding.  We
> internally
> +  * convert to array of pg_wchar which is what Spencer's regex package
> wants.
> +  */
> + static bool
> + RE_compile_and_execute(text *text_re, unsigned char *dat, int dat_len,
> +                        int cflags, int nmatch, regmatch_t *pmatch)
> + {
> +     pg_wchar   *data;
> +     size_t        data_len;
> +     int            regexec_result;
> +     regex_t        re;
> +
> +     /* Convert data string to wide characters */
> +     data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar));
> +     data_len = pg_mb2wchar_with_len(dat, data, dat_len);
> +
> +     /* Compile RE */
> +     re = RE_compile_and_cache(text_re, cflags);
> +
>       /* Perform RE match and return result */
>       regexec_result = pg_regexec(&re_array[0].cre_re,
>                                   data,
>                                   data_len,
> +                                 0,
>                                   NULL,    /* no details */
>                                   nmatch,
>                                   pmatch,
> ***************
> *** 415,420 ****
> --- 427,452 ----
>       }
>
>       PG_RETURN_NULL();
> + }
> +
> + /*
> +  * textregexreplace()
> +  *      Return a replace string matched by a regular expression.
> +  */
> + Datum
> + textregexreplace(PG_FUNCTION_ARGS)
> + {
> +     text       *s = PG_GETARG_TEXT_P(0);
> +     text       *p = PG_GETARG_TEXT_P(1);
> +     text       *r = PG_GETARG_TEXT_P(2);
> +     regex_t        re;
> +
> +     re = RE_compile_and_cache(p, regex_flavor);
> +
> +     return (DirectFunctionCall3(replace_text_regexp,
> +                                 PointerGetDatum(s),
> +                                 PointerGetDatum(&re),
> +                                 PointerGetDatum(r)));
>   }
>
>   /* similar_escape()
> *** ./src/backend/utils/adt/varlena.c.orig    Tue Jul 20 09:00:17 2004
> --- ./src/backend/utils/adt/varlena.c    Tue Jul 20 10:23:32 2004
> ***************
> *** 28,33 ****
> --- 28,34 ----
>   #include "utils/builtins.h"
>   #include "utils/lsyscache.h"
>   #include "utils/pg_locale.h"
> + #include "regex/regex.h"
>
>
>   typedef struct varlena unknown;
> ***************
> *** 1971,1976 ****
> --- 1972,2122 ----
>       ret_text = PG_STR_GET_TEXT(str->data);
>       pfree(str->data);
>       pfree(str);
> +
> +     PG_RETURN_TEXT_P(ret_text);
> + }
> +
> + /*
> +  * have_escape_in_regexp_replace_str
> +  * check replace string have escape char
> +  */
> + static bool
> + have_escape_in_regexp_replace_str(const char *replace_str)
> + {
> +     return (strchr(replace_str, '\\') != NULL);
> + }
> +
> + #define REGEXP_REPLACE_BACKREF_CNT        10
> + /*
> +  * appendStringInfoRegexpSubstr
> +  * append string for regexp back references.
> +  */
> + static void
> + appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
> +     regmatch_t *pmatch, text *buf_text, int search_start)
> + {
> +     const char *pstart = PG_TEXT_GET_STR(replace_text);
> +     const char *p = pstart;
> +     const char *pnext;
> +
> +     text       *add_text;
> +     int            so;
> +     int            eo;
> +
> +     for(;;) {
> +         pnext = strchr(p, '\\');
> +         if(pnext == NULL) break;
> +
> +         add_text = text_substring(PointerGetDatum(replace_text),
> +                                   p - pstart + 1, pnext - p, false);
> +         appendStringInfoString(str, PG_TEXT_GET_STR(add_text));
> +         pfree(add_text);
> +
> +         p = pnext + 1;
> +         so = eo = -1;
> +
> +         if(*p >= '1' && *p <= '9') {
> +             int        idx = *p - '0';
> +             so = pmatch[idx].rm_so - search_start;
> +             eo = pmatch[idx].rm_eo - search_start;
> +             p++;
> +         } else {
> +             switch(*p) {
> +             case '&':
> +                 so = pmatch[0].rm_so - search_start;
> +                 eo = pmatch[0].rm_eo - search_start;
> +                 p++;
> +                 break;
> +             }
> +         }
> +
> +         if(so != -1 && eo != -1) {
> +             add_text = text_substring(PointerGetDatum(buf_text),
> +                                       so + 1, (eo - so), false);
> +             appendStringInfoString(str, PG_TEXT_GET_STR(add_text));
> +             pfree(add_text);
> +         }
> +     }
> +
> +     add_text = text_substring(PointerGetDatum(replace_text),
> +                               p - pstart + 1, -1, true);
> +     appendStringInfoString(str, PG_TEXT_GET_STR(add_text));
> +     pfree(add_text);
> + }
> +
> + /*
> +  * replace_text_regexp
> +  * replace text using regexp
> + */
> + Datum
> + replace_text_regexp(PG_FUNCTION_ARGS)
> + {
> +     text       *left_text;
> +     text       *right_text;
> +     text       *buf_text;
> +     text       *ret_text;
> +     text       *src_text = PG_GETARG_TEXT_P(0);
> +     char       *src_text_str = PG_TEXT_GET_STR(src_text);
> +     int            src_text_len = TEXTLEN(src_text);
> +     regex_t       *re = (regex_t *)PG_GETARG_POINTER(1);
> +     text       *replace_text = PG_GETARG_TEXT_P(2);
> +     char       *replace_str = PG_TEXT_GET_STR(replace_text);
> +     StringInfo    str = makeStringInfo();
> +     int            regexec_result;
> +     regmatch_t    pmatch[REGEXP_REPLACE_BACKREF_CNT];
> +     pg_wchar   *data;
> +     size_t        data_len;
> +     int            search_start;
> +     bool        have_escape;
> +
> +     buf_text = TEXTDUP(src_text);
> +
> +     /* Convert data string to wide characters */
> +     data = (pg_wchar *) palloc((src_text_len + 1) * sizeof(pg_wchar));
> +     data_len = pg_mb2wchar_with_len(src_text_str, data,
> strlen(src_text_str));
> +
> +     have_escape = have_escape_in_regexp_replace_str(replace_str);
> +
> +     for(search_start = 0; search_start <= data_len;) {
> +         regexec_result = pg_regexec(re,
> +                                     data,
> +                                     data_len,
> +                                     search_start,
> +                                     NULL,   /* no details */
> +                                     REGEXP_REPLACE_BACKREF_CNT,
> +                                     pmatch,
> +                                     0);
> +         if(regexec_result != 0) break;
> +
> +         left_text = text_substring(PointerGetDatum(buf_text),
> +                                    1, pmatch[0].rm_so - search_start, false);
> +         right_text = text_substring(PointerGetDatum(buf_text),
> +                                    pmatch[0].rm_eo - search_start + 1,
> +                                    -1, true);
> +
> +         appendStringInfoString(str, PG_TEXT_GET_STR(left_text));
> +         if(have_escape) {
> +             appendStringInfoRegexpSubstr(str, replace_text, pmatch,
> +                                          buf_text, search_start);
> +         } else {
> +             appendStringInfoString(str, replace_str);
> +         }
> +
> +         pfree(buf_text);
> +         pfree(left_text);
> +         buf_text = right_text;
> +
> +         search_start = pmatch[0].rm_eo;
> +         if(pmatch[0].rm_so == pmatch[0].rm_eo) search_start++;
> +     }
> +
> +     appendStringInfoString(str, PG_TEXT_GET_STR(buf_text));
> +     pfree(buf_text);
> +
> +     ret_text = PG_STR_GET_TEXT(str->data);
> +     pfree(str->data);
> +     pfree(str);
> +     pfree(data);
>
>       PG_RETURN_TEXT_P(ret_text);
>   }
> *** ./src/include/catalog/pg_proc.h.orig    Tue Jul 20 09:24:11 2004
> --- ./src/include/catalog/pg_proc.h    Tue Jul 20 09:26:11 2004
> ***************
> *** 2186,2191 ****
> --- 2186,2193 ----
>   DESCR("return portion of string");
>   DATA(insert OID =  2087 ( replace       PGNSP PGUID 12 f f t f i 3 25 "25 25
> 25"  replace_text - _null_ ));
>   DESCR("replace all occurrences of old_substr with new_substr in string");
> + DATA(insert OID =  2167 ( regexp_replace    PGNSP PGUID 12 f f t f i 3 25
> "25 25 25"  textregexreplace - _null_ ));
> + DESCR("replace text using regexp");
>   DATA(insert OID =  2088 ( split_part   PGNSP PGUID 12 f f t f i 3 25 "25
> 25 23"  split_text - _null_ ));
>   DESCR("split string by field_sep and return field_num");
>   DATA(insert OID =  2089 ( to_hex       PGNSP PGUID 12 f f t f i 1 25 "23"
> to_hex32 - _null_ ));
> *** ./src/include/regex/regex.h.orig    Tue Jul 20 08:51:06 2004
> --- ./src/include/regex/regex.h    Tue Jul 20 08:51:16 2004
> ***************
> *** 163,169 ****
>    * the prototypes for exported functions
>    */
>   extern int    pg_regcomp(regex_t *, const pg_wchar *, size_t, int);
> ! extern int    pg_regexec(regex_t *, const pg_wchar *, size_t, rm_detail_t *,
> size_t, regmatch_t[], int);
>   extern void pg_regfree(regex_t *);
>   extern size_t pg_regerror(int, const regex_t *, char *, size_t);
>
> --- 163,169 ----
>    * the prototypes for exported functions
>    */
>   extern int    pg_regcomp(regex_t *, const pg_wchar *, size_t, int);
> ! extern int    pg_regexec(regex_t *, const pg_wchar *, size_t, size_t,
> rm_detail_t *, size_t, regmatch_t[], int);
>   extern void pg_regfree(regex_t *);
>   extern size_t pg_regerror(int, const regex_t *, char *, size_t);
>
> *** ./src/include/utils/builtins.h.orig    Tue Jul 20 09:11:19 2004
> --- ./src/include/utils/builtins.h    Tue Jul 20 09:11:46 2004
> ***************
> *** 408,413 ****
> --- 408,414 ----
>   extern Datum texticregexeq(PG_FUNCTION_ARGS);
>   extern Datum texticregexne(PG_FUNCTION_ARGS);
>   extern Datum textregexsubstr(PG_FUNCTION_ARGS);
> + extern Datum textregexreplace(PG_FUNCTION_ARGS);
>   extern Datum similar_escape(PG_FUNCTION_ARGS);
>   extern const char *assign_regex_flavor(const char *value,
>                       bool doit, bool interactive);
> ***************
> *** 537,542 ****
> --- 538,544 ----
>   extern bool SplitIdentifierString(char *rawstring, char separator,
>                         List **namelist);
>   extern Datum replace_text(PG_FUNCTION_ARGS);
> + extern Datum replace_text_regexp(PG_FUNCTION_ARGS);
>   extern Datum split_text(PG_FUNCTION_ARGS);
>   extern Datum text_to_array(PG_FUNCTION_ARGS);
>   extern Datum array_to_text(PG_FUNCTION_ARGS);
>
> __________________________________________________
> Do You Yahoo!?
> http://bb.yahoo.co.jp/
>
>
> ---------------------------(end of broadcast)---------------------------
> TIP 3: if posting/reading through Usenet, please send an appropriate
>       subscribe-nomail command to majordomo@postgresql.org so that your
>       message can get through to the mailing list cleanly
>

--
  Bruce Momjian                        |  http://candle.pha.pa.us
  pgman@candle.pha.pa.us               |  (610) 359-1001
  +  If your life is a hard drive,     |  13 Roberts Road
  +  Christ can be your backup.        |  Newtown Square, Pennsylvania 19073

pgsql-patches by date:

Previous
From: Roger Leigh
Date:
Subject: psql groff patch (for 8.1)
Next
From: Bruce Momjian
Date:
Subject: Re: Autovacuum Integration Patch Take 5