regexp_replace - Mailing list pgsql-patches

From a_ogawa00
Subject regexp_replace
Date
Msg-id PIEMIKOOMKNIJLLLBCBBMEPLCAAA.a_ogawa00@yahoo.co.jp
Whole thread Raw
Responses Re: regexp_replace
Re: regexp_replace
Re: regexp_replace
List pgsql-patches
This patch adds a new function, regexp_replace.
regexp_replace extends the existing replace function so that the text
to be replaced is located with a regular expression. Back references
can also be used within the replacement string.
(This patch is against PostgreSQL 7.4.3.)

Function: regexp_replace(str, pattern, replace_str)
Return Type: text
Description: Replaces every substring of str that matches pattern.
             pattern is a regular expression pattern.
             replace_str is the replacement string; it can contain
             '\1' - '\9' and '\&'.
             '\1' - '\9' are back references to the n'th subexpression.
             '\&' is the whole matched string.

(example1)
select regexp_replace('ABC-DEF', '(\\w+)-(\\w+)', '\\2-\\1')
result: DEF-ABC

(example2)
update tab1 set col1 = regexp_replace(col1, '[A-Z]', '');

---
Atsushi Ogawa
a_ogawa@hi-ho.ne.jp

--- cut here ---

*** ./src/backend/regex/regexec.c.orig    Tue Jul 20 08:45:39 2004
--- ./src/backend/regex/regexec.c    Tue Jul 20 08:49:36 2004
***************
*** 110,115 ****
--- 110,116 ----
      regmatch_t *pmatch;
      rm_detail_t *details;
      chr           *start;            /* start of string */
+     chr           *search_start;    /* search start of string */
      chr           *stop;            /* just past end of string */
      int            err;            /* error code if any (0 none) */
      regoff_t   *mem;            /* memory vector for backtracking */
***************
*** 168,173 ****
--- 169,175 ----
  pg_regexec(regex_t *re,
             const chr *string,
             size_t len,
+            size_t search_start,
             rm_detail_t *details,
             size_t nmatch,
             regmatch_t pmatch[],
***************
*** 219,224 ****
--- 221,227 ----
          v->pmatch = pmatch;
      v->details = details;
      v->start = (chr *) string;
+     v->search_start = (chr *) string + search_start;
      v->stop = (chr *) string + len;
      v->err = 0;
      if (backref)
***************
*** 288,294 ****
      NOERR();
      MDEBUG(("\nsearch at %ld\n", LOFF(v->start)));
      cold = NULL;
!     close = shortest(v, s, v->start, v->start, v->stop, &cold, (int *)
NULL);
      freedfa(s);
      NOERR();
      if (v->g->cflags & REG_EXPECT)
--- 291,298 ----
      NOERR();
      MDEBUG(("\nsearch at %ld\n", LOFF(v->start)));
      cold = NULL;
!     close = shortest(v, s, v->search_start, v->search_start, v->stop,
!                      &cold, (int *) NULL);
      freedfa(s);
      NOERR();
      if (v->g->cflags & REG_EXPECT)
***************
*** 415,421 ****

      assert(d != NULL && s != NULL);
      cold = NULL;
!     close = v->start;
      do
      {
          MDEBUG(("\ncsearch at %ld\n", LOFF(close)));
--- 419,425 ----

      assert(d != NULL && s != NULL);
      cold = NULL;
!     close = v->search_start;
      do
      {
          MDEBUG(("\ncsearch at %ld\n", LOFF(close)));
*** ./src/backend/utils/adt/regexp.c.orig    Tue Jul 20 08:50:08 2004
--- ./src/backend/utils/adt/regexp.c    Tue Jul 20 09:00:05 2004
***************
*** 80,116 ****


  /*
!  * RE_compile_and_execute - compile and execute a RE, caching if possible
   *
!  * Returns TRUE on match, FALSE on no match
   *
!  *    text_re --- the pattern, expressed as an *untoasted* TEXT object
!  *    dat --- the data to match against (need not be null-terminated)
!  *    dat_len --- the length of the data string
!  *    cflags --- compile options for the pattern
!  *    nmatch, pmatch    --- optional return area for match details
   *
!  * Both pattern and data are given in the database encoding.  We
internally
!  * convert to array of pg_wchar which is what Spencer's regex package
wants.
   */
! static bool
! RE_compile_and_execute(text *text_re, unsigned char *dat, int dat_len,
!                        int cflags, int nmatch, regmatch_t *pmatch)
  {
      int            text_re_len = VARSIZE(text_re);
-     pg_wchar   *data;
-     size_t        data_len;
      pg_wchar   *pattern;
      size_t        pattern_len;
      int            i;
      int            regcomp_result;
-     int            regexec_result;
      cached_re_str re_temp;

-     /* Convert data string to wide characters */
-     data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar));
-     data_len = pg_mb2wchar_with_len(dat, data, dat_len);
-
      /*
       * Look for a match among previously compiled REs.    Since the data
       * structure is self-organizing with most-used entries at the front,
--- 80,105 ----


  /*
!  * RE_compile_and_cache - compile a RE, caching if possible
   *
!  * Returns regex_t
   *
!  *  text_re --- the pattern, expressed as an *untoasted* TEXT object
!  *  cflags --- compile options for the pattern
   *
!  * Pattern is given in the database encoding.  We internally convert to
!  * array of pg_wchar which is what Spencer's regex package wants.
   */
! static regex_t
! RE_compile_and_cache(text *text_re, int cflags)
  {
      int            text_re_len = VARSIZE(text_re);
      pg_wchar   *pattern;
      size_t        pattern_len;
      int            i;
      int            regcomp_result;
      cached_re_str re_temp;

      /*
       * Look for a match among previously compiled REs.    Since the data
       * structure is self-organizing with most-used entries at the front,
***************
*** 132,149 ****
                  re_array[0] = re_temp;
              }

!             /* Perform RE match and return result */
!             regexec_result = pg_regexec(&re_array[0].cre_re,
!                                         data,
!                                         data_len,
!                                         NULL,    /* no details */
!                                         nmatch,
!                                         pmatch,
!                                         0);
!
!             pfree(data);
!
!             return (regexec_result == 0);
          }
      }

--- 121,127 ----
                  re_array[0] = re_temp;
              }

!             return re_array[0].cre_re;
          }
      }

***************
*** 210,219 ****
--- 188,231 ----
      re_array[0] = re_temp;
      num_res++;

+     return re_array[0].cre_re;
+ }
+
+ /*
+  * RE_compile_and_execute - compile and execute a RE, caching if possible
+  *
+  * Returns TRUE on match, FALSE on no match
+  *
+  *    text_re --- the pattern, expressed as an *untoasted* TEXT object
+  *    dat --- the data to match against (need not be null-terminated)
+  *    dat_len --- the length of the data string
+  *    cflags --- compile options for the pattern
+  *    nmatch, pmatch    --- optional return area for match details
+  *
+  * Both pattern and data are given in the database encoding.  We
internally
+  * convert to array of pg_wchar which is what Spencer's regex package
wants.
+  */
+ static bool
+ RE_compile_and_execute(text *text_re, unsigned char *dat, int dat_len,
+                        int cflags, int nmatch, regmatch_t *pmatch)
+ {
+     pg_wchar   *data;
+     size_t        data_len;
+     int            regexec_result;
+     regex_t        re;
+
+     /* Convert data string to wide characters */
+     data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar));
+     data_len = pg_mb2wchar_with_len(dat, data, dat_len);
+
+     /* Compile RE */
+     re = RE_compile_and_cache(text_re, cflags);
+
      /* Perform RE match and return result */
      regexec_result = pg_regexec(&re_array[0].cre_re,
                                  data,
                                  data_len,
+                                 0,
                                  NULL,    /* no details */
                                  nmatch,
                                  pmatch,
***************
*** 415,420 ****
--- 427,452 ----
      }

      PG_RETURN_NULL();
+ }
+
+ /*
+  * textregexreplace()
+  *      Return a replace string matched by a regular expression.
+  */
+ Datum
+ textregexreplace(PG_FUNCTION_ARGS)
+ {
+     text       *s = PG_GETARG_TEXT_P(0);
+     text       *p = PG_GETARG_TEXT_P(1);
+     text       *r = PG_GETARG_TEXT_P(2);
+     regex_t        re;
+
+     re = RE_compile_and_cache(p, regex_flavor);
+
+     return (DirectFunctionCall3(replace_text_regexp,
+                                 PointerGetDatum(s),
+                                 PointerGetDatum(&re),
+                                 PointerGetDatum(r)));
  }

  /* similar_escape()
*** ./src/backend/utils/adt/varlena.c.orig    Tue Jul 20 09:00:17 2004
--- ./src/backend/utils/adt/varlena.c    Tue Jul 20 10:23:32 2004
***************
*** 28,33 ****
--- 28,34 ----
  #include "utils/builtins.h"
  #include "utils/lsyscache.h"
  #include "utils/pg_locale.h"
+ #include "regex/regex.h"


  typedef struct varlena unknown;
***************
*** 1971,1976 ****
--- 1972,2122 ----
      ret_text = PG_STR_GET_TEXT(str->data);
      pfree(str->data);
      pfree(str);
+
+     PG_RETURN_TEXT_P(ret_text);
+ }
+
+ /*
+  * have_escape_in_regexp_replace_str
+  * check replace string have escape char
+  */
+ static bool
+ have_escape_in_regexp_replace_str(const char *replace_str)
+ {
+     return (strchr(replace_str, '\\') != NULL);
+ }
+
+ #define REGEXP_REPLACE_BACKREF_CNT        10
+ /*
+  * appendStringInfoRegexpSubstr
+  * append string for regexp back references.
+  */
+ static void
+ appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
+     regmatch_t *pmatch, text *buf_text, int search_start)
+ {
+     const char *pstart = PG_TEXT_GET_STR(replace_text);
+     const char *p = pstart;
+     const char *pnext;
+
+     text       *add_text;
+     int            so;
+     int            eo;
+
+     for(;;) {
+         pnext = strchr(p, '\\');
+         if(pnext == NULL) break;
+
+         add_text = text_substring(PointerGetDatum(replace_text),
+                                   p - pstart + 1, pnext - p, false);
+         appendStringInfoString(str, PG_TEXT_GET_STR(add_text));
+         pfree(add_text);
+
+         p = pnext + 1;
+         so = eo = -1;
+
+         if(*p >= '1' && *p <= '9') {
+             int        idx = *p - '0';
+             so = pmatch[idx].rm_so - search_start;
+             eo = pmatch[idx].rm_eo - search_start;
+             p++;
+         } else {
+             switch(*p) {
+             case '&':
+                 so = pmatch[0].rm_so - search_start;
+                 eo = pmatch[0].rm_eo - search_start;
+                 p++;
+                 break;
+             }
+         }
+
+         if(so != -1 && eo != -1) {
+             add_text = text_substring(PointerGetDatum(buf_text),
+                                       so + 1, (eo - so), false);
+             appendStringInfoString(str, PG_TEXT_GET_STR(add_text));
+             pfree(add_text);
+         }
+     }
+
+     add_text = text_substring(PointerGetDatum(replace_text),
+                               p - pstart + 1, -1, true);
+     appendStringInfoString(str, PG_TEXT_GET_STR(add_text));
+     pfree(add_text);
+ }
+
+ /*
+  * replace_text_regexp
+  * replace text using regexp
+ */
+ Datum
+ replace_text_regexp(PG_FUNCTION_ARGS)
+ {
+     text       *left_text;
+     text       *right_text;
+     text       *buf_text;
+     text       *ret_text;
+     text       *src_text = PG_GETARG_TEXT_P(0);
+     char       *src_text_str = PG_TEXT_GET_STR(src_text);
+     int            src_text_len = TEXTLEN(src_text);
+     regex_t       *re = (regex_t *)PG_GETARG_POINTER(1);
+     text       *replace_text = PG_GETARG_TEXT_P(2);
+     char       *replace_str = PG_TEXT_GET_STR(replace_text);
+     StringInfo    str = makeStringInfo();
+     int            regexec_result;
+     regmatch_t    pmatch[REGEXP_REPLACE_BACKREF_CNT];
+     pg_wchar   *data;
+     size_t        data_len;
+     int            search_start;
+     bool        have_escape;
+
+     buf_text = TEXTDUP(src_text);
+
+     /* Convert data string to wide characters */
+     data = (pg_wchar *) palloc((src_text_len + 1) * sizeof(pg_wchar));
+     data_len = pg_mb2wchar_with_len(src_text_str, data,
strlen(src_text_str));
+
+     have_escape = have_escape_in_regexp_replace_str(replace_str);
+
+     for(search_start = 0; search_start <= data_len;) {
+         regexec_result = pg_regexec(re,
+                                     data,
+                                     data_len,
+                                     search_start,
+                                     NULL,   /* no details */
+                                     REGEXP_REPLACE_BACKREF_CNT,
+                                     pmatch,
+                                     0);
+         if(regexec_result != 0) break;
+
+         left_text = text_substring(PointerGetDatum(buf_text),
+                                    1, pmatch[0].rm_so - search_start, false);
+         right_text = text_substring(PointerGetDatum(buf_text),
+                                    pmatch[0].rm_eo - search_start + 1,
+                                    -1, true);
+
+         appendStringInfoString(str, PG_TEXT_GET_STR(left_text));
+         if(have_escape) {
+             appendStringInfoRegexpSubstr(str, replace_text, pmatch,
+                                          buf_text, search_start);
+         } else {
+             appendStringInfoString(str, replace_str);
+         }
+
+         pfree(buf_text);
+         pfree(left_text);
+         buf_text = right_text;
+
+         search_start = pmatch[0].rm_eo;
+         if(pmatch[0].rm_so == pmatch[0].rm_eo) search_start++;
+     }
+
+     appendStringInfoString(str, PG_TEXT_GET_STR(buf_text));
+     pfree(buf_text);
+
+     ret_text = PG_STR_GET_TEXT(str->data);
+     pfree(str->data);
+     pfree(str);
+     pfree(data);

      PG_RETURN_TEXT_P(ret_text);
  }
*** ./src/include/catalog/pg_proc.h.orig    Tue Jul 20 09:24:11 2004
--- ./src/include/catalog/pg_proc.h    Tue Jul 20 09:26:11 2004
***************
*** 2186,2191 ****
--- 2186,2193 ----
  DESCR("return portion of string");
  DATA(insert OID =  2087 ( replace       PGNSP PGUID 12 f f t f i 3 25 "25 25
25"  replace_text - _null_ ));
  DESCR("replace all occurrences of old_substr with new_substr in string");
+ DATA(insert OID =  2167 ( regexp_replace    PGNSP PGUID 12 f f t f i 3 25
"25 25 25"  textregexreplace - _null_ ));
+ DESCR("replace text using regexp");
  DATA(insert OID =  2088 ( split_part   PGNSP PGUID 12 f f t f i 3 25 "25
25 23"  split_text - _null_ ));
  DESCR("split string by field_sep and return field_num");
  DATA(insert OID =  2089 ( to_hex       PGNSP PGUID 12 f f t f i 1 25 "23"
to_hex32 - _null_ ));
*** ./src/include/regex/regex.h.orig    Tue Jul 20 08:51:06 2004
--- ./src/include/regex/regex.h    Tue Jul 20 08:51:16 2004
***************
*** 163,169 ****
   * the prototypes for exported functions
   */
  extern int    pg_regcomp(regex_t *, const pg_wchar *, size_t, int);
! extern int    pg_regexec(regex_t *, const pg_wchar *, size_t, rm_detail_t *,
size_t, regmatch_t[], int);
  extern void pg_regfree(regex_t *);
  extern size_t pg_regerror(int, const regex_t *, char *, size_t);

--- 163,169 ----
   * the prototypes for exported functions
   */
  extern int    pg_regcomp(regex_t *, const pg_wchar *, size_t, int);
! extern int    pg_regexec(regex_t *, const pg_wchar *, size_t, size_t,
rm_detail_t *, size_t, regmatch_t[], int);
  extern void pg_regfree(regex_t *);
  extern size_t pg_regerror(int, const regex_t *, char *, size_t);

*** ./src/include/utils/builtins.h.orig    Tue Jul 20 09:11:19 2004
--- ./src/include/utils/builtins.h    Tue Jul 20 09:11:46 2004
***************
*** 408,413 ****
--- 408,414 ----
  extern Datum texticregexeq(PG_FUNCTION_ARGS);
  extern Datum texticregexne(PG_FUNCTION_ARGS);
  extern Datum textregexsubstr(PG_FUNCTION_ARGS);
+ extern Datum textregexreplace(PG_FUNCTION_ARGS);
  extern Datum similar_escape(PG_FUNCTION_ARGS);
  extern const char *assign_regex_flavor(const char *value,
                      bool doit, bool interactive);
***************
*** 537,542 ****
--- 538,544 ----
  extern bool SplitIdentifierString(char *rawstring, char separator,
                        List **namelist);
  extern Datum replace_text(PG_FUNCTION_ARGS);
+ extern Datum replace_text_regexp(PG_FUNCTION_ARGS);
  extern Datum split_text(PG_FUNCTION_ARGS);
  extern Datum text_to_array(PG_FUNCTION_ARGS);
  extern Datum array_to_text(PG_FUNCTION_ARGS);

__________________________________________________
Do You Yahoo!?
http://bb.yahoo.co.jp/


pgsql-patches by date:

Previous
From: Peter Eisentraut
Date:
Subject: Re: pgxs: build infrastructure for extensions v4
Next
From: Alvaro Herrera
Date:
Subject: [subxacts] Savepoint syntax