Re: regexp_replace - Mailing list pgsql-patches
From | Bruce Momjian |
---|---|
Subject | Re: regexp_replace |
Date | |
Msg-id | 200408070104.i7714Yp00734@candle.pha.pa.us Whole thread Raw |
In response to | regexp_replace ("a_ogawa00" <a_ogawa00@yahoo.co.jp>) |
List | pgsql-patches |
This has been saved for the 8.1 release: http:/momjian.postgresql.org/cgi-bin/pgpatches2 --------------------------------------------------------------------------- a_ogawa00 wrote: > > This patch provides a new function regexp_replace. > regexp_replace extends a replace function and enables text search > by the regular expression. And, a back reference can be used within > a replace string. > (This patch for PostgreSQL 7.4.3) > > Function: regexp_replace(str, pattern, replace_str) > Retuen Type: text > Description: Replace all matched string in str. > pattern is regular expression pattern. > replace_str is replace string that can use '\1' - '\9', and > '\&'. > '\1' - '\9' is back reference to the n'th subexpression. > '\&' is matched string. > > (example1) > select regexp_replace('ABC-DEF', '(\\w+)-(\\w+)', '\\2-\\1') > result: DEF-ABC > > (example2) > update tab1 set col1 = regexp_replace(col1, '[A-Z]', ''); > > --- > Atsushi Ogawa > a_ogawa@hi-ho.ne.jp > > --- cut here --- > > *** ./src/backend/regex/regexec.c.orig Tue Jul 20 08:45:39 2004 > --- ./src/backend/regex/regexec.c Tue Jul 20 08:49:36 2004 > *************** > *** 110,115 **** > --- 110,116 ---- > regmatch_t *pmatch; > rm_detail_t *details; > chr *start; /* start of string */ > + chr *search_start; /* search start of string */ > chr *stop; /* just past end of string */ > int err; /* error code if any (0 none) */ > regoff_t *mem; /* memory vector for backtracking */ > *************** > *** 168,173 **** > --- 169,175 ---- > pg_regexec(regex_t *re, > const chr *string, > size_t len, > + size_t search_start, > rm_detail_t *details, > size_t nmatch, > regmatch_t pmatch[], > *************** > *** 219,224 **** > --- 221,227 ---- > v->pmatch = pmatch; > v->details = details; > v->start = (chr *) string; > + v->search_start = (chr *) string + search_start; > v->stop = (chr *) string + len; > v->err = 0; > if (backref) > *************** > *** 288,294 **** > NOERR(); > MDEBUG(("\nsearch at %ld\n", LOFF(v->start))); > cold = NULL; > ! close = shortest(v, s, v->start, v->start, v->stop, &cold, (int *) > NULL); > freedfa(s); > NOERR(); > if (v->g->cflags & REG_EXPECT) > --- 291,298 ---- > NOERR(); > MDEBUG(("\nsearch at %ld\n", LOFF(v->start))); > cold = NULL; > ! close = shortest(v, s, v->search_start, v->search_start, v->stop, > ! &cold, (int *) NULL); > freedfa(s); > NOERR(); > if (v->g->cflags & REG_EXPECT) > *************** > *** 415,421 **** > > assert(d != NULL && s != NULL); > cold = NULL; > ! close = v->start; > do > { > MDEBUG(("\ncsearch at %ld\n", LOFF(close))); > --- 419,425 ---- > > assert(d != NULL && s != NULL); > cold = NULL; > ! close = v->search_start; > do > { > MDEBUG(("\ncsearch at %ld\n", LOFF(close))); > *** ./src/backend/utils/adt/regexp.c.orig Tue Jul 20 08:50:08 2004 > --- ./src/backend/utils/adt/regexp.c Tue Jul 20 09:00:05 2004 > *************** > *** 80,116 **** > > > /* > ! * RE_compile_and_execute - compile and execute a RE, caching if possible > * > ! * Returns TRUE on match, FALSE on no match > * > ! * text_re --- the pattern, expressed as an *untoasted* TEXT object > ! * dat --- the data to match against (need not be null-terminated) > ! * dat_len --- the length of the data string > ! * cflags --- compile options for the pattern > ! * nmatch, pmatch --- optional return area for match details > * > ! * Both pattern and data are given in the database encoding. We > internally > ! * convert to array of pg_wchar which is what Spencer's regex package > wants. > */ > ! static bool > ! RE_compile_and_execute(text *text_re, unsigned char *dat, int dat_len, > ! int cflags, int nmatch, regmatch_t *pmatch) > { > int text_re_len = VARSIZE(text_re); > - pg_wchar *data; > - size_t data_len; > pg_wchar *pattern; > size_t pattern_len; > int i; > int regcomp_result; > - int regexec_result; > cached_re_str re_temp; > > - /* Convert data string to wide characters */ > - data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar)); > - data_len = pg_mb2wchar_with_len(dat, data, dat_len); > - > /* > * Look for a match among previously compiled REs. Since the data > * structure is self-organizing with most-used entries at the front, > --- 80,105 ---- > > > /* > ! * RE_compile_and_cache - compile a RE, caching if possible > * > ! * Returns regex_t > * > ! * text_re --- the pattern, expressed as an *untoasted* TEXT object > ! * cflags --- compile options for the pattern > * > ! * Pattern is given in the database encoding. We internally convert to > ! * array of pg_wchar which is what Spencer's regex package wants. > */ > ! static regex_t > ! RE_compile_and_cache(text *text_re, int cflags) > { > int text_re_len = VARSIZE(text_re); > pg_wchar *pattern; > size_t pattern_len; > int i; > int regcomp_result; > cached_re_str re_temp; > > /* > * Look for a match among previously compiled REs. Since the data > * structure is self-organizing with most-used entries at the front, > *************** > *** 132,149 **** > re_array[0] = re_temp; > } > > ! /* Perform RE match and return result */ > ! regexec_result = pg_regexec(&re_array[0].cre_re, > ! data, > ! data_len, > ! NULL, /* no details */ > ! nmatch, > ! pmatch, > ! 0); > ! > ! pfree(data); > ! > ! return (regexec_result == 0); > } > } > > --- 121,127 ---- > re_array[0] = re_temp; > } > > ! return re_array[0].cre_re; > } > } > > *************** > *** 210,219 **** > --- 188,231 ---- > re_array[0] = re_temp; > num_res++; > > + return re_array[0].cre_re; > + } > + > + /* > + * RE_compile_and_execute - compile and execute a RE, caching if possible > + * > + * Returns TRUE on match, FALSE on no match > + * > + * text_re --- the pattern, expressed as an *untoasted* TEXT object > + * dat --- the data to match against (need not be null-terminated) > + * dat_len --- the length of the data string > + * cflags --- compile options for the pattern > + * nmatch, pmatch --- optional return area for match details > + * > + * Both pattern and data are given in the database encoding. We > internally > + * convert to array of pg_wchar which is what Spencer's regex package > wants. > + */ > + static bool > + RE_compile_and_execute(text *text_re, unsigned char *dat, int dat_len, > + int cflags, int nmatch, regmatch_t *pmatch) > + { > + pg_wchar *data; > + size_t data_len; > + int regexec_result; > + regex_t re; > + > + /* Convert data string to wide characters */ > + data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar)); > + data_len = pg_mb2wchar_with_len(dat, data, dat_len); > + > + /* Compile RE */ > + re = RE_compile_and_cache(text_re, cflags); > + > /* Perform RE match and return result */ > regexec_result = pg_regexec(&re_array[0].cre_re, > data, > data_len, > + 0, > NULL, /* no details */ > nmatch, > pmatch, > *************** > *** 415,420 **** > --- 427,452 ---- > } > > PG_RETURN_NULL(); > + } > + > + /* > + * textregexreplace() > + * Return a replace string matched by a regular expression. > + */ > + Datum > + textregexreplace(PG_FUNCTION_ARGS) > + { > + text *s = PG_GETARG_TEXT_P(0); > + text *p = PG_GETARG_TEXT_P(1); > + text *r = PG_GETARG_TEXT_P(2); > + regex_t re; > + > + re = RE_compile_and_cache(p, regex_flavor); > + > + return (DirectFunctionCall3(replace_text_regexp, > + PointerGetDatum(s), > + PointerGetDatum(&re), > + PointerGetDatum(r))); > } > > /* similar_escape() > *** ./src/backend/utils/adt/varlena.c.orig Tue Jul 20 09:00:17 2004 > --- ./src/backend/utils/adt/varlena.c Tue Jul 20 10:23:32 2004 > *************** > *** 28,33 **** > --- 28,34 ---- > #include "utils/builtins.h" > #include "utils/lsyscache.h" > #include "utils/pg_locale.h" > + #include "regex/regex.h" > > > typedef struct varlena unknown; > *************** > *** 1971,1976 **** > --- 1972,2122 ---- > ret_text = PG_STR_GET_TEXT(str->data); > pfree(str->data); > pfree(str); > + > + PG_RETURN_TEXT_P(ret_text); > + } > + > + /* > + * have_escape_in_regexp_replace_str > + * check replace string have escape char > + */ > + static bool > + have_escape_in_regexp_replace_str(const char *replace_str) > + { > + return (strchr(replace_str, '\\') != NULL); > + } > + > + #define REGEXP_REPLACE_BACKREF_CNT 10 > + /* > + * appendStringInfoRegexpSubstr > + * append string for regexp back references. > + */ > + static void > + appendStringInfoRegexpSubstr(StringInfo str, text *replace_text, > + regmatch_t *pmatch, text *buf_text, int search_start) > + { > + const char *pstart = PG_TEXT_GET_STR(replace_text); > + const char *p = pstart; > + const char *pnext; > + > + text *add_text; > + int so; > + int eo; > + > + for(;;) { > + pnext = strchr(p, '\\'); > + if(pnext == NULL) break; > + > + add_text = text_substring(PointerGetDatum(replace_text), > + p - pstart + 1, pnext - p, false); > + appendStringInfoString(str, PG_TEXT_GET_STR(add_text)); > + pfree(add_text); > + > + p = pnext + 1; > + so = eo = -1; > + > + if(*p >= '1' && *p <= '9') { > + int idx = *p - '0'; > + so = pmatch[idx].rm_so - search_start; > + eo = pmatch[idx].rm_eo - search_start; > + p++; > + } else { > + switch(*p) { > + case '&': > + so = pmatch[0].rm_so - search_start; > + eo = pmatch[0].rm_eo - search_start; > + p++; > + break; > + } > + } > + > + if(so != -1 && eo != -1) { > + add_text = text_substring(PointerGetDatum(buf_text), > + so + 1, (eo - so), false); > + appendStringInfoString(str, PG_TEXT_GET_STR(add_text)); > + pfree(add_text); > + } > + } > + > + add_text = text_substring(PointerGetDatum(replace_text), > + p - pstart + 1, -1, true); > + appendStringInfoString(str, PG_TEXT_GET_STR(add_text)); > + pfree(add_text); > + } > + > + /* > + * replace_text_regexp > + * replace text using regexp > + */ > + Datum > + replace_text_regexp(PG_FUNCTION_ARGS) > + { > + text *left_text; > + text *right_text; > + text *buf_text; > + text *ret_text; > + text *src_text = PG_GETARG_TEXT_P(0); > + char *src_text_str = PG_TEXT_GET_STR(src_text); > + int src_text_len = TEXTLEN(src_text); > + regex_t *re = (regex_t *)PG_GETARG_POINTER(1); > + text *replace_text = PG_GETARG_TEXT_P(2); > + char *replace_str = PG_TEXT_GET_STR(replace_text); > + StringInfo str = makeStringInfo(); > + int regexec_result; > + regmatch_t pmatch[REGEXP_REPLACE_BACKREF_CNT]; > + pg_wchar *data; > + size_t data_len; > + int search_start; > + bool have_escape; > + > + buf_text = TEXTDUP(src_text); > + > + /* Convert data string to wide characters */ > + data = (pg_wchar *) palloc((src_text_len + 1) * sizeof(pg_wchar)); > + data_len = pg_mb2wchar_with_len(src_text_str, data, > strlen(src_text_str)); > + > + have_escape = have_escape_in_regexp_replace_str(replace_str); > + > + for(search_start = 0; search_start <= data_len;) { > + regexec_result = pg_regexec(re, > + data, > + data_len, > + search_start, > + NULL, /* no details */ > + REGEXP_REPLACE_BACKREF_CNT, > + pmatch, > + 0); > + if(regexec_result != 0) break; > + > + left_text = text_substring(PointerGetDatum(buf_text), > + 1, pmatch[0].rm_so - search_start, false); > + right_text = text_substring(PointerGetDatum(buf_text), > + pmatch[0].rm_eo - search_start + 1, > + -1, true); > + > + appendStringInfoString(str, PG_TEXT_GET_STR(left_text)); > + if(have_escape) { > + appendStringInfoRegexpSubstr(str, replace_text, pmatch, > + buf_text, search_start); > + } else { > + appendStringInfoString(str, replace_str); > + } > + > + pfree(buf_text); > + pfree(left_text); > + buf_text = right_text; > + > + search_start = pmatch[0].rm_eo; > + if(pmatch[0].rm_so == pmatch[0].rm_eo) search_start++; > + } > + > + appendStringInfoString(str, PG_TEXT_GET_STR(buf_text)); > + pfree(buf_text); > + > + ret_text = PG_STR_GET_TEXT(str->data); > + pfree(str->data); > + pfree(str); > + pfree(data); > > PG_RETURN_TEXT_P(ret_text); > } > *** ./src/include/catalog/pg_proc.h.orig Tue Jul 20 09:24:11 2004 > --- ./src/include/catalog/pg_proc.h Tue Jul 20 09:26:11 2004 > *************** > *** 2186,2191 **** > --- 2186,2193 ---- > DESCR("return portion of string"); > DATA(insert OID = 2087 ( replace PGNSP PGUID 12 f f t f i 3 25 "25 25 > 25" replace_text - _null_ )); > DESCR("replace all occurrences of old_substr with new_substr in string"); > + DATA(insert OID = 2167 ( regexp_replace PGNSP PGUID 12 f f t f i 3 25 > "25 25 25" textregexreplace - _null_ )); > + DESCR("replace text using regexp"); > DATA(insert OID = 2088 ( split_part PGNSP PGUID 12 f f t f i 3 25 "25 > 25 23" split_text - _null_ )); > DESCR("split string by field_sep and return field_num"); > DATA(insert OID = 2089 ( to_hex PGNSP PGUID 12 f f t f i 1 25 "23" > to_hex32 - _null_ )); > *** ./src/include/regex/regex.h.orig Tue Jul 20 08:51:06 2004 > --- ./src/include/regex/regex.h Tue Jul 20 08:51:16 2004 > *************** > *** 163,169 **** > * the prototypes for exported functions > */ > extern int pg_regcomp(regex_t *, const pg_wchar *, size_t, int); > ! extern int pg_regexec(regex_t *, const pg_wchar *, size_t, rm_detail_t *, > size_t, regmatch_t[], int); > extern void pg_regfree(regex_t *); > extern size_t pg_regerror(int, const regex_t *, char *, size_t); > > --- 163,169 ---- > * the prototypes for exported functions > */ > extern int pg_regcomp(regex_t *, const pg_wchar *, size_t, int); > ! extern int pg_regexec(regex_t *, const pg_wchar *, size_t, size_t, > rm_detail_t *, size_t, regmatch_t[], int); > extern void pg_regfree(regex_t *); > extern size_t pg_regerror(int, const regex_t *, char *, size_t); > > *** ./src/include/utils/builtins.h.orig Tue Jul 20 09:11:19 2004 > --- ./src/include/utils/builtins.h Tue Jul 20 09:11:46 2004 > *************** > *** 408,413 **** > --- 408,414 ---- > extern Datum texticregexeq(PG_FUNCTION_ARGS); > extern Datum texticregexne(PG_FUNCTION_ARGS); > extern Datum textregexsubstr(PG_FUNCTION_ARGS); > + extern Datum textregexreplace(PG_FUNCTION_ARGS); > extern Datum similar_escape(PG_FUNCTION_ARGS); > extern const char *assign_regex_flavor(const char *value, > bool doit, bool interactive); > *************** > *** 537,542 **** > --- 538,544 ---- > extern bool SplitIdentifierString(char *rawstring, char separator, > List **namelist); > extern Datum replace_text(PG_FUNCTION_ARGS); > + extern Datum replace_text_regexp(PG_FUNCTION_ARGS); > extern Datum split_text(PG_FUNCTION_ARGS); > extern Datum text_to_array(PG_FUNCTION_ARGS); > extern Datum array_to_text(PG_FUNCTION_ARGS); > > __________________________________________________ > Do You Yahoo!? > http://bb.yahoo.co.jp/ > > > ---------------------------(end of broadcast)--------------------------- > TIP 3: if posting/reading through Usenet, please send an appropriate > subscribe-nomail command to majordomo@postgresql.org so that your > message can get through to the mailing list cleanly > -- Bruce Momjian | http://candle.pha.pa.us pgman@candle.pha.pa.us | (610) 359-1001 + If your life is a hard drive, | 13 Roberts Road + Christ can be your backup. | Newtown Square, Pennsylvania 19073
pgsql-patches by date: