regexp_replace - Mailing list pgsql-patches
From | a_ogawa00 |
---|---|
Subject | regexp_replace |
Date | |
Msg-id | PIEMIKOOMKNIJLLLBCBBMEPLCAAA.a_ogawa00@yahoo.co.jp Whole thread Raw |
Responses |
Re: regexp_replace
Re: regexp_replace Re: regexp_replace |
List | pgsql-patches |
This patch provides a new function, regexp_replace. regexp_replace extends the replace function so that the text to be replaced is found with a regular expression. Back references can also be used within the replacement string. (This patch is for PostgreSQL 7.4.3.) Function: regexp_replace(str, pattern, replace_str) Return Type: text Description: Replaces every substring of str that matches pattern. pattern is a regular expression. replace_str is the replacement string, which can use '\1' - '\9' and '\&'. '\1' - '\9' are back references to the n'th subexpression. '\&' is the entire matched string. (example1) select regexp_replace('ABC-DEF', '(\\w+)-(\\w+)', '\\2-\\1') result: DEF-ABC (example2) update tab1 set col1 = regexp_replace(col1, '[A-Z]', ''); --- Atsushi Ogawa a_ogawa@hi-ho.ne.jp --- cut here --- *** ./src/backend/regex/regexec.c.orig Tue Jul 20 08:45:39 2004 --- ./src/backend/regex/regexec.c Tue Jul 20 08:49:36 2004 *************** *** 110,115 **** --- 110,116 ---- regmatch_t *pmatch; rm_detail_t *details; chr *start; /* start of string */ + chr *search_start; /* search start of string */ chr *stop; /* just past end of string */ int err; /* error code if any (0 none) */ regoff_t *mem; /* memory vector for backtracking */ *************** *** 168,173 **** --- 169,175 ---- pg_regexec(regex_t *re, const chr *string, size_t len, + size_t search_start, rm_detail_t *details, size_t nmatch, regmatch_t pmatch[], *************** *** 219,224 **** --- 221,227 ---- v->pmatch = pmatch; v->details = details; v->start = (chr *) string; + v->search_start = (chr *) string + search_start; v->stop = (chr *) string + len; v->err = 0; if (backref) *************** *** 288,294 **** NOERR(); MDEBUG(("\nsearch at %ld\n", LOFF(v->start))); cold = NULL; ! close = shortest(v, s, v->start, v->start, v->stop, &cold, (int *) NULL); freedfa(s); NOERR(); if (v->g->cflags & REG_EXPECT) --- 291,298 ---- NOERR(); MDEBUG(("\nsearch at %ld\n", LOFF(v->start))); cold = NULL; ! close = shortest(v, s, v->search_start, v->search_start, v->stop, ! 
&cold, (int *) NULL); freedfa(s); NOERR(); if (v->g->cflags & REG_EXPECT) *************** *** 415,421 **** assert(d != NULL && s != NULL); cold = NULL; ! close = v->start; do { MDEBUG(("\ncsearch at %ld\n", LOFF(close))); --- 419,425 ---- assert(d != NULL && s != NULL); cold = NULL; ! close = v->search_start; do { MDEBUG(("\ncsearch at %ld\n", LOFF(close))); *** ./src/backend/utils/adt/regexp.c.orig Tue Jul 20 08:50:08 2004 --- ./src/backend/utils/adt/regexp.c Tue Jul 20 09:00:05 2004 *************** *** 80,116 **** /* ! * RE_compile_and_execute - compile and execute a RE, caching if possible * ! * Returns TRUE on match, FALSE on no match * ! * text_re --- the pattern, expressed as an *untoasted* TEXT object ! * dat --- the data to match against (need not be null-terminated) ! * dat_len --- the length of the data string ! * cflags --- compile options for the pattern ! * nmatch, pmatch --- optional return area for match details * ! * Both pattern and data are given in the database encoding. We internally ! * convert to array of pg_wchar which is what Spencer's regex package wants. */ ! static bool ! RE_compile_and_execute(text *text_re, unsigned char *dat, int dat_len, ! int cflags, int nmatch, regmatch_t *pmatch) { int text_re_len = VARSIZE(text_re); - pg_wchar *data; - size_t data_len; pg_wchar *pattern; size_t pattern_len; int i; int regcomp_result; - int regexec_result; cached_re_str re_temp; - /* Convert data string to wide characters */ - data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar)); - data_len = pg_mb2wchar_with_len(dat, data, dat_len); - /* * Look for a match among previously compiled REs. Since the data * structure is self-organizing with most-used entries at the front, --- 80,105 ---- /* ! * RE_compile_and_cache - compile a RE, caching if possible * ! * Returns regex_t * ! * text_re --- the pattern, expressed as an *untoasted* TEXT object ! * cflags --- compile options for the pattern * ! * Pattern is given in the database encoding. 
We internally convert to ! * array of pg_wchar which is what Spencer's regex package wants. */ ! static regex_t ! RE_compile_and_cache(text *text_re, int cflags) { int text_re_len = VARSIZE(text_re); pg_wchar *pattern; size_t pattern_len; int i; int regcomp_result; cached_re_str re_temp; /* * Look for a match among previously compiled REs. Since the data * structure is self-organizing with most-used entries at the front, *************** *** 132,149 **** re_array[0] = re_temp; } ! /* Perform RE match and return result */ ! regexec_result = pg_regexec(&re_array[0].cre_re, ! data, ! data_len, ! NULL, /* no details */ ! nmatch, ! pmatch, ! 0); ! ! pfree(data); ! ! return (regexec_result == 0); } } --- 121,127 ---- re_array[0] = re_temp; } ! return re_array[0].cre_re; } } *************** *** 210,219 **** --- 188,231 ---- re_array[0] = re_temp; num_res++; + return re_array[0].cre_re; + } + + /* + * RE_compile_and_execute - compile and execute a RE, caching if possible + * + * Returns TRUE on match, FALSE on no match + * + * text_re --- the pattern, expressed as an *untoasted* TEXT object + * dat --- the data to match against (need not be null-terminated) + * dat_len --- the length of the data string + * cflags --- compile options for the pattern + * nmatch, pmatch --- optional return area for match details + * + * Both pattern and data are given in the database encoding. We internally + * convert to array of pg_wchar which is what Spencer's regex package wants. 
+ */ + static bool + RE_compile_and_execute(text *text_re, unsigned char *dat, int dat_len, + int cflags, int nmatch, regmatch_t *pmatch) + { + pg_wchar *data; + size_t data_len; + int regexec_result; + regex_t re; + + /* Convert data string to wide characters */ + data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar)); + data_len = pg_mb2wchar_with_len(dat, data, dat_len); + + /* Compile RE */ + re = RE_compile_and_cache(text_re, cflags); + /* Perform RE match and return result */ regexec_result = pg_regexec(&re_array[0].cre_re, data, data_len, + 0, NULL, /* no details */ nmatch, pmatch, *************** *** 415,420 **** --- 427,452 ---- } PG_RETURN_NULL(); + } + + /* + * textregexreplace() + * Return a replace string matched by a regular expression. + */ + Datum + textregexreplace(PG_FUNCTION_ARGS) + { + text *s = PG_GETARG_TEXT_P(0); + text *p = PG_GETARG_TEXT_P(1); + text *r = PG_GETARG_TEXT_P(2); + regex_t re; + + re = RE_compile_and_cache(p, regex_flavor); + + return (DirectFunctionCall3(replace_text_regexp, + PointerGetDatum(s), + PointerGetDatum(&re), + PointerGetDatum(r))); } /* similar_escape() *** ./src/backend/utils/adt/varlena.c.orig Tue Jul 20 09:00:17 2004 --- ./src/backend/utils/adt/varlena.c Tue Jul 20 10:23:32 2004 *************** *** 28,33 **** --- 28,34 ---- #include "utils/builtins.h" #include "utils/lsyscache.h" #include "utils/pg_locale.h" + #include "regex/regex.h" typedef struct varlena unknown; *************** *** 1971,1976 **** --- 1972,2122 ---- ret_text = PG_STR_GET_TEXT(str->data); pfree(str->data); pfree(str); + + PG_RETURN_TEXT_P(ret_text); + } + + /* + * have_escape_in_regexp_replace_str + * check replace string have escape char + */ + static bool + have_escape_in_regexp_replace_str(const char *replace_str) + { + return (strchr(replace_str, '\\') != NULL); + } + + #define REGEXP_REPLACE_BACKREF_CNT 10 + /* + * appendStringInfoRegexpSubstr + * append string for regexp back references. 
+ */ + static void + appendStringInfoRegexpSubstr(StringInfo str, text *replace_text, + regmatch_t *pmatch, text *buf_text, int search_start) + { + const char *pstart = PG_TEXT_GET_STR(replace_text); + const char *p = pstart; + const char *pnext; + + text *add_text; + int so; + int eo; + + for(;;) { + pnext = strchr(p, '\\'); + if(pnext == NULL) break; + + add_text = text_substring(PointerGetDatum(replace_text), + p - pstart + 1, pnext - p, false); + appendStringInfoString(str, PG_TEXT_GET_STR(add_text)); + pfree(add_text); + + p = pnext + 1; + so = eo = -1; + + if(*p >= '1' && *p <= '9') { + int idx = *p - '0'; + so = pmatch[idx].rm_so - search_start; + eo = pmatch[idx].rm_eo - search_start; + p++; + } else { + switch(*p) { + case '&': + so = pmatch[0].rm_so - search_start; + eo = pmatch[0].rm_eo - search_start; + p++; + break; + } + } + + if(so != -1 && eo != -1) { + add_text = text_substring(PointerGetDatum(buf_text), + so + 1, (eo - so), false); + appendStringInfoString(str, PG_TEXT_GET_STR(add_text)); + pfree(add_text); + } + } + + add_text = text_substring(PointerGetDatum(replace_text), + p - pstart + 1, -1, true); + appendStringInfoString(str, PG_TEXT_GET_STR(add_text)); + pfree(add_text); + } + + /* + * replace_text_regexp + * replace text using regexp + */ + Datum + replace_text_regexp(PG_FUNCTION_ARGS) + { + text *left_text; + text *right_text; + text *buf_text; + text *ret_text; + text *src_text = PG_GETARG_TEXT_P(0); + char *src_text_str = PG_TEXT_GET_STR(src_text); + int src_text_len = TEXTLEN(src_text); + regex_t *re = (regex_t *)PG_GETARG_POINTER(1); + text *replace_text = PG_GETARG_TEXT_P(2); + char *replace_str = PG_TEXT_GET_STR(replace_text); + StringInfo str = makeStringInfo(); + int regexec_result; + regmatch_t pmatch[REGEXP_REPLACE_BACKREF_CNT]; + pg_wchar *data; + size_t data_len; + int search_start; + bool have_escape; + + buf_text = TEXTDUP(src_text); + + /* Convert data string to wide characters */ + data = (pg_wchar *) palloc((src_text_len 
+ 1) * sizeof(pg_wchar)); + data_len = pg_mb2wchar_with_len(src_text_str, data, strlen(src_text_str)); + + have_escape = have_escape_in_regexp_replace_str(replace_str); + + for(search_start = 0; search_start <= data_len;) { + regexec_result = pg_regexec(re, + data, + data_len, + search_start, + NULL, /* no details */ + REGEXP_REPLACE_BACKREF_CNT, + pmatch, + 0); + if(regexec_result != 0) break; + + left_text = text_substring(PointerGetDatum(buf_text), + 1, pmatch[0].rm_so - search_start, false); + right_text = text_substring(PointerGetDatum(buf_text), + pmatch[0].rm_eo - search_start + 1, + -1, true); + + appendStringInfoString(str, PG_TEXT_GET_STR(left_text)); + if(have_escape) { + appendStringInfoRegexpSubstr(str, replace_text, pmatch, + buf_text, search_start); + } else { + appendStringInfoString(str, replace_str); + } + + pfree(buf_text); + pfree(left_text); + buf_text = right_text; + + search_start = pmatch[0].rm_eo; + if(pmatch[0].rm_so == pmatch[0].rm_eo) search_start++; + } + + appendStringInfoString(str, PG_TEXT_GET_STR(buf_text)); + pfree(buf_text); + + ret_text = PG_STR_GET_TEXT(str->data); + pfree(str->data); + pfree(str); + pfree(data); PG_RETURN_TEXT_P(ret_text); } *** ./src/include/catalog/pg_proc.h.orig Tue Jul 20 09:24:11 2004 --- ./src/include/catalog/pg_proc.h Tue Jul 20 09:26:11 2004 *************** *** 2186,2191 **** --- 2186,2193 ---- DESCR("return portion of string"); DATA(insert OID = 2087 ( replace PGNSP PGUID 12 f f t f i 3 25 "25 25 25" replace_text - _null_ )); DESCR("replace all occurrences of old_substr with new_substr in string"); + DATA(insert OID = 2167 ( regexp_replace PGNSP PGUID 12 f f t f i 3 25 "25 25 25" textregexreplace - _null_ )); + DESCR("replace text using regexp"); DATA(insert OID = 2088 ( split_part PGNSP PGUID 12 f f t f i 3 25 "25 25 23" split_text - _null_ )); DESCR("split string by field_sep and return field_num"); DATA(insert OID = 2089 ( to_hex PGNSP PGUID 12 f f t f i 1 25 "23" to_hex32 - _null_ )); *** 
./src/include/regex/regex.h.orig Tue Jul 20 08:51:06 2004 --- ./src/include/regex/regex.h Tue Jul 20 08:51:16 2004 *************** *** 163,169 **** * the prototypes for exported functions */ extern int pg_regcomp(regex_t *, const pg_wchar *, size_t, int); ! extern int pg_regexec(regex_t *, const pg_wchar *, size_t, rm_detail_t *, size_t, regmatch_t[], int); extern void pg_regfree(regex_t *); extern size_t pg_regerror(int, const regex_t *, char *, size_t); --- 163,169 ---- * the prototypes for exported functions */ extern int pg_regcomp(regex_t *, const pg_wchar *, size_t, int); ! extern int pg_regexec(regex_t *, const pg_wchar *, size_t, size_t, rm_detail_t *, size_t, regmatch_t[], int); extern void pg_regfree(regex_t *); extern size_t pg_regerror(int, const regex_t *, char *, size_t); *** ./src/include/utils/builtins.h.orig Tue Jul 20 09:11:19 2004 --- ./src/include/utils/builtins.h Tue Jul 20 09:11:46 2004 *************** *** 408,413 **** --- 408,414 ---- extern Datum texticregexeq(PG_FUNCTION_ARGS); extern Datum texticregexne(PG_FUNCTION_ARGS); extern Datum textregexsubstr(PG_FUNCTION_ARGS); + extern Datum textregexreplace(PG_FUNCTION_ARGS); extern Datum similar_escape(PG_FUNCTION_ARGS); extern const char *assign_regex_flavor(const char *value, bool doit, bool interactive); *************** *** 537,542 **** --- 538,544 ---- extern bool SplitIdentifierString(char *rawstring, char separator, List **namelist); extern Datum replace_text(PG_FUNCTION_ARGS); + extern Datum replace_text_regexp(PG_FUNCTION_ARGS); extern Datum split_text(PG_FUNCTION_ARGS); extern Datum text_to_array(PG_FUNCTION_ARGS); extern Datum array_to_text(PG_FUNCTION_ARGS); __________________________________________________ Do You Yahoo!? http://bb.yahoo.co.jp/
pgsql-patches by date: