Re: UTF8MatchText - Mailing list pgsql-patches
From | Andrew Dunstan |
---|---|
Subject | Re: UTF8MatchText |
Date | |
Msg-id | 464D181D.9010307@dunslane.net Whole thread Raw |
In response to | Re: UTF8MatchText (Tom Lane <tgl@sss.pgh.pa.us>) |
Responses |
Re: UTF8MatchText
(Tom Lane <tgl@sss.pgh.pa.us>)
|
List | pgsql-patches |
Tom Lane wrote: > ITAGAKI Takahiro <itagaki.takahiro@oss.ntt.co.jp> writes: > >> Yes, I only used the 'disjoint representations for first-bytes and >> not-first-bytes of MB characters' feature in UTF8. Other encodings >> allows both [AB] and [BA] for MB character patterns. UTF8Match() does >> not cope with those encodings; If we have '[AB][AB]' in a table and >> search it with LIKE '%[BA]%', we judge that they are matched by mistake. >> > > AFAICS, the patch does *not* make that mistake because % will not > advance over a fractional character. > > Yeah, I think that's right. Attached is my current WIP patch. If we decide that this optimisation can in fact be applied to all backend encodings, that will be easily incorporated. It will simplify the code further. Note that all the common code in the MatchText and do_like_escape functions has been factored - and the bytea functions just call the single-byte text versions - AFAICS the effect will be identical to having the specialised versions. (I'm always happy when code volume can be reduced.) cheers andrew Index: src/backend/utils/adt/like.c =================================================================== RCS file: /cvsroot/pgsql/src/backend/utils/adt/like.c,v retrieving revision 1.68 diff -c -r1.68 like.c *** src/backend/utils/adt/like.c 27 Feb 2007 23:48:08 -0000 1.68 --- src/backend/utils/adt/like.c 18 May 2007 02:47:41 -0000 *************** *** 28,48 **** #define LIKE_ABORT (-1) ! static int MatchText(char *t, int tlen, char *p, int plen); ! static int MatchTextIC(char *t, int tlen, char *p, int plen); ! static int MatchBytea(char *t, int tlen, char *p, int plen); ! static text *do_like_escape(text *, text *); ! static int MBMatchText(char *t, int tlen, char *p, int plen); ! static int MBMatchTextIC(char *t, int tlen, char *p, int plen); static text *MB_do_like_escape(text *, text *); /*-------------------- * Support routine for MatchText. Compares given multibyte streams * as wide characters. 
If they match, returns 1 otherwise returns 0. *-------------------- */ ! static int wchareq(char *p1, char *p2) { int p1_len; --- 28,50 ---- #define LIKE_ABORT (-1) ! static int SB_MatchText(char *t, int tlen, char *p, int plen); ! static int SB_MatchTextIC(char *t, int tlen, char *p, int plen); ! static text *SB_do_like_escape(text *, text *); ! static int MB_MatchText(char *t, int tlen, char *p, int plen); static text *MB_do_like_escape(text *, text *); + static int UTF8_MatchText(char *t, int tlen, char *p, int plen); + static int GenericMatchText(char *s, int slen, char* p, int plen); + static int mbtexticlike(text *str, text *pat); + /*-------------------- * Support routine for MatchText. Compares given multibyte streams * as wide characters. If they match, returns 1 otherwise returns 0. *-------------------- */ ! static __inline__ int wchareq(char *p1, char *p2) { int p1_len; *************** *** 72,86 **** * of getting a single character transformed to the system's wchar_t format. * So now, we just downcase the strings using lower() and apply regular LIKE * comparison. This should be revisited when we install better locale support. - * - * Note that MBMatchText and MBMatchTextIC do exactly the same thing now. - * Is it worth refactoring to avoid duplicated code? They might become - * different again in the future. */ /* Set up to compile like_match.c for multibyte characters */ #define CHAREQ(p1, p2) wchareq(p1, p2) - #define ICHAREQ(p1, p2) wchareq(p1, p2) #define NextChar(p, plen) \ do { int __l = pg_mblen(p); (p) +=__l; (plen) -=__l; } while (0) #define CopyAdvChar(dst, src, srclen) \ --- 74,86 ---- * of getting a single character transformed to the system's wchar_t format. * So now, we just downcase the strings using lower() and apply regular LIKE * comparison. This should be revisited when we install better locale support. 
*/ + #define NextByte(p, plen) ((p)++, (plen)--) + #define BYTEEQ(p1, p2) (*(p1) == *(p2)) + /* Set up to compile like_match.c for multibyte characters */ #define CHAREQ(p1, p2) wchareq(p1, p2) #define NextChar(p, plen) \ do { int __l = pg_mblen(p); (p) +=__l; (plen) -=__l; } while (0) #define CopyAdvChar(dst, src, srclen) \ *************** *** 90,122 **** *(dst)++ = *(src)++; \ } while (0) ! #define MatchText MBMatchText ! #define MatchTextIC MBMatchTextIC #define do_like_escape MB_do_like_escape #include "like_match.c" - #undef CHAREQ - #undef ICHAREQ - #undef NextChar - #undef CopyAdvChar - #undef MatchText - #undef MatchTextIC - #undef do_like_escape - /* Set up to compile like_match.c for single-byte characters */ ! #define CHAREQ(p1, p2) (*(p1) == *(p2)) ! #define ICHAREQ(p1, p2) (tolower((unsigned char) *(p1)) == tolower((unsigned char) *(p2))) ! #define NextChar(p, plen) ((p)++, (plen)--) #define CopyAdvChar(dst, src, srclen) (*(dst)++ = *(src)++, (srclen)--) #include "like_match.c" - /* And some support for BYTEA */ - #define BYTEA_CHAREQ(p1, p2) (*(p1) == *(p2)) - #define BYTEA_NextChar(p, plen) ((p)++, (plen)--) - #define BYTEA_CopyAdvChar(dst, src, srclen) (*(dst)++ = *(src)++, (srclen)--) /* * interface routines called by the function manager --- 90,170 ---- *(dst)++ = *(src)++; \ } while (0) ! #define MatchText MB_MatchText #define do_like_escape MB_do_like_escape #include "like_match.c" /* Set up to compile like_match.c for single-byte characters */ ! #define CHAREQ(p1, p2) BYTEEQ(p1, p2) ! 
#define NextChar(p, plen) NextByte(p, plen) #define CopyAdvChar(dst, src, srclen) (*(dst)++ = *(src)++, (srclen)--) + #define MatchText SB_MatchText + #define do_like_escape SB_do_like_escape + + #include "like_match.c" + + /* set up to compile like_match.c for single byte case insensitive matching */ + + #define CHAREQ(p1, p2) (tolower((unsigned char) *(p1)) == tolower((unsigned char) *(p2))) + #define NextChar(p, plen) NextByte(p, plen) + #define CopyAdvChar(dst, src, srclen) (*(dst)++ = *(src)++, (srclen)--) + + #define MatchText SB_MatchTextIC + + #include "like_match.c" + + /* set up for UTF8 match optimisation */ + + #define CHAREQ(p1, p2) wchareq(p1, p2) + #define NextChar(p, plen) \ + do { int __l = pg_mblen(p); (p) +=__l; (plen) -=__l; } while (0) + #define CopyAdvChar(dst, src, srclen) \ + do { int __l = pg_mblen(src); \ + (srclen) -= __l; \ + while (__l-- > 0) \ + *(dst)++ = *(src)++; \ + } while (0) + + #define MatchText UTF8_MatchText + #define UTF8_OPT + #include "like_match.c" + static __inline__ int + GenericMatchText(char *s, int slen, char* p, int plen) + { + if (pg_database_encoding_max_length() == 1) + return SB_MatchText(s, slen, p, plen); + else if (GetDatabaseEncoding() == PG_UTF8) + return UTF8_MatchText(s, slen, p, plen); + else + return MB_MatchText(s, slen, p, plen); + } + + static __inline__ int + mbtexticlike(text *str, text *pat) + { + char *s, + *p; + int slen, + plen; + + /* Force inputs to lower case to achieve case insensitivity */ + str = DatumGetTextP(DirectFunctionCall1(lower, PointerGetDatum(str))); + pat = DatumGetTextP(DirectFunctionCall1(lower, PointerGetDatum(pat))); + s = VARDATA(str); + slen = (VARSIZE(str) - VARHDRSZ); + p = VARDATA(pat); + plen = (VARSIZE(pat) - VARHDRSZ); + + if (GetDatabaseEncoding() == PG_UTF8) + return UTF8_MatchText(s, slen, p, plen); + else + return MB_MatchText(s, slen, p, plen); + } /* * interface routines called by the function manager *************** *** 138,147 **** p = VARDATA(pat); plen = 
(VARSIZE(pat) - VARHDRSZ); ! if (pg_database_encoding_max_length() == 1) ! result = (MatchText(s, slen, p, plen) == LIKE_TRUE); ! else ! result = (MBMatchText(s, slen, p, plen) == LIKE_TRUE); PG_RETURN_BOOL(result); } --- 186,192 ---- p = VARDATA(pat); plen = (VARSIZE(pat) - VARHDRSZ); ! result = (GenericMatchText(s, slen, p, plen) == LIKE_TRUE); PG_RETURN_BOOL(result); } *************** *** 162,171 **** p = VARDATA(pat); plen = (VARSIZE(pat) - VARHDRSZ); ! if (pg_database_encoding_max_length() == 1) ! result = (MatchText(s, slen, p, plen) != LIKE_TRUE); ! else ! result = (MBMatchText(s, slen, p, plen) != LIKE_TRUE); PG_RETURN_BOOL(result); } --- 207,213 ---- p = VARDATA(pat); plen = (VARSIZE(pat) - VARHDRSZ); ! result = (GenericMatchText(s, slen, p, plen) != LIKE_TRUE); PG_RETURN_BOOL(result); } *************** *** 186,195 **** p = VARDATA(pat); plen = (VARSIZE(pat) - VARHDRSZ); ! if (pg_database_encoding_max_length() == 1) ! result = (MatchText(s, slen, p, plen) == LIKE_TRUE); ! else ! result = (MBMatchText(s, slen, p, plen) == LIKE_TRUE); PG_RETURN_BOOL(result); } --- 228,234 ---- p = VARDATA(pat); plen = (VARSIZE(pat) - VARHDRSZ); ! result = (GenericMatchText(s, slen, p, plen) == LIKE_TRUE); PG_RETURN_BOOL(result); } *************** *** 210,219 **** p = VARDATA(pat); plen = (VARSIZE(pat) - VARHDRSZ); ! if (pg_database_encoding_max_length() == 1) ! result = (MatchText(s, slen, p, plen) != LIKE_TRUE); ! else ! result = (MBMatchText(s, slen, p, plen) != LIKE_TRUE); PG_RETURN_BOOL(result); } --- 249,255 ---- p = VARDATA(pat); plen = (VARSIZE(pat) - VARHDRSZ); ! result = (GenericMatchText(s, slen, p, plen) != LIKE_TRUE); PG_RETURN_BOOL(result); } *************** *** 234,240 **** p = VARDATA(pat); plen = (VARSIZE(pat) - VARHDRSZ); ! result = (MatchBytea(s, slen, p, plen) == LIKE_TRUE); PG_RETURN_BOOL(result); } --- 270,276 ---- p = VARDATA(pat); plen = (VARSIZE(pat) - VARHDRSZ); ! 
result = (SB_MatchText(s, slen, p, plen) == LIKE_TRUE); PG_RETURN_BOOL(result); } *************** *** 255,261 **** p = VARDATA(pat); plen = (VARSIZE(pat) - VARHDRSZ); ! result = (MatchBytea(s, slen, p, plen) != LIKE_TRUE); PG_RETURN_BOOL(result); } --- 291,297 ---- p = VARDATA(pat); plen = (VARSIZE(pat) - VARHDRSZ); ! result = (SB_MatchText(s, slen, p, plen) != LIKE_TRUE); PG_RETURN_BOOL(result); } *************** *** 281,305 **** slen = strlen(s); p = VARDATA(pat); plen = (VARSIZE(pat) - VARHDRSZ); ! result = (MatchTextIC(s, slen, p, plen) == LIKE_TRUE); } else { - /* Force inputs to lower case to achieve case insensitivity */ text *strtext; strtext = DatumGetTextP(DirectFunctionCall1(name_text, NameGetDatum(str))); ! strtext = DatumGetTextP(DirectFunctionCall1(lower, ! PointerGetDatum(strtext))); ! pat = DatumGetTextP(DirectFunctionCall1(lower, ! PointerGetDatum(pat))); ! ! s = VARDATA(strtext); ! slen = (VARSIZE(strtext) - VARHDRSZ); ! p = VARDATA(pat); ! plen = (VARSIZE(pat) - VARHDRSZ); ! result = (MBMatchTextIC(s, slen, p, plen) == LIKE_TRUE); } PG_RETURN_BOOL(result); --- 317,331 ---- slen = strlen(s); p = VARDATA(pat); plen = (VARSIZE(pat) - VARHDRSZ); ! result = (SB_MatchTextIC(s, slen, p, plen) == LIKE_TRUE); } else { text *strtext; strtext = DatumGetTextP(DirectFunctionCall1(name_text, NameGetDatum(str))); ! result = (mbtexticlike(strtext, pat) == LIKE_TRUE); } PG_RETURN_BOOL(result); *************** *** 322,346 **** slen = strlen(s); p = VARDATA(pat); plen = (VARSIZE(pat) - VARHDRSZ); ! result = (MatchTextIC(s, slen, p, plen) != LIKE_TRUE); } else { - /* Force inputs to lower case to achieve case insensitivity */ text *strtext; strtext = DatumGetTextP(DirectFunctionCall1(name_text, NameGetDatum(str))); ! strtext = DatumGetTextP(DirectFunctionCall1(lower, ! PointerGetDatum(strtext))); ! pat = DatumGetTextP(DirectFunctionCall1(lower, ! PointerGetDatum(pat))); ! ! s = VARDATA(strtext); ! slen = (VARSIZE(strtext) - VARHDRSZ); ! p = VARDATA(pat); ! 
plen = (VARSIZE(pat) - VARHDRSZ); ! result = (MBMatchTextIC(s, slen, p, plen) != LIKE_TRUE); } PG_RETURN_BOOL(result); --- 348,362 ---- slen = strlen(s); p = VARDATA(pat); plen = (VARSIZE(pat) - VARHDRSZ); ! result = (SB_MatchTextIC(s, slen, p, plen) != LIKE_TRUE); } else { text *strtext; strtext = DatumGetTextP(DirectFunctionCall1(name_text, NameGetDatum(str))); ! result = (mbtexticlike(strtext, pat) != LIKE_TRUE); } PG_RETURN_BOOL(result); *************** *** 363,383 **** slen = (VARSIZE(str) - VARHDRSZ); p = VARDATA(pat); plen = (VARSIZE(pat) - VARHDRSZ); ! result = (MatchTextIC(s, slen, p, plen) == LIKE_TRUE); } else ! { ! /* Force inputs to lower case to achieve case insensitivity */ ! str = DatumGetTextP(DirectFunctionCall1(lower, ! PointerGetDatum(str))); ! pat = DatumGetTextP(DirectFunctionCall1(lower, ! PointerGetDatum(pat))); ! s = VARDATA(str); ! slen = (VARSIZE(str) - VARHDRSZ); ! p = VARDATA(pat); ! plen = (VARSIZE(pat) - VARHDRSZ); ! result = (MBMatchTextIC(s, slen, p, plen) == LIKE_TRUE); ! } PG_RETURN_BOOL(result); } --- 379,388 ---- slen = (VARSIZE(str) - VARHDRSZ); p = VARDATA(pat); plen = (VARSIZE(pat) - VARHDRSZ); ! result = (SB_MatchTextIC(s, slen, p, plen) == LIKE_TRUE); } else ! result = (mbtexticlike(str, pat) == LIKE_TRUE); PG_RETURN_BOOL(result); } *************** *** 399,419 **** slen = (VARSIZE(str) - VARHDRSZ); p = VARDATA(pat); plen = (VARSIZE(pat) - VARHDRSZ); ! result = (MatchTextIC(s, slen, p, plen) != LIKE_TRUE); } else ! { ! /* Force inputs to lower case to achieve case insensitivity */ ! str = DatumGetTextP(DirectFunctionCall1(lower, ! PointerGetDatum(str))); ! pat = DatumGetTextP(DirectFunctionCall1(lower, ! PointerGetDatum(pat))); ! s = VARDATA(str); ! slen = (VARSIZE(str) - VARHDRSZ); ! p = VARDATA(pat); ! plen = (VARSIZE(pat) - VARHDRSZ); ! result = (MBMatchTextIC(s, slen, p, plen) != LIKE_TRUE); ! 
} PG_RETURN_BOOL(result); } --- 404,413 ---- slen = (VARSIZE(str) - VARHDRSZ); p = VARDATA(pat); plen = (VARSIZE(pat) - VARHDRSZ); ! result = (SB_MatchTextIC(s, slen, p, plen) != LIKE_TRUE); } else ! result = (mbtexticlike(str, pat) != LIKE_TRUE); PG_RETURN_BOOL(result); } *************** *** 430,436 **** text *result; if (pg_database_encoding_max_length() == 1) ! result = do_like_escape(pat, esc); else result = MB_do_like_escape(pat, esc); --- 424,430 ---- text *result; if (pg_database_encoding_max_length() == 1) ! result = SB_do_like_escape(pat, esc); else result = MB_do_like_escape(pat, esc); *************** *** 446,624 **** { bytea *pat = PG_GETARG_BYTEA_P(0); bytea *esc = PG_GETARG_BYTEA_P(1); ! bytea *result; ! char *p, ! *e, ! *r; ! int plen, ! elen; ! bool afterescape; ! ! p = VARDATA(pat); ! plen = (VARSIZE(pat) - VARHDRSZ); ! e = VARDATA(esc); ! elen = (VARSIZE(esc) - VARHDRSZ); ! ! /* ! * Worst-case pattern growth is 2x --- unlikely, but it's hardly worth ! * trying to calculate the size more accurately than that. ! */ ! result = (text *) palloc(plen * 2 + VARHDRSZ); ! r = VARDATA(result); ! ! if (elen == 0) ! { ! /* ! * No escape character is wanted. Double any backslashes in the ! * pattern to make them act like ordinary characters. ! */ ! while (plen > 0) ! { ! if (*p == '\\') ! *r++ = '\\'; ! BYTEA_CopyAdvChar(r, p, plen); ! } ! } ! else ! { ! /* ! * The specified escape must be only a single character. ! */ ! BYTEA_NextChar(e, elen); ! if (elen != 0) ! ereport(ERROR, ! (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE), ! errmsg("invalid escape string"), ! errhint("Escape string must be empty or one character."))); ! ! e = VARDATA(esc); ! ! /* ! * If specified escape is '\', just copy the pattern as-is. ! */ ! if (*e == '\\') ! { ! memcpy(result, pat, VARSIZE(pat)); ! PG_RETURN_BYTEA_P(result); ! } ! ! /* ! * Otherwise, convert occurrences of the specified escape character to ! * '\', and double occurrences of '\' --- unless they immediately ! 
* follow an escape character! ! */ ! afterescape = false; ! while (plen > 0) ! { ! if (BYTEA_CHAREQ(p, e) && !afterescape) ! { ! *r++ = '\\'; ! BYTEA_NextChar(p, plen); ! afterescape = true; ! } ! else if (*p == '\\') ! { ! *r++ = '\\'; ! if (!afterescape) ! *r++ = '\\'; ! BYTEA_NextChar(p, plen); ! afterescape = false; ! } ! else ! { ! BYTEA_CopyAdvChar(r, p, plen); ! afterescape = false; ! } ! } ! } ! ! SET_VARSIZE(result, r - ((char *) result)); ! PG_RETURN_BYTEA_P(result); } - /* - * Same as above, but specifically for bytea (binary) datatype - */ - static int - MatchBytea(char *t, int tlen, char *p, int plen) - { - /* Fast path for match-everything pattern */ - if ((plen == 1) && (*p == '%')) - return LIKE_TRUE; - - while ((tlen > 0) && (plen > 0)) - { - if (*p == '\\') - { - /* Next pattern char must match literally, whatever it is */ - BYTEA_NextChar(p, plen); - if ((plen <= 0) || !BYTEA_CHAREQ(t, p)) - return LIKE_FALSE; - } - else if (*p == '%') - { - /* %% is the same as % according to the SQL standard */ - /* Advance past all %'s */ - while ((plen > 0) && (*p == '%')) - BYTEA_NextChar(p, plen); - /* Trailing percent matches everything. */ - if (plen <= 0) - return LIKE_TRUE; - - /* - * Otherwise, scan for a text position at which we can match the - * rest of the pattern. - */ - while (tlen > 0) - { - /* - * Optimization to prevent most recursion: don't recurse - * unless first pattern char might match this text char. - */ - if (BYTEA_CHAREQ(t, p) || (*p == '\\') || (*p == '_')) - { - int matched = MatchBytea(t, tlen, p, plen); - - if (matched != LIKE_FALSE) - return matched; /* TRUE or ABORT */ - } - - BYTEA_NextChar(t, tlen); - } - - /* - * End of text with no match, so no point in trying later places - * to start matching this pattern. - */ - return LIKE_ABORT; - } - else if ((*p != '_') && !BYTEA_CHAREQ(t, p)) - { - /* - * Not the single-character wildcard and no explicit match? Then - * time to quit... 
- */ - return LIKE_FALSE; - } - - BYTEA_NextChar(t, tlen); - BYTEA_NextChar(p, plen); - } - - if (tlen > 0) - return LIKE_FALSE; /* end of pattern, but not of text */ - - /* End of input string. Do we have matching pattern remaining? */ - while ((plen > 0) && (*p == '%')) /* allow multiple %'s at end of - * pattern */ - BYTEA_NextChar(p, plen); - if (plen <= 0) - return LIKE_TRUE; - - /* - * End of text with no match, so no point in trying later places to start - * matching this pattern. - */ - return LIKE_ABORT; - } /* MatchBytea() */ --- 440,447 ---- { bytea *pat = PG_GETARG_BYTEA_P(0); bytea *esc = PG_GETARG_BYTEA_P(1); ! bytea *result = SB_do_like_escape((text *)pat, (text *)esc); ! PG_RETURN_BYTEA_P((bytea *)result); } Index: src/backend/utils/adt/like_match.c =================================================================== RCS file: /cvsroot/pgsql/src/backend/utils/adt/like_match.c,v retrieving revision 1.15 diff -c -r1.15 like_match.c *** src/backend/utils/adt/like_match.c 27 Feb 2007 23:48:08 -0000 1.15 --- src/backend/utils/adt/like_match.c 18 May 2007 02:47:41 -0000 *************** *** 9,19 **** * Before the inclusion, we need to define following macros: * * CHAREQ - * ICHAREQ * NextChar * CopyAdvChar * MatchText (MBMatchText) - * MatchTextIC (MBMatchTextIC) * do_like_escape (MB_do_like_escape) * * Copyright (c) 1996-2007, PostgreSQL Global Development Group --- 9,17 ---- *************** *** 82,88 **** if (*p == '\\') { /* Next pattern char must match literally, whatever it is */ ! NextChar(p, plen); if ((plen <= 0) || !CHAREQ(t, p)) return LIKE_FALSE; } --- 80,86 ---- if (*p == '\\') { /* Next pattern char must match literally, whatever it is */ ! NextByte(p, plen); if ((plen <= 0) || !CHAREQ(t, p)) return LIKE_FALSE; } *************** *** 91,97 **** /* %% is the same as % according to the SQL standard */ /* Advance past all %'s */ while ((plen > 0) && (*p == '%')) ! NextChar(p, plen); /* Trailing percent matches everything. 
*/ if (plen <= 0) return LIKE_TRUE; --- 89,95 ---- /* %% is the same as % according to the SQL standard */ /* Advance past all %'s */ while ((plen > 0) && (*p == '%')) ! NextByte(p, plen); /* Trailing percent matches everything. */ if (plen <= 0) return LIKE_TRUE; *************** *** 123,129 **** */ return LIKE_ABORT; } ! else if ((*p != '_') && !CHAREQ(t, p)) { /* * Not the single-character wildcard and no explicit match? Then --- 121,146 ---- */ return LIKE_ABORT; } ! #ifdef UTF8_OPT ! /* ! * UTF8 is optimised to do byte at a time matching in most cases, ! * thus saving expensive calls to NextChar. ! * ! * UTF8 has disjoint representations for first-bytes and ! * not-first-bytes of MB characters, and thus it is ! * impossible to make a false match in which an MB pattern ! * character is matched to the end of one data character ! * plus the start of another. ! * In character sets without that property, we have to use the ! * slow way to ensure we don't make out-of-sync matches. ! */ ! else if (*p == '_') ! { ! NextChar(t, tlen); ! NextByte(p, plen); ! continue; ! } ! else if (!BYTEEQ(t, p)) { /* * Not the single-character wildcard and no explicit match? Then *************** *** 132,215 **** return LIKE_FALSE; } ! NextChar(t, tlen); ! NextChar(p, plen); ! } ! ! if (tlen > 0) ! return LIKE_FALSE; /* end of pattern, but not of text */ ! ! /* End of input string. Do we have matching pattern remaining? */ ! while ((plen > 0) && (*p == '%')) /* allow multiple %'s at end of ! * pattern */ ! NextChar(p, plen); ! if (plen <= 0) ! return LIKE_TRUE; ! ! /* ! * End of text with no match, so no point in trying later places to start ! * matching this pattern. ! */ ! return LIKE_ABORT; ! } /* MatchText() */ ! ! /* ! * Same as above, but ignore case ! */ ! static int ! MatchTextIC(char *t, int tlen, char *p, int plen) ! { ! /* Fast path for match-everything pattern */ ! if ((plen == 1) && (*p == '%')) ! return LIKE_TRUE; ! ! while ((tlen > 0) && (plen > 0)) ! { ! 
if (*p == '\\') ! { ! /* Next pattern char must match literally, whatever it is */ ! NextChar(p, plen); ! if ((plen <= 0) || !ICHAREQ(t, p)) ! return LIKE_FALSE; ! } ! else if (*p == '%') ! { ! /* %% is the same as % according to the SQL standard */ ! /* Advance past all %'s */ ! while ((plen > 0) && (*p == '%')) ! NextChar(p, plen); ! /* Trailing percent matches everything. */ ! if (plen <= 0) ! return LIKE_TRUE; ! ! /* ! * Otherwise, scan for a text position at which we can match the ! * rest of the pattern. ! */ ! while (tlen > 0) ! { ! /* ! * Optimization to prevent most recursion: don't recurse ! * unless first pattern char might match this text char. ! */ ! if (ICHAREQ(t, p) || (*p == '\\') || (*p == '_')) ! { ! int matched = MatchTextIC(t, tlen, p, plen); ! ! if (matched != LIKE_FALSE) ! return matched; /* TRUE or ABORT */ ! } ! ! NextChar(t, tlen); ! } ! ! /* ! * End of text with no match, so no point in trying later places ! * to start matching this pattern. ! */ ! return LIKE_ABORT; ! } ! else if ((*p != '_') && !ICHAREQ(t, p)) { /* * Not the single-character wildcard and no explicit match? Then --- 149,163 ---- return LIKE_FALSE; } ! NextByte(t, tlen); ! NextByte(p, plen); ! #else ! /* ! * Branch for non-utf8 multi-byte charsets and also for single-byte ! * charsets which don't gain any benefit from the above optimisation. ! */ ! ! else if ((*p != '_') && !CHAREQ(t, p)) { /* * Not the single-character wildcard and no explicit match? Then *************** *** 220,225 **** --- 168,175 ---- NextChar(t, tlen); NextChar(p, plen); + + #endif /* UTF8_OPT */ } if (tlen > 0) *************** *** 228,234 **** /* End of input string. Do we have matching pattern remaining? */ while ((plen > 0) && (*p == '%')) /* allow multiple %'s at end of * pattern */ ! 
NextByte(p, plen); if (plen <= 0) return LIKE_TRUE; *************** *** 237,248 **** * matching this pattern. */ return LIKE_ABORT; ! } /* MatchTextIC() */ /* * like_escape() --- given a pattern and an ESCAPE string, * convert the pattern to use Postgres' standard backslash escape convention. */ static text * do_like_escape(text *pat, text *esc) { --- 187,200 ---- * matching this pattern. */ return LIKE_ABORT; ! } /* MatchText() */ /* * like_escape() --- given a pattern and an ESCAPE string, * convert the pattern to use Postgres' standard backslash escape convention. */ + #ifdef do_like_escape + static text * do_like_escape(text *pat, text *esc) { *************** *** 336,338 **** --- 288,304 ---- return result; } + #endif /* do_like_escape */ + + #undef CHAREQ + #undef NextChar + #undef CopyAdvChar + #undef MatchText + + #ifdef do_like_escape + #undef do_like_escape + #endif + + #ifdef UTF8_OPT + #undef UTF8_OPT + #endif
pgsql-patches by date: