Home > mailing lists
Re: UTF8MatchText - Mailing list pgsql-patches

From	Andrew Dunstan
Subject	Re: UTF8MatchText
Date	May 18, 2007 00:06:30
Msg-id	464D181D.9010307@dunslane.net Whole thread Raw
In response to	Re: UTF8MatchText (Tom Lane <tgl@sss.pgh.pa.us>)
Responses	Re: UTF8MatchText
List	pgsql-patches
Tree view

Tom Lane wrote:
> ITAGAKI Takahiro <itagaki.takahiro@oss.ntt.co.jp> writes:
>
>> Yes, I only used the 'disjoint representations for first-bytes and
>> not-first-bytes of MB characters' feature in UTF8. Other encodings
>> allow both [AB] and [BA] for MB character patterns. UTF8Match() does
>> not cope with those encodings; if we have '[AB][AB]' in a table and
>> search it with LIKE '%[BA]%', we mistakenly judge that they match.
>>
>
> AFAICS, the patch does *not* make that mistake because % will not
> advance over a fractional character.
>
>


Yeah, I think that's right.

Attached is my current WIP patch. If we decide that this optimisation
can in fact be applied to all backend encodings, that will be easily
incorporated, and it will simplify the code further. Note that all the
common code in the MatchText and do_like_escape functions has been
factored out, and the bytea functions now just call the single-byte text
versions; AFAICS the effect will be identical to having the specialised
versions. (I'm always happy when code volume can be reduced.)

cheers

andrew

Index: src/backend/utils/adt/like.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/utils/adt/like.c,v
retrieving revision 1.68
diff -c -r1.68 like.c
*** src/backend/utils/adt/like.c    27 Feb 2007 23:48:08 -0000    1.68
--- src/backend/utils/adt/like.c    18 May 2007 02:47:41 -0000
***************
*** 28,48 ****
  #define LIKE_ABORT                        (-1)


! static int    MatchText(char *t, int tlen, char *p, int plen);
! static int    MatchTextIC(char *t, int tlen, char *p, int plen);
! static int    MatchBytea(char *t, int tlen, char *p, int plen);
! static text *do_like_escape(text *, text *);

! static int    MBMatchText(char *t, int tlen, char *p, int plen);
! static int    MBMatchTextIC(char *t, int tlen, char *p, int plen);
  static text *MB_do_like_escape(text *, text *);

  /*--------------------
   * Support routine for MatchText. Compares given multibyte streams
   * as wide characters. If they match, returns 1 otherwise returns 0.
   *--------------------
   */
! static int
  wchareq(char *p1, char *p2)
  {
      int            p1_len;
--- 28,50 ----
  #define LIKE_ABORT                        (-1)


! static int    SB_MatchText(char *t, int tlen, char *p, int plen);
! static int    SB_MatchTextIC(char *t, int tlen, char *p, int plen);
! static text *SB_do_like_escape(text *, text *);

! static int    MB_MatchText(char *t, int tlen, char *p, int plen);
  static text *MB_do_like_escape(text *, text *);

+ static int    UTF8_MatchText(char *t, int tlen, char *p, int plen);
+ static int    GenericMatchText(char *s, int slen, char* p, int plen);
+ static int    mbtexticlike(text *str, text *pat);
+
  /*--------------------
   * Support routine for MatchText. Compares given multibyte streams
   * as wide characters. If they match, returns 1 otherwise returns 0.
   *--------------------
   */
! static __inline__ int
  wchareq(char *p1, char *p2)
  {
      int            p1_len;
***************
*** 72,86 ****
   * of getting a single character transformed to the system's wchar_t format.
   * So now, we just downcase the strings using lower() and apply regular LIKE
   * comparison.    This should be revisited when we install better locale support.
-  *
-  * Note that MBMatchText and MBMatchTextIC do exactly the same thing now.
-  * Is it worth refactoring to avoid duplicated code?  They might become
-  * different again in the future.
   */

  /* Set up to compile like_match.c for multibyte characters */
  #define CHAREQ(p1, p2) wchareq(p1, p2)
- #define ICHAREQ(p1, p2) wchareq(p1, p2)
  #define NextChar(p, plen) \
      do { int __l = pg_mblen(p); (p) +=__l; (plen) -=__l; } while (0)
  #define CopyAdvChar(dst, src, srclen) \
--- 74,86 ----
   * of getting a single character transformed to the system's wchar_t format.
   * So now, we just downcase the strings using lower() and apply regular LIKE
   * comparison.    This should be revisited when we install better locale support.
   */

+ #define NextByte(p, plen)    ((p)++, (plen)--)
+ #define BYTEEQ(p1, p2)        (*(p1) == *(p2))
+
  /* Set up to compile like_match.c for multibyte characters */
  #define CHAREQ(p1, p2) wchareq(p1, p2)
  #define NextChar(p, plen) \
      do { int __l = pg_mblen(p); (p) +=__l; (plen) -=__l; } while (0)
  #define CopyAdvChar(dst, src, srclen) \
***************
*** 90,122 ****
               *(dst)++ = *(src)++; \
         } while (0)

! #define MatchText    MBMatchText
! #define MatchTextIC MBMatchTextIC
  #define do_like_escape    MB_do_like_escape

  #include "like_match.c"

- #undef CHAREQ
- #undef ICHAREQ
- #undef NextChar
- #undef CopyAdvChar
- #undef MatchText
- #undef MatchTextIC
- #undef do_like_escape
-
  /* Set up to compile like_match.c for single-byte characters */
! #define CHAREQ(p1, p2) (*(p1) == *(p2))
! #define ICHAREQ(p1, p2) (tolower((unsigned char) *(p1)) == tolower((unsigned char) *(p2)))
! #define NextChar(p, plen) ((p)++, (plen)--)
  #define CopyAdvChar(dst, src, srclen) (*(dst)++ = *(src)++, (srclen)--)

  #include "like_match.c"

- /* And some support for BYTEA */
- #define BYTEA_CHAREQ(p1, p2) (*(p1) == *(p2))
- #define BYTEA_NextChar(p, plen) ((p)++, (plen)--)
- #define BYTEA_CopyAdvChar(dst, src, srclen) (*(dst)++ = *(src)++, (srclen)--)


  /*
   *    interface routines called by the function manager
--- 90,170 ----
               *(dst)++ = *(src)++; \
         } while (0)

! #define MatchText    MB_MatchText
  #define do_like_escape    MB_do_like_escape

  #include "like_match.c"

  /* Set up to compile like_match.c for single-byte characters */
! #define CHAREQ(p1, p2) BYTEEQ(p1, p2)
! #define NextChar(p, plen) NextByte(p, plen)
  #define CopyAdvChar(dst, src, srclen) (*(dst)++ = *(src)++, (srclen)--)

+ #define MatchText    SB_MatchText
+ #define do_like_escape    SB_do_like_escape
+
+ #include "like_match.c"
+
+ /* set up to compile like_match.c for single byte case insensitive matching */
+
+ #define CHAREQ(p1, p2) (tolower((unsigned char) *(p1)) == tolower((unsigned char) *(p2)))
+ #define NextChar(p, plen) NextByte(p, plen)
+ #define CopyAdvChar(dst, src, srclen) (*(dst)++ = *(src)++, (srclen)--)
+
+ #define MatchText    SB_MatchTextIC
+
+ #include "like_match.c"
+
+ /* set up for UTF8 match optimisation */
+
+ #define CHAREQ(p1, p2) wchareq(p1, p2)
+ #define NextChar(p, plen) \
+     do { int __l = pg_mblen(p); (p) +=__l; (plen) -=__l; } while (0)
+ #define CopyAdvChar(dst, src, srclen) \
+     do { int __l = pg_mblen(src); \
+          (srclen) -= __l; \
+          while (__l-- > 0) \
+              *(dst)++ = *(src)++; \
+        } while (0)
+
+ #define MatchText    UTF8_MatchText
+ #define UTF8_OPT
+
  #include "like_match.c"


+ static __inline__ int
+ GenericMatchText(char *s, int slen, char* p, int plen)
+ {
+     if (pg_database_encoding_max_length() == 1)
+         return SB_MatchText(s, slen, p, plen);
+     else if (GetDatabaseEncoding() == PG_UTF8)
+         return UTF8_MatchText(s, slen, p, plen);
+     else
+         return MB_MatchText(s, slen, p, plen);
+ }
+
+ static __inline__ int
+ mbtexticlike(text *str, text *pat)
+ {
+     char       *s,
+                *p;
+     int            slen,
+                 plen;
+
+     /* Force inputs to lower case to achieve case insensitivity */
+     str = DatumGetTextP(DirectFunctionCall1(lower, PointerGetDatum(str)));
+     pat = DatumGetTextP(DirectFunctionCall1(lower, PointerGetDatum(pat)));
+     s = VARDATA(str);
+     slen = (VARSIZE(str) - VARHDRSZ);
+     p = VARDATA(pat);
+     plen = (VARSIZE(pat) - VARHDRSZ);
+
+     if (GetDatabaseEncoding() == PG_UTF8)
+         return UTF8_MatchText(s, slen, p, plen);
+     else
+         return MB_MatchText(s, slen, p, plen);
+ }

  /*
   *    interface routines called by the function manager
***************
*** 138,147 ****
      p = VARDATA(pat);
      plen = (VARSIZE(pat) - VARHDRSZ);

!     if (pg_database_encoding_max_length() == 1)
!         result = (MatchText(s, slen, p, plen) == LIKE_TRUE);
!     else
!         result = (MBMatchText(s, slen, p, plen) == LIKE_TRUE);

      PG_RETURN_BOOL(result);
  }
--- 186,192 ----
      p = VARDATA(pat);
      plen = (VARSIZE(pat) - VARHDRSZ);

!     result = (GenericMatchText(s, slen, p, plen) == LIKE_TRUE);

      PG_RETURN_BOOL(result);
  }
***************
*** 162,171 ****
      p = VARDATA(pat);
      plen = (VARSIZE(pat) - VARHDRSZ);

!     if (pg_database_encoding_max_length() == 1)
!         result = (MatchText(s, slen, p, plen) != LIKE_TRUE);
!     else
!         result = (MBMatchText(s, slen, p, plen) != LIKE_TRUE);

      PG_RETURN_BOOL(result);
  }
--- 207,213 ----
      p = VARDATA(pat);
      plen = (VARSIZE(pat) - VARHDRSZ);

!     result = (GenericMatchText(s, slen, p, plen) != LIKE_TRUE);

      PG_RETURN_BOOL(result);
  }
***************
*** 186,195 ****
      p = VARDATA(pat);
      plen = (VARSIZE(pat) - VARHDRSZ);

!     if (pg_database_encoding_max_length() == 1)
!         result = (MatchText(s, slen, p, plen) == LIKE_TRUE);
!     else
!         result = (MBMatchText(s, slen, p, plen) == LIKE_TRUE);

      PG_RETURN_BOOL(result);
  }
--- 228,234 ----
      p = VARDATA(pat);
      plen = (VARSIZE(pat) - VARHDRSZ);

!     result = (GenericMatchText(s, slen, p, plen) == LIKE_TRUE);

      PG_RETURN_BOOL(result);
  }
***************
*** 210,219 ****
      p = VARDATA(pat);
      plen = (VARSIZE(pat) - VARHDRSZ);

!     if (pg_database_encoding_max_length() == 1)
!         result = (MatchText(s, slen, p, plen) != LIKE_TRUE);
!     else
!         result = (MBMatchText(s, slen, p, plen) != LIKE_TRUE);

      PG_RETURN_BOOL(result);
  }
--- 249,255 ----
      p = VARDATA(pat);
      plen = (VARSIZE(pat) - VARHDRSZ);

!     result = (GenericMatchText(s, slen, p, plen) != LIKE_TRUE);

      PG_RETURN_BOOL(result);
  }
***************
*** 234,240 ****
      p = VARDATA(pat);
      plen = (VARSIZE(pat) - VARHDRSZ);

!     result = (MatchBytea(s, slen, p, plen) == LIKE_TRUE);

      PG_RETURN_BOOL(result);
  }
--- 270,276 ----
      p = VARDATA(pat);
      plen = (VARSIZE(pat) - VARHDRSZ);

!     result = (SB_MatchText(s, slen, p, plen) == LIKE_TRUE);

      PG_RETURN_BOOL(result);
  }
***************
*** 255,261 ****
      p = VARDATA(pat);
      plen = (VARSIZE(pat) - VARHDRSZ);

!     result = (MatchBytea(s, slen, p, plen) != LIKE_TRUE);

      PG_RETURN_BOOL(result);
  }
--- 291,297 ----
      p = VARDATA(pat);
      plen = (VARSIZE(pat) - VARHDRSZ);

!     result = (SB_MatchText(s, slen, p, plen) != LIKE_TRUE);

      PG_RETURN_BOOL(result);
  }
***************
*** 281,305 ****
          slen = strlen(s);
          p = VARDATA(pat);
          plen = (VARSIZE(pat) - VARHDRSZ);
!         result = (MatchTextIC(s, slen, p, plen) == LIKE_TRUE);
      }
      else
      {
-         /* Force inputs to lower case to achieve case insensitivity */
          text       *strtext;

          strtext = DatumGetTextP(DirectFunctionCall1(name_text,
                                                      NameGetDatum(str)));
!         strtext = DatumGetTextP(DirectFunctionCall1(lower,
!                                                   PointerGetDatum(strtext)));
!         pat = DatumGetTextP(DirectFunctionCall1(lower,
!                                                 PointerGetDatum(pat)));
!
!         s = VARDATA(strtext);
!         slen = (VARSIZE(strtext) - VARHDRSZ);
!         p = VARDATA(pat);
!         plen = (VARSIZE(pat) - VARHDRSZ);
!         result = (MBMatchTextIC(s, slen, p, plen) == LIKE_TRUE);
      }

      PG_RETURN_BOOL(result);
--- 317,331 ----
          slen = strlen(s);
          p = VARDATA(pat);
          plen = (VARSIZE(pat) - VARHDRSZ);
!         result = (SB_MatchTextIC(s, slen, p, plen) == LIKE_TRUE);
      }
      else
      {
          text       *strtext;

          strtext = DatumGetTextP(DirectFunctionCall1(name_text,
                                                      NameGetDatum(str)));
!         result = (mbtexticlike(strtext, pat) == LIKE_TRUE);
      }

      PG_RETURN_BOOL(result);
***************
*** 322,346 ****
          slen = strlen(s);
          p = VARDATA(pat);
          plen = (VARSIZE(pat) - VARHDRSZ);
!         result = (MatchTextIC(s, slen, p, plen) != LIKE_TRUE);
      }
      else
      {
-         /* Force inputs to lower case to achieve case insensitivity */
          text       *strtext;

          strtext = DatumGetTextP(DirectFunctionCall1(name_text,
                                                      NameGetDatum(str)));
!         strtext = DatumGetTextP(DirectFunctionCall1(lower,
!                                                   PointerGetDatum(strtext)));
!         pat = DatumGetTextP(DirectFunctionCall1(lower,
!                                                 PointerGetDatum(pat)));
!
!         s = VARDATA(strtext);
!         slen = (VARSIZE(strtext) - VARHDRSZ);
!         p = VARDATA(pat);
!         plen = (VARSIZE(pat) - VARHDRSZ);
!         result = (MBMatchTextIC(s, slen, p, plen) != LIKE_TRUE);
      }

      PG_RETURN_BOOL(result);
--- 348,362 ----
          slen = strlen(s);
          p = VARDATA(pat);
          plen = (VARSIZE(pat) - VARHDRSZ);
!         result = (SB_MatchTextIC(s, slen, p, plen) != LIKE_TRUE);
      }
      else
      {
          text       *strtext;

          strtext = DatumGetTextP(DirectFunctionCall1(name_text,
                                                      NameGetDatum(str)));
!         result = (mbtexticlike(strtext, pat) != LIKE_TRUE);
      }

      PG_RETURN_BOOL(result);
***************
*** 363,383 ****
          slen = (VARSIZE(str) - VARHDRSZ);
          p = VARDATA(pat);
          plen = (VARSIZE(pat) - VARHDRSZ);
!         result = (MatchTextIC(s, slen, p, plen) == LIKE_TRUE);
      }
      else
!     {
!         /* Force inputs to lower case to achieve case insensitivity */
!         str = DatumGetTextP(DirectFunctionCall1(lower,
!                                                 PointerGetDatum(str)));
!         pat = DatumGetTextP(DirectFunctionCall1(lower,
!                                                 PointerGetDatum(pat)));
!         s = VARDATA(str);
!         slen = (VARSIZE(str) - VARHDRSZ);
!         p = VARDATA(pat);
!         plen = (VARSIZE(pat) - VARHDRSZ);
!         result = (MBMatchTextIC(s, slen, p, plen) == LIKE_TRUE);
!     }

      PG_RETURN_BOOL(result);
  }
--- 379,388 ----
          slen = (VARSIZE(str) - VARHDRSZ);
          p = VARDATA(pat);
          plen = (VARSIZE(pat) - VARHDRSZ);
!         result = (SB_MatchTextIC(s, slen, p, plen) == LIKE_TRUE);
      }
      else
!         result = (mbtexticlike(str, pat) == LIKE_TRUE);

      PG_RETURN_BOOL(result);
  }
***************
*** 399,419 ****
          slen = (VARSIZE(str) - VARHDRSZ);
          p = VARDATA(pat);
          plen = (VARSIZE(pat) - VARHDRSZ);
!         result = (MatchTextIC(s, slen, p, plen) != LIKE_TRUE);
      }
      else
!     {
!         /* Force inputs to lower case to achieve case insensitivity */
!         str = DatumGetTextP(DirectFunctionCall1(lower,
!                                                 PointerGetDatum(str)));
!         pat = DatumGetTextP(DirectFunctionCall1(lower,
!                                                 PointerGetDatum(pat)));
!         s = VARDATA(str);
!         slen = (VARSIZE(str) - VARHDRSZ);
!         p = VARDATA(pat);
!         plen = (VARSIZE(pat) - VARHDRSZ);
!         result = (MBMatchTextIC(s, slen, p, plen) != LIKE_TRUE);
!     }

      PG_RETURN_BOOL(result);
  }
--- 404,413 ----
          slen = (VARSIZE(str) - VARHDRSZ);
          p = VARDATA(pat);
          plen = (VARSIZE(pat) - VARHDRSZ);
!         result = (SB_MatchTextIC(s, slen, p, plen) != LIKE_TRUE);
      }
      else
!         result = (mbtexticlike(str, pat) != LIKE_TRUE);

      PG_RETURN_BOOL(result);
  }
***************
*** 430,436 ****
      text       *result;

      if (pg_database_encoding_max_length() == 1)
!         result = do_like_escape(pat, esc);
      else
          result = MB_do_like_escape(pat, esc);

--- 424,430 ----
      text       *result;

      if (pg_database_encoding_max_length() == 1)
!         result = SB_do_like_escape(pat, esc);
      else
          result = MB_do_like_escape(pat, esc);

***************
*** 446,624 ****
  {
      bytea       *pat = PG_GETARG_BYTEA_P(0);
      bytea       *esc = PG_GETARG_BYTEA_P(1);
!     bytea       *result;
!     char       *p,
!                *e,
!                *r;
!     int            plen,
!                 elen;
!     bool        afterescape;
!
!     p = VARDATA(pat);
!     plen = (VARSIZE(pat) - VARHDRSZ);
!     e = VARDATA(esc);
!     elen = (VARSIZE(esc) - VARHDRSZ);
!
!     /*
!      * Worst-case pattern growth is 2x --- unlikely, but it's hardly worth
!      * trying to calculate the size more accurately than that.
!      */
!     result = (text *) palloc(plen * 2 + VARHDRSZ);
!     r = VARDATA(result);
!
!     if (elen == 0)
!     {
!         /*
!          * No escape character is wanted.  Double any backslashes in the
!          * pattern to make them act like ordinary characters.
!          */
!         while (plen > 0)
!         {
!             if (*p == '\\')
!                 *r++ = '\\';
!             BYTEA_CopyAdvChar(r, p, plen);
!         }
!     }
!     else
!     {
!         /*
!          * The specified escape must be only a single character.
!          */
!         BYTEA_NextChar(e, elen);
!         if (elen != 0)
!             ereport(ERROR,
!                     (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
!                      errmsg("invalid escape string"),
!                   errhint("Escape string must be empty or one character.")));
!
!         e = VARDATA(esc);
!
!         /*
!          * If specified escape is '\', just copy the pattern as-is.
!          */
!         if (*e == '\\')
!         {
!             memcpy(result, pat, VARSIZE(pat));
!             PG_RETURN_BYTEA_P(result);
!         }
!
!         /*
!          * Otherwise, convert occurrences of the specified escape character to
!          * '\', and double occurrences of '\' --- unless they immediately
!          * follow an escape character!
!          */
!         afterescape = false;
!         while (plen > 0)
!         {
!             if (BYTEA_CHAREQ(p, e) && !afterescape)
!             {
!                 *r++ = '\\';
!                 BYTEA_NextChar(p, plen);
!                 afterescape = true;
!             }
!             else if (*p == '\\')
!             {
!                 *r++ = '\\';
!                 if (!afterescape)
!                     *r++ = '\\';
!                 BYTEA_NextChar(p, plen);
!                 afterescape = false;
!             }
!             else
!             {
!                 BYTEA_CopyAdvChar(r, p, plen);
!                 afterescape = false;
!             }
!         }
!     }
!
!     SET_VARSIZE(result, r - ((char *) result));

!     PG_RETURN_BYTEA_P(result);
  }

- /*
-  * Same as above, but specifically for bytea (binary) datatype
-  */
- static int
- MatchBytea(char *t, int tlen, char *p, int plen)
- {
-     /* Fast path for match-everything pattern */
-     if ((plen == 1) && (*p == '%'))
-         return LIKE_TRUE;
-
-     while ((tlen > 0) && (plen > 0))
-     {
-         if (*p == '\\')
-         {
-             /* Next pattern char must match literally, whatever it is */
-             BYTEA_NextChar(p, plen);
-             if ((plen <= 0) || !BYTEA_CHAREQ(t, p))
-                 return LIKE_FALSE;
-         }
-         else if (*p == '%')
-         {
-             /* %% is the same as % according to the SQL standard */
-             /* Advance past all %'s */
-             while ((plen > 0) && (*p == '%'))
-                 BYTEA_NextChar(p, plen);
-             /* Trailing percent matches everything. */
-             if (plen <= 0)
-                 return LIKE_TRUE;
-
-             /*
-              * Otherwise, scan for a text position at which we can match the
-              * rest of the pattern.
-              */
-             while (tlen > 0)
-             {
-                 /*
-                  * Optimization to prevent most recursion: don't recurse
-                  * unless first pattern char might match this text char.
-                  */
-                 if (BYTEA_CHAREQ(t, p) || (*p == '\\') || (*p == '_'))
-                 {
-                     int            matched = MatchBytea(t, tlen, p, plen);
-
-                     if (matched != LIKE_FALSE)
-                         return matched; /* TRUE or ABORT */
-                 }
-
-                 BYTEA_NextChar(t, tlen);
-             }
-
-             /*
-              * End of text with no match, so no point in trying later places
-              * to start matching this pattern.
-              */
-             return LIKE_ABORT;
-         }
-         else if ((*p != '_') && !BYTEA_CHAREQ(t, p))
-         {
-             /*
-              * Not the single-character wildcard and no explicit match? Then
-              * time to quit...
-              */
-             return LIKE_FALSE;
-         }
-
-         BYTEA_NextChar(t, tlen);
-         BYTEA_NextChar(p, plen);
-     }
-
-     if (tlen > 0)
-         return LIKE_FALSE;        /* end of pattern, but not of text */
-
-     /* End of input string.  Do we have matching pattern remaining? */
-     while ((plen > 0) && (*p == '%'))    /* allow multiple %'s at end of
-                                          * pattern */
-         BYTEA_NextChar(p, plen);
-     if (plen <= 0)
-         return LIKE_TRUE;
-
-     /*
-      * End of text with no match, so no point in trying later places to start
-      * matching this pattern.
-      */
-     return LIKE_ABORT;
- }    /* MatchBytea() */
--- 440,447 ----
  {
      bytea       *pat = PG_GETARG_BYTEA_P(0);
      bytea       *esc = PG_GETARG_BYTEA_P(1);
!     bytea       *result = SB_do_like_escape((text *)pat, (text *)esc);

!     PG_RETURN_BYTEA_P((bytea *)result);
  }

Index: src/backend/utils/adt/like_match.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/utils/adt/like_match.c,v
retrieving revision 1.15
diff -c -r1.15 like_match.c
*** src/backend/utils/adt/like_match.c    27 Feb 2007 23:48:08 -0000    1.15
--- src/backend/utils/adt/like_match.c    18 May 2007 02:47:41 -0000
***************
*** 9,19 ****
   * Before the inclusion, we need to define following macros:
   *
   * CHAREQ
-  * ICHAREQ
   * NextChar
   * CopyAdvChar
   * MatchText (MBMatchText)
-  * MatchTextIC (MBMatchTextIC)
   * do_like_escape (MB_do_like_escape)
   *
   * Copyright (c) 1996-2007, PostgreSQL Global Development Group
--- 9,17 ----
***************
*** 82,88 ****
          if (*p == '\\')
          {
              /* Next pattern char must match literally, whatever it is */
!             NextChar(p, plen);
              if ((plen <= 0) || !CHAREQ(t, p))
                  return LIKE_FALSE;
          }
--- 80,86 ----
          if (*p == '\\')
          {
              /* Next pattern char must match literally, whatever it is */
!             NextByte(p, plen);
              if ((plen <= 0) || !CHAREQ(t, p))
                  return LIKE_FALSE;
          }
***************
*** 91,97 ****
              /* %% is the same as % according to the SQL standard */
              /* Advance past all %'s */
              while ((plen > 0) && (*p == '%'))
!                 NextChar(p, plen);
              /* Trailing percent matches everything. */
              if (plen <= 0)
                  return LIKE_TRUE;
--- 89,95 ----
              /* %% is the same as % according to the SQL standard */
              /* Advance past all %'s */
              while ((plen > 0) && (*p == '%'))
!                 NextByte(p, plen);
              /* Trailing percent matches everything. */
              if (plen <= 0)
                  return LIKE_TRUE;
***************
*** 123,129 ****
               */
              return LIKE_ABORT;
          }
!         else if ((*p != '_') && !CHAREQ(t, p))
          {
              /*
               * Not the single-character wildcard and no explicit match? Then
--- 121,146 ----
               */
              return LIKE_ABORT;
          }
! #ifdef UTF8_OPT
!         /*
!          * UTF8 is optimised to do byte at a time matching in most cases,
!          * thus saving expensive calls to NextChar.
!          *
!          * UTF8 has disjoint representations for first-bytes and
!          * not-first-bytes of MB characters, and thus it is
!          * impossible to make a false match in which an MB pattern
!          * character is matched to the end of one data character
!          * plus the start of another.
!          * In character sets without that property, we have to use the
!          * slow way to ensure we don't make out-of-sync matches.
!          */
!         else if (*p == '_')
!         {
!             NextChar(t, tlen);
!             NextByte(p, plen);
!             continue;
!         }
!         else if (!BYTEEQ(t, p))
          {
              /*
               * Not the single-character wildcard and no explicit match? Then
***************
*** 132,215 ****
              return LIKE_FALSE;
          }

!         NextChar(t, tlen);
!         NextChar(p, plen);
!     }
!
!     if (tlen > 0)
!         return LIKE_FALSE;        /* end of pattern, but not of text */
!
!     /* End of input string.  Do we have matching pattern remaining? */
!     while ((plen > 0) && (*p == '%'))    /* allow multiple %'s at end of
!                                          * pattern */
!         NextChar(p, plen);
!     if (plen <= 0)
!         return LIKE_TRUE;
!
!     /*
!      * End of text with no match, so no point in trying later places to start
!      * matching this pattern.
!      */
!     return LIKE_ABORT;
! }    /* MatchText() */
!
! /*
!  * Same as above, but ignore case
!  */
! static int
! MatchTextIC(char *t, int tlen, char *p, int plen)
! {
!     /* Fast path for match-everything pattern */
!     if ((plen == 1) && (*p == '%'))
!         return LIKE_TRUE;
!
!     while ((tlen > 0) && (plen > 0))
!     {
!         if (*p == '\\')
!         {
!             /* Next pattern char must match literally, whatever it is */
!             NextChar(p, plen);
!             if ((plen <= 0) || !ICHAREQ(t, p))
!                 return LIKE_FALSE;
!         }
!         else if (*p == '%')
!         {
!             /* %% is the same as % according to the SQL standard */
!             /* Advance past all %'s */
!             while ((plen > 0) && (*p == '%'))
!                 NextChar(p, plen);
!             /* Trailing percent matches everything. */
!             if (plen <= 0)
!                 return LIKE_TRUE;
!
!             /*
!              * Otherwise, scan for a text position at which we can match the
!              * rest of the pattern.
!              */
!             while (tlen > 0)
!             {
!                 /*
!                  * Optimization to prevent most recursion: don't recurse
!                  * unless first pattern char might match this text char.
!                  */
!                 if (ICHAREQ(t, p) || (*p == '\\') || (*p == '_'))
!                 {
!                     int            matched = MatchTextIC(t, tlen, p, plen);
!
!                     if (matched != LIKE_FALSE)
!                         return matched; /* TRUE or ABORT */
!                 }
!
!                 NextChar(t, tlen);
!             }
!
!             /*
!              * End of text with no match, so no point in trying later places
!              * to start matching this pattern.
!              */
!             return LIKE_ABORT;
!         }
!         else if ((*p != '_') && !ICHAREQ(t, p))
          {
              /*
               * Not the single-character wildcard and no explicit match? Then
--- 149,163 ----
              return LIKE_FALSE;
          }

!         NextByte(t, tlen);
!         NextByte(p, plen);
! #else
!         /*
!          * Branch for non-utf8 multi-byte charsets and also for single-byte
+          * charsets which don't gain any benefit from the above optimisation.
!          */
!
!         else if ((*p != '_') && !CHAREQ(t, p))
          {
              /*
               * Not the single-character wildcard and no explicit match? Then
***************
*** 220,225 ****
--- 168,175 ----

          NextChar(t, tlen);
          NextChar(p, plen);
+
+ #endif /* UTF8_OPT */
      }

      if (tlen > 0)
***************
*** 228,234 ****
      /* End of input string.  Do we have matching pattern remaining? */
      while ((plen > 0) && (*p == '%'))    /* allow multiple %'s at end of
                                           * pattern */
!         NextChar(p, plen);
      if (plen <= 0)
          return LIKE_TRUE;

--- 178,184 ----
      /* End of input string.  Do we have matching pattern remaining? */
      while ((plen > 0) && (*p == '%'))    /* allow multiple %'s at end of
                                           * pattern */
!         NextByte(p, plen);
      if (plen <= 0)
          return LIKE_TRUE;

***************
*** 237,248 ****
       * matching this pattern.
       */
      return LIKE_ABORT;
! }    /* MatchTextIC() */

  /*
   * like_escape() --- given a pattern and an ESCAPE string,
   * convert the pattern to use Postgres' standard backslash escape convention.
   */
  static text *
  do_like_escape(text *pat, text *esc)
  {
--- 187,200 ----
       * matching this pattern.
       */
      return LIKE_ABORT;
! }    /* MatchText() */

  /*
   * like_escape() --- given a pattern and an ESCAPE string,
   * convert the pattern to use Postgres' standard backslash escape convention.
   */
+ #ifdef do_like_escape
+
  static text *
  do_like_escape(text *pat, text *esc)
  {
***************
*** 336,338 ****
--- 288,304 ----

      return result;
  }
+ #endif /* do_like_escape */
+
+ #undef CHAREQ
+ #undef NextChar
+ #undef CopyAdvChar
+ #undef MatchText
+
+ #ifdef do_like_escape
+ #undef do_like_escape
+ #endif
+
+ #ifdef UTF8_OPT
+ #undef UTF8_OPT
+ #endif
pgsql-patches by date:
From: Tom Lane
Date: 17 May 2007, 23:35:49
Subject: Re: UTF8MatchText
From: Tom Lane
Date: 18 May 2007, 00:32:34
Subject: Re: UTF8MatchText
Re: UTF8MatchText - Mailing list pgsql-patches

Previous

Next