Re: BUG #2895: Private Use Unicode character crashes server when using ILIKE - Mailing list pgsql-bugs

From Tom Lane
Subject Re: BUG #2895: Private Use Unicode character crashes server when using ILIKE
Date
Msg-id 2876.1169658904@sss.pgh.pa.us
Whole thread Raw
In response to Re: BUG #2895: Private Use Unicode character crashes server when using ILIKE  (Tom Lane <tgl@sss.pgh.pa.us>)
List pgsql-bugs
I wrote:
> Nonetheless, the code is certainly giving wrong answers for 4-byte
> characters.  Will go fix...

I've applied the attached patch for 8.1, and related patches in all
supported branches.

            regards, tom lane


Index: wchar.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/utils/mb/wchar.c,v
retrieving revision 1.47.2.4
diff -c -r1.47.2.4 wchar.c
*** wchar.c    22 Aug 2006 12:11:38 -0000    1.47.2.4
--- wchar.c    24 Jan 2007 16:16:27 -0000
***************
*** 345,362 ****
  }

  /*
!  * convert UTF8 string to pg_wchar (UCS-2)
!  * caller should allocate enough space for "to"
   * len: length of from.
   * "from" not necessarily null terminated.
   */
  static int
  pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
  {
-     unsigned char c1,
-                 c2,
-                 c3;
      int            cnt = 0;

      while (len > 0 && *from)
      {
--- 345,363 ----
  }

  /*
!  * convert UTF8 string to pg_wchar (UCS-4)
!  * caller must allocate enough space for "to", including a trailing zero!
   * len: length of from.
   * "from" not necessarily null terminated.
   */
  static int
  pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
  {
      int            cnt = 0;
+     uint32        c1,
+                 c2,
+                 c3,
+                 c4;

      while (len > 0 && *from)
      {
***************
*** 365,390 ****
              *to = *from++;
              len--;
          }
!         else if ((*from & 0xe0) == 0xc0 && len >= 2)
          {
              c1 = *from++ & 0x1f;
              c2 = *from++ & 0x3f;
!             *to = c1 << 6;
!             *to |= c2;
              len -= 2;
          }
!         else if ((*from & 0xe0) == 0xe0 && len >= 3)
          {
              c1 = *from++ & 0x0f;
              c2 = *from++ & 0x3f;
              c3 = *from++ & 0x3f;
!             *to = c1 << 12;
!             *to |= c2 << 6;
!             *to |= c3;
              len -= 3;
          }
          else
          {
              *to = *from++;
              len--;
          }
--- 366,404 ----
              *to = *from++;
              len--;
          }
!         else if ((*from & 0xe0) == 0xc0)
          {
+             if (len < 2)
+                 break;            /* drop trailing incomplete char */
              c1 = *from++ & 0x1f;
              c2 = *from++ & 0x3f;
!             *to = (c1 << 6) | c2;
              len -= 2;
          }
!         else if ((*from & 0xf0) == 0xe0)
          {
+             if (len < 3)
+                 break;            /* drop trailing incomplete char */
              c1 = *from++ & 0x0f;
              c2 = *from++ & 0x3f;
              c3 = *from++ & 0x3f;
!             *to = (c1 << 12) | (c2 << 6) | c3;
              len -= 3;
          }
+         else if ((*from & 0xf8) == 0xf0)
+         {
+             if (len < 4)
+                 break;            /* drop trailing incomplete char */
+             c1 = *from++ & 0x07;
+             c2 = *from++ & 0x3f;
+             c3 = *from++ & 0x3f;
+             c4 = *from++ & 0x3f;
+             *to = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4;
+             len -= 4;
+         }
          else
          {
+             /* treat a bogus char as length 1; not ours to raise error */
              *to = *from++;
              len--;
          }
***************
*** 396,407 ****
  }

  /*
!  * returns the byte length of a UTF8 character pointed to by s
   */
  int
  pg_utf_mblen(const unsigned char *s)
  {
!     int            len = 1;

      if ((*s & 0x80) == 0)
          len = 1;
--- 410,429 ----
  }

  /*
!  * Return the byte length of a UTF8 character pointed to by s
!  *
!  * Note: in the current implementation we do not support UTF8 sequences
!  * of more than 4 bytes; hence do NOT return a value larger than 4.
!  * We return "1" for any leading byte that is either flat-out illegal or
!  * indicates a length larger than we support.
!  *
!  * pg_utf2wchar_with_len(), utf2ucs(), pg_utf8_islegal(), and perhaps
!  * other places would need to be fixed to change this.
   */
  int
  pg_utf_mblen(const unsigned char *s)
  {
!     int            len;

      if ((*s & 0x80) == 0)
          len = 1;
***************
*** 411,421 ****
          len = 3;
      else if ((*s & 0xf8) == 0xf0)
          len = 4;
      else if ((*s & 0xfc) == 0xf8)
          len = 5;
      else if ((*s & 0xfe) == 0xfc)
          len = 6;
!     return (len);
  }

  static int
--- 433,447 ----
          len = 3;
      else if ((*s & 0xf8) == 0xf0)
          len = 4;
+ #ifdef NOT_USED
      else if ((*s & 0xfc) == 0xf8)
          len = 5;
      else if ((*s & 0xfe) == 0xfc)
          len = 6;
! #endif
!     else
!         len = 1;
!     return len;
  }

  static int

pgsql-bugs by date:

Previous
From: Jeff Trout
Date:
Subject: Function returns wrong data after datatype change
Next
From: Tom Lane
Date:
Subject: Re: Function returns wrong data after datatype change