Thread: Unicode characters above 0x10000 #2

Unicode characters above 0x10000 #2

From

John Hansen

Date:

21 November 2004, 07:56:02

Hello,

Seeing that the limit is still in place, I have attached a patch against CVS.


Kind Regards,

John Hansen

Attachment

unicode.diff

Re: Unicode characters above 0x10000 #2

From

"John Hansen"

Date:

21 November 2004, 09:19:48

Updated patch. Disregard the old one; it broke UCS-2.


... John

Attachment

cvs.diff

Re: Unicode characters above 0x10000 #2

From

"John Hansen"

Date:

21 November 2004, 10:50:26

3 times lucky?

Last one broke utf8.... Grrrr

This one works,.... Too tired, sorry for the inconvenience..

... John

Attachment

cvs.diff

Re: Unicode characters above 0x10000 #2

From

Bruce Momjian

Date:

02 December 2004, 22:37:29

Patch applied.  Thanks.

---------------------------------------------------------------------------


John Hansen wrote:
> 3 times lucky?
>
> Last one broke utf8.... Grrrr
>
> This one works,.... Too tired, sorry for the inconvenience..
>
> ... John

Content-Description: cvs.diff

[ Attachment, skipping... ]

>
> ---------------------------(end of broadcast)---------------------------
> TIP 9: the planner will ignore your desire to choose an index scan if your
>       joining column's datatypes do not match

--
  Bruce Momjian                        |  http://candle.pha.pa.us
  pgman@candle.pha.pa.us               |  (610) 359-1001
  +  If your life is a hard drive,     |  13 Roberts Road
  +  Christ can be your backup.        |  Newtown Square, Pennsylvania 19073

Re: Unicode characters above 0x10000 #2

From

Bruce Momjian

Date:

03 December 2004, 01:21:53

I have backed out this patch.  It is unclear whether it is a bug fix.

It will be saved for 8.1.

---------------------------------------------------------------------------

pgman wrote:
>
> Patch applied.  Thanks.
>
> ---------------------------------------------------------------------------
>
>
> John Hansen wrote:
> > 3 times lucky?
> >
> > Last one broke utf8.... Grrrr
> >
> > This one works,.... Too tired, sorry for the inconvenience..
> >
> > ... John
>
> Content-Description: cvs.diff
>
> [ Attachment, skipping... ]
>
> >
> > ---------------------------(end of broadcast)---------------------------
> > TIP 9: the planner will ignore your desire to choose an index scan if your
> >       joining column's datatypes do not match
>
> --
>   Bruce Momjian                        |  http://candle.pha.pa.us
>   pgman@candle.pha.pa.us               |  (610) 359-1001
>   +  If your life is a hard drive,     |  13 Roberts Road
>   +  Christ can be your backup.        |  Newtown Square, Pennsylvania 19073

--
  Bruce Momjian                        |  http://candle.pha.pa.us
  pgman@candle.pha.pa.us               |  (610) 359-1001
  +  If your life is a hard drive,     |  13 Roberts Road
  +  Christ can be your backup.        |  Newtown Square, Pennsylvania 19073
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/utils/mb/wchar.c,v
retrieving revision 1.38
diff -c -r1.38 wchar.c
*** src/backend/utils/mb/wchar.c    17 Sep 2004 21:59:57 -0000    1.38
--- src/backend/utils/mb/wchar.c    21 Nov 2004 09:58:36 -0000
***************
*** 343,348 ****
--- 343,373 ----
      return (pg_euc_dsplen(s));
  }

+ bool isLegalUTF8(const UTF8 *source, int len) {
+         UTF8 a;
+         const UTF8 *srcptr = source+len;
+         if(!source || (pg_utf_mblen(source) != len)) return false;
+         switch (len) {
+             default: return false;
+             /* Everything else falls through when "true"... */
+             case 6: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
+             case 5: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
+             case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
+             case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
+             case 2: if ((a = (*--srcptr)) > 0xBF) return false;
+             switch (*source) {
+                     /* no fall-through in this inner switch */
+                     case 0xE0: if (a < 0xA0) return false; break;
+                     case 0xF0: if (a < 0x90) return false; break;
+                     case 0xF4: if (a > 0x8F) return false; break;
+                     default:  if (a < 0x80) return false;
+             }
+             case 1: if (*source >= 0x80 && *source < 0xC2) return false;
+             if (*source > 0xFD) return false;
+         }
+         return true;
+ }
+
  /*
   * convert UTF-8 string to pg_wchar (UCS-2)
   * caller should allocate enough space for "to"
***************
*** 398,404 ****
   * returns the byte length of a UTF-8 word pointed to by s
   */
  int
! pg_utf_mblen(const unsigned char *s)
  {
      int            len = 1;

--- 423,429 ----
   * returns the byte length of a UTF-8 word pointed to by s
   */
  int
! pg_utf_mblen(const UTF8 *s)
  {
      int            len = 1;

***************
*** 406,418 ****
          len = 1;
      else if ((*s & 0xe0) == 0xc0)
          len = 2;
!     else if ((*s & 0xe0) == 0xe0)
!         len = 3;
      return (len);
  }

  static int
! pg_utf_dsplen(const unsigned char *s)
  {
      return 1;                    /* XXX fix me! */
  }
--- 431,449 ----
          len = 1;
      else if ((*s & 0xe0) == 0xc0)
          len = 2;
!         else if ((*s & 0xf0) == 0xe0)
!                 len = 3;
!         else if ((*s & 0xf8) == 0xf0)
!                 len = 4;
!         else if ((*s & 0xfc) == 0xf8)
!                 len = 5;
!         else if ((*s & 0xfe) == 0xfc)
!                 len = 6;
      return (len);
  }

  static int
! pg_utf_dsplen(const UTF8 *s)
  {
      return 1;                    /* XXX fix me! */
  }
***************
*** 721,728 ****
      {pg_euckr2wchar_with_len, pg_euckr_mblen, pg_euckr_dsplen, 3},        /* 3; PG_EUC_KR */
      {pg_euctw2wchar_with_len, pg_euctw_mblen, pg_euctw_dsplen, 3},        /* 4; PG_EUC_TW */
      {pg_johab2wchar_with_len, pg_johab_mblen, pg_johab_dsplen, 3},        /* 5; PG_JOHAB */
!     {pg_utf2wchar_with_len, pg_utf_mblen, pg_utf_dsplen, 3},    /* 6; PG_UNICODE */
!     {pg_mule2wchar_with_len, pg_mule_mblen, pg_mule_dsplen, 3}, /* 7; PG_MULE_INTERNAL */
      {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},    /* 8; PG_LATIN1 */
      {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},    /* 9; PG_LATIN2 */
      {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},    /* 10; PG_LATIN3 */
--- 752,759 ----
      {pg_euckr2wchar_with_len, pg_euckr_mblen, pg_euckr_dsplen, 3},        /* 3; PG_EUC_KR */
      {pg_euctw2wchar_with_len, pg_euctw_mblen, pg_euctw_dsplen, 3},        /* 4; PG_EUC_TW */
      {pg_johab2wchar_with_len, pg_johab_mblen, pg_johab_dsplen, 3},        /* 5; PG_JOHAB */
!     {pg_utf2wchar_with_len, pg_utf_mblen, pg_utf_dsplen, 6},        /* 6; PG_UNICODE */
!     {pg_mule2wchar_with_len, pg_mule_mblen, pg_mule_dsplen, 3},        /* 7; PG_MULE_INTERNAL */
      {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},    /* 8; PG_LATIN1 */
      {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},    /* 9; PG_LATIN2 */
      {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},    /* 10; PG_LATIN3 */
***************
*** 744,754 ****
      {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},    /* 26; ISO-8859-7 */
      {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},    /* 27; ISO-8859-8 */
      {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},    /* 28; PG_WIN1250 */
!     {0, pg_sjis_mblen, pg_sjis_dsplen, 2},        /* 29; PG_SJIS */
!     {0, pg_big5_mblen, pg_big5_dsplen, 2},        /* 30; PG_BIG5 */
!     {0, pg_gbk_mblen, pg_gbk_dsplen, 2},        /* 31; PG_GBK */
!     {0, pg_uhc_mblen, pg_uhc_dsplen, 2},        /* 32; PG_UHC */
!     {0, pg_gb18030_mblen, pg_gb18030_dsplen, 2} /* 33; PG_GB18030 */
  };

  /* returns the byte length of a word for mule internal code */
--- 775,785 ----
      {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},    /* 26; ISO-8859-7 */
      {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},    /* 27; ISO-8859-8 */
      {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},    /* 28; PG_WIN1250 */
!     {0, pg_sjis_mblen, pg_sjis_dsplen, 2},                    /* 29; PG_SJIS */
!     {0, pg_big5_mblen, pg_big5_dsplen, 2},                    /* 30; PG_BIG5 */
!     {0, pg_gbk_mblen, pg_gbk_dsplen, 2},                    /* 31; PG_GBK */
!     {0, pg_uhc_mblen, pg_uhc_dsplen, 2},                    /* 32; PG_UHC */
!     {0, pg_gb18030_mblen, pg_gb18030_dsplen, 2}                /* 33; PG_GB18030 */
  };

  /* returns the byte length of a word for mule internal code */
***************
*** 822,872 ****

      while (len > 0 && *mbstr)
      {
-         /* special UTF-8 check */
-         if (encoding == PG_UTF8 && (*mbstr & 0xf8) == 0xf0)
-         {
-             if (noError)
-                 return false;
-             ereport(ERROR,
-                     (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
-                      errmsg("Unicode characters greater than or equal to 0x10000 are not supported")));
-         }
-
          l = pg_mblen(mbstr);

!         for (i = 1; i < l; i++)
!         {
!             /*
!              * we expect that every multibyte char consists of bytes
!              * having the 8th bit set
!              */
!             if (i >= len || (mbstr[i] & 0x80) == 0)
              {
!                 char        buf[8 * 2 + 1];
!                 char       *p = buf;
!                 int            j,
                              jlimit;

!                 if (noError)
!                     return false;

!                 jlimit = Min(l, len);
!                 jlimit = Min(jlimit, 8);        /* prevent buffer overrun */

!                 for (j = 0; j < jlimit; j++)
!                     p += sprintf(p, "%02x", mbstr[j]);

!                 ereport(ERROR,
!                         (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
!                 errmsg("invalid byte sequence for encoding \"%s\": 0x%s",
!                        GetDatabaseEncodingName(), buf)));
              }
-         }

          len -= l;
          mbstr += l;
      }
-
      return true;
  }

--- 853,900 ----

      while (len > 0 && *mbstr)
      {
          l = pg_mblen(mbstr);

!         /* special UTF-8 check */
!         if (encoding == PG_UTF8) {
!             if(!isLegalUTF8(mbstr,l)) {
!                 if (noError) return false;
!                 ereport(ERROR,(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),errmsg("Invalid UNICODE byte sequence detected near character %c",*mbstr)));
!             }
!         } else {
!             for (i = 1; i < l; i++)
              {
!                 /*
!                  * we expect that every multibyte char consists of bytes
!                  * having the 8th bit set
!                  */
!                 if (i >= len || (mbstr[i] & 0x80) == 0)
!                 {
!                     char        buf[8 * 2 + 1];
!                     char       *p = buf;
!                     int            j,
                              jlimit;

!                     if (noError)
!                         return false;

!                     jlimit = Min(l, len);
!                     jlimit = Min(jlimit, 8);        /* prevent buffer overrun */

!                     for (j = 0; j < jlimit; j++)
!                         p += sprintf(p, "%02x", mbstr[j]);

!                     ereport(ERROR,
!                             (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
!                     errmsg("invalid byte sequence for encoding \"%s\": 0x%s",
!                         GetDatabaseEncodingName(), buf)));
!                 }
              }

+         }
          len -= l;
          mbstr += l;
      }
      return true;
  }

Index: src/include/mb/pg_wchar.h
===================================================================
RCS file: /projects/cvsroot/pgsql/src/include/mb/pg_wchar.h,v
retrieving revision 1.52
diff -c -r1.52 pg_wchar.h
*** src/include/mb/pg_wchar.h    17 Sep 2004 21:59:57 -0000    1.52
--- src/include/mb/pg_wchar.h    21 Nov 2004 09:58:36 -0000
***************
*** 17,22 ****
--- 17,30 ----
   */
  typedef unsigned int pg_wchar;

+
+ /*
+  * The UTF types
+  */
+ typedef unsigned int    UTF32;  /* at least 32 bits */
+ typedef unsigned short    UTF16;  /* at least 16 bits */
+ typedef unsigned char    UTF8;   /* typically 8 bits */
+
  /*
   * various definitions for EUC
   */
***************
*** 339,342 ****
--- 347,352 ----
  extern void latin2mic_with_table(unsigned char *l, unsigned char *p, int len, int lc, unsigned char *tab);
  extern void mic2latin_with_table(unsigned char *mic, unsigned char *p, int len, int lc, unsigned char *tab);

+ extern bool isLegalUTF8(const UTF8 *source, int len);
+
  #endif   /* PG_WCHAR_H */

Re: Unicode characters above 0x10000 #2

From

"John Hansen"

Date:

05 June 2005, 01:51:53

Bruce,

Attached patch replaces the original, applied today against CVS HEAD.
Fixes the surrogates, and limits to 4 byte utf8 as per spec.

Also extends UtfToLocal to 4 byte characters (though it does not add any;
it just enables the code to handle them). If my interpretation of this code
is wrong, please let me know, and correct it.

... John

> -----Original Message-----
> From: Bruce Momjian [mailto:pgman@candle.pha.pa.us]
> Sent: Sunday, June 05, 2005 11:23 AM
> To: pgman@candle.pha.pa.us
> Cc: John Hansen; pgsql-hackers@postgresql.org; PostgreSQL-patches
> Subject: Re: [PATCHES] Unicode characters above 0x10000 #2
>
>
> Your patch has been added to the PostgreSQL unapplied patches list at:
>
>     http://momjian.postgresql.org/cgi-bin/pgpatches
>
> It will be applied as soon as one of the PostgreSQL
> committers reviews and approves it.
>
> --------------------------------------------------------------
> -------------
>
>
> pgman wrote:
> >
> > I have backed out this patch.  It is unclear it is a bug fix.
> >
> > It will be saved for 8.1.
> >
> >
> ----------------------------------------------------------------------
> > -----
> >
> > pgman wrote:
> > >
> > > Patch applied.  Thanks.
> > >
> > >
> --------------------------------------------------------------------
> > > -------
> > >
> > >
> > > John Hansen wrote:
> > > > 3 times lucky?
> > > >
> > > > Last one broke utf8.... Grrrr
> > > >
> > > > This one works,.... Too tired, sorry for the inconvenience..
> > > >
> > > > ... John
> > >
> > > Content-Description: cvs.diff
> > >
> > > [ Attachment, skipping... ]
> > >
> > > >
> > > > ---------------------------(end of
> > > > broadcast)---------------------------
> > > > TIP 9: the planner will ignore your desire to choose an
> index scan if your
> > > >       joining column's datatypes do not match
> > >
> > > --
> > >   Bruce Momjian                        |  http://candle.pha.pa.us
> > >   pgman@candle.pha.pa.us               |  (610) 359-1001
> > >   +  If your life is a hard drive,     |  13 Roberts Road
> > >   +  Christ can be your backup.        |  Newtown Square,
> Pennsylvania 19073
> >
> > --
> >   Bruce Momjian                        |  http://candle.pha.pa.us
> >   pgman@candle.pha.pa.us               |  (610) 359-1001
> >   +  If your life is a hard drive,     |  13 Roberts Road
> >   +  Christ can be your backup.        |  Newtown Square,
> Pennsylvania 19073
>
> > ===================================================================
> > RCS file: /projects/cvsroot/pgsql/src/backend/utils/mb/wchar.c,v
> > retrieving revision 1.38
> > diff -c -r1.38 wchar.c
> > *** src/backend/utils/mb/wchar.c    17 Sep 2004 21:59:57
> -0000    1.38
> > --- src/backend/utils/mb/wchar.c    21 Nov 2004 09:58:36 -0000
> > ***************
> > *** 343,348 ****
> > --- 343,373 ----
> >       return (pg_euc_dsplen(s));
> >   }
> >
> > + bool isLegalUTF8(const UTF8 *source, int len) {
> > +         UTF8 a;
> > +         const UTF8 *srcptr = source+len;
> > +         if(!source || (pg_utf_mblen(source) != len)) return false;
> > +         switch (len) {
> > +             default: return false;
> > +             /* Everything else falls through when "true"... */
> > +             case 6: if ((a = (*--srcptr)) < 0x80 || a >
> 0xBF) return false;
> > +             case 5: if ((a = (*--srcptr)) < 0x80 || a >
> 0xBF) return false;
> > +             case 4: if ((a = (*--srcptr)) < 0x80 || a >
> 0xBF) return false;
> > +             case 3: if ((a = (*--srcptr)) < 0x80 || a >
> 0xBF) return false;
> > +             case 2: if ((a = (*--srcptr)) > 0xBF) return false;
> > +             switch (*source) {
> > +                     /* no fall-through in this inner switch */
> > +                     case 0xE0: if (a < 0xA0) return false; break;
> > +                     case 0xF0: if (a < 0x90) return false; break;
> > +                     case 0xF4: if (a > 0x8F) return false; break;
> > +                     default:  if (a < 0x80) return false;
> > +             }
> > +             case 1: if (*source >= 0x80 && *source <
> 0xC2) return false;
> > +             if (*source > 0xFD) return false;
> > +         }
> > +         return true;
> > + }
> > +
> >   /*
> >    * convert UTF-8 string to pg_wchar (UCS-2)
> >    * caller should allocate enough space for "to"
> > ***************
> > *** 398,404 ****
> >    * returns the byte length of a UTF-8 word pointed to by s
> >    */
> >   int
> > ! pg_utf_mblen(const unsigned char *s)
> >   {
> >       int            len = 1;
> >
> > --- 423,429 ----
> >    * returns the byte length of a UTF-8 word pointed to by s
> >    */
> >   int
> > ! pg_utf_mblen(const UTF8 *s)
> >   {
> >       int            len = 1;
> >
> > ***************
> > *** 406,418 ****
> >           len = 1;
> >       else if ((*s & 0xe0) == 0xc0)
> >           len = 2;
> > !     else if ((*s & 0xe0) == 0xe0)
> > !         len = 3;
> >       return (len);
> >   }
> >
> >   static int
> > ! pg_utf_dsplen(const unsigned char *s)
> >   {
> >       return 1;                    /* XXX
> fix me! */
> >   }
> > --- 431,449 ----
> >           len = 1;
> >       else if ((*s & 0xe0) == 0xc0)
> >           len = 2;
> > !         else if ((*s & 0xf0) == 0xe0)
> > !                 len = 3;
> > !         else if ((*s & 0xf8) == 0xf0)
> > !                 len = 4;
> > !         else if ((*s & 0xfc) == 0xf8)
> > !                 len = 5;
> > !         else if ((*s & 0xfe) == 0xfc)
> > !                 len = 6;
> >       return (len);
> >   }
> >
> >   static int
> > ! pg_utf_dsplen(const UTF8 *s)
> >   {
> >       return 1;                    /* XXX
> fix me! */
> >   }
> > ***************
> > *** 721,728 ****
> >       {pg_euckr2wchar_with_len, pg_euckr_mblen,
> pg_euckr_dsplen, 3},        /* 3; PG_EUC_KR */
> >       {pg_euctw2wchar_with_len, pg_euctw_mblen,
> pg_euctw_dsplen, 3},        /* 4; PG_EUC_TW */
> >       {pg_johab2wchar_with_len, pg_johab_mblen,
> pg_johab_dsplen, 3},        /* 5; PG_JOHAB */
> > !     {pg_utf2wchar_with_len, pg_utf_mblen, pg_utf_dsplen,
> 3},    /* 6; PG_UNICODE */
> > !     {pg_mule2wchar_with_len, pg_mule_mblen, pg_mule_dsplen,
> 3}, /* 7; PG_MULE_INTERNAL */
> >       {pg_latin12wchar_with_len, pg_latin1_mblen,
> pg_latin1_dsplen, 1},    /* 8; PG_LATIN1 */
> >       {pg_latin12wchar_with_len, pg_latin1_mblen,
> pg_latin1_dsplen, 1},    /* 9; PG_LATIN2 */
> >       {pg_latin12wchar_with_len, pg_latin1_mblen,
> pg_latin1_dsplen, 1},    /* 10; PG_LATIN3 */
> > --- 752,759 ----
> >       {pg_euckr2wchar_with_len, pg_euckr_mblen,
> pg_euckr_dsplen, 3},        /* 3; PG_EUC_KR */
> >       {pg_euctw2wchar_with_len, pg_euctw_mblen,
> pg_euctw_dsplen, 3},        /* 4; PG_EUC_TW */
> >       {pg_johab2wchar_with_len, pg_johab_mblen,
> pg_johab_dsplen, 3},        /* 5; PG_JOHAB */
> > !     {pg_utf2wchar_with_len, pg_utf_mblen, pg_utf_dsplen,
> 6},        /* 6; PG_UNICODE */
> > !     {pg_mule2wchar_with_len, pg_mule_mblen, pg_mule_dsplen,
> 3},        /* 7; PG_MULE_INTERNAL */
> >       {pg_latin12wchar_with_len, pg_latin1_mblen,
> pg_latin1_dsplen, 1},    /* 8; PG_LATIN1 */
> >       {pg_latin12wchar_with_len, pg_latin1_mblen,
> pg_latin1_dsplen, 1},    /* 9; PG_LATIN2 */
> >       {pg_latin12wchar_with_len, pg_latin1_mblen,
> pg_latin1_dsplen, 1},    /* 10; PG_LATIN3 */
> > ***************
> > *** 744,754 ****
> >       {pg_latin12wchar_with_len, pg_latin1_mblen,
> pg_latin1_dsplen, 1},    /* 26; ISO-8859-7 */
> >       {pg_latin12wchar_with_len, pg_latin1_mblen,
> pg_latin1_dsplen, 1},    /* 27; ISO-8859-8 */
> >       {pg_latin12wchar_with_len, pg_latin1_mblen,
> pg_latin1_dsplen, 1},    /* 28; PG_WIN1250 */
> > !     {0, pg_sjis_mblen, pg_sjis_dsplen, 2},        /* 29;
> PG_SJIS */
> > !     {0, pg_big5_mblen, pg_big5_dsplen, 2},        /* 30;
> PG_BIG5 */
> > !     {0, pg_gbk_mblen, pg_gbk_dsplen, 2},        /* 31; PG_GBK */
> > !     {0, pg_uhc_mblen, pg_uhc_dsplen, 2},        /* 32; PG_UHC */
> > !     {0, pg_gb18030_mblen, pg_gb18030_dsplen, 2} /* 33; PG_GB18030 */
> >   };
> >
> >   /* returns the byte length of a word for mule internal code */
> > --- 775,785 ----
> >       {pg_latin12wchar_with_len, pg_latin1_mblen,
> pg_latin1_dsplen, 1},    /* 26; ISO-8859-7 */
> >       {pg_latin12wchar_with_len, pg_latin1_mblen,
> pg_latin1_dsplen, 1},    /* 27; ISO-8859-8 */
> >       {pg_latin12wchar_with_len, pg_latin1_mblen,
> pg_latin1_dsplen, 1},    /* 28; PG_WIN1250 */
> > !     {0, pg_sjis_mblen, pg_sjis_dsplen, 2},
>         /* 29; PG_SJIS */
> > !     {0, pg_big5_mblen, pg_big5_dsplen, 2},
>         /* 30; PG_BIG5 */
> > !     {0, pg_gbk_mblen, pg_gbk_dsplen, 2},
>         /* 31; PG_GBK */
> > !     {0, pg_uhc_mblen, pg_uhc_dsplen, 2},
>         /* 32; PG_UHC */
> > !     {0, pg_gb18030_mblen, pg_gb18030_dsplen, 2}
>         /* 33; PG_GB18030 */
> >   };
> >
> >   /* returns the byte length of a word for mule internal code */
> > ***************
> > *** 822,872 ****
> >
> >       while (len > 0 && *mbstr)
> >       {
> > -         /* special UTF-8 check */
> > -         if (encoding == PG_UTF8 && (*mbstr & 0xf8) == 0xf0)
> > -         {
> > -             if (noError)
> > -                 return false;
> > -             ereport(ERROR,
> > -
> (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
> > -                      errmsg("Unicode
> characters greater than or equal to 0x10000 are not supported")));
> > -         }
> > -
> >           l = pg_mblen(mbstr);
> >
> > !         for (i = 1; i < l; i++)
> > !         {
> > !             /*
> > !              * we expect that every multibyte char
> consists of bytes
> > !              * having the 8th bit set
> > !              */
> > !             if (i >= len || (mbstr[i] & 0x80) == 0)
> >               {
> > !                 char        buf[8 * 2 + 1];
> > !                 char       *p = buf;
> > !                 int            j,
> >                               jlimit;
> >
> > !                 if (noError)
> > !                     return false;
> >
> > !                 jlimit = Min(l, len);
> > !                 jlimit = Min(jlimit, 8);
>     /* prevent buffer overrun */
> >
> > !                 for (j = 0; j < jlimit; j++)
> > !                     p += sprintf(p, "%02x",
> mbstr[j]);
> >
> > !                 ereport(ERROR,
> > !
> (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
> > !                 errmsg("invalid byte sequence
> for encoding \"%s\": 0x%s",
> > !
> GetDatabaseEncodingName(), buf)));
> >               }
> > -         }
> >
> >           len -= l;
> >           mbstr += l;
> >       }
> > -
> >       return true;
> >   }
> >
> > --- 853,900 ----
> >
> >       while (len > 0 && *mbstr)
> >       {
> >           l = pg_mblen(mbstr);
> >
> > !         /* special UTF-8 check */
> > !         if (encoding == PG_UTF8) {
> > !             if(!isLegalUTF8(mbstr,l)) {
> > !                 if (noError) return false;
> > !
> ereport(ERROR,(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),er
> rmsg("Invalid UNICODE byte sequence detected near character
> %c",*mbstr)));
> > !             }
> > !         } else {
> > !             for (i = 1; i < l; i++)
> >               {
> > !                 /*
> > !                  * we expect that every
> multibyte char consists of bytes
> > !                  * having the 8th bit set
> > !                  */
> > !                 if (i >= len || (mbstr[i] & 0x80) == 0)
> > !                 {
> > !                     char        buf[8 * 2 + 1];
> > !                     char       *p = buf;
> > !                     int            j,
> >                               jlimit;
> >
> > !                     if (noError)
> > !                         return false;
> >
> > !                     jlimit = Min(l, len);
> > !                     jlimit = Min(jlimit,
> 8);        /* prevent buffer overrun */
> >
> > !                     for (j = 0; j < jlimit; j++)
> > !                         p += sprintf(p,
> "%02x", mbstr[j]);
> >
> > !                     ereport(ERROR,
> > !
> (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
> > !                     errmsg("invalid byte
> sequence for encoding \"%s\": 0x%s",
> > !
> GetDatabaseEncodingName(), buf)));
> > !                 }
> >               }
> >
> > +         }
> >           len -= l;
> >           mbstr += l;
> >       }
> >       return true;
> >   }
> >
> > Index: src/include/mb/pg_wchar.h
> > ===================================================================
> > RCS file: /projects/cvsroot/pgsql/src/include/mb/pg_wchar.h,v
> > retrieving revision 1.52
> > diff -c -r1.52 pg_wchar.h
> > *** src/include/mb/pg_wchar.h    17 Sep 2004 21:59:57
> -0000    1.52
> > --- src/include/mb/pg_wchar.h    21 Nov 2004 09:58:36 -0000
> > ***************
> > *** 17,22 ****
> > --- 17,30 ----
> >    */
> >   typedef unsigned int pg_wchar;
> >
> > +
> > + /*
> > +  * The UTF types
> > +  */
> > + typedef unsigned int    UTF32;  /* at least 32 bits */
> > + typedef unsigned short    UTF16;  /* at least 16 bits */
> > + typedef unsigned char    UTF8;   /* typically 8 bits */
> > +
> >   /*
> >    * various definitions for EUC
> >    */
> > ***************
> > *** 339,342 ****
> > --- 347,352 ----
> >   extern void latin2mic_with_table(unsigned char *l,
> unsigned char *p, int len, int lc, unsigned char *tab);
> >   extern void mic2latin_with_table(unsigned char *mic,
> unsigned char
> > *p, int len, int lc, unsigned char *tab);
> >
> > + extern bool isLegalUTF8(const UTF8 *source, int len);
> > +
> >   #endif   /* PG_WCHAR_H */
> >
> >
>
> --
>   Bruce Momjian                        |  http://candle.pha.pa.us
>   pgman@candle.pha.pa.us               |  (610) 359-1001
>   +  If your life is a hard drive,     |  13 Roberts Road
>   +  Christ can be your backup.        |  Newtown Square,
> Pennsylvania 19073
>
>

Re: Unicode characters above 0x10000 #2

From

Bruce Momjian

Date:

14 June 2005, 21:15:19

Patch applied.  Thanks.

---------------------------------------------------------------------------


John Hansen wrote:
> Bruce,
>
> Attached patch replaces the original, applied today against CVS HEAD.
> Fixes the surrogates, and limits to 4 byte utf8 as per spec.
>
> Also extends UtfToLocal to 4 byte characters (tho, it does not add any,
> just enables the code to handle them. If my interpretation of this code
> is wrong, please let me know, and correct it).
>
> ... John
>
> > -----Original Message-----
> > From: Bruce Momjian [mailto:pgman@candle.pha.pa.us]
> > Sent: Sunday, June 05, 2005 11:23 AM
> > To: pgman@candle.pha.pa.us
> > Cc: John Hansen; pgsql-hackers@postgresql.org; PostgreSQL-patches
> > Subject: Re: [PATCHES] Unicode characters above 0x10000 #2
> >
> >
> > Your patch has been added to the PostgreSQL unapplied patches list at:
> >
> >     http://momjian.postgresql.org/cgi-bin/pgpatches
> >
> > It will be applied as soon as one of the PostgreSQL
> > committers reviews and approves it.
> >
> > --------------------------------------------------------------
> > -------------
> >
> >
> > pgman wrote:
> > >
> > > I have backed out this patch.  It is unclear it is a bug fix.
> > >
> > > It will be saved for 8.1.
> > >
> > >
> > ----------------------------------------------------------------------
> > > -----
> > >
> > > pgman wrote:
> > > >
> > > > Patch applied.  Thanks.
> > > >
> > > >
> > --------------------------------------------------------------------
> > > > -------
> > > >
> > > >
> > > > John Hansen wrote:
> > > > > 3 times lucky?
> > > > >
> > > > > Last one broke utf8.... Grrrr
> > > > >
> > > > > This one works,.... Too tired, sorry for the inconvenience..
> > > > >
> > > > > ... John
> > > >
> > > > Content-Description: cvs.diff
> > > >
> > > > [ Attachment, skipping... ]
> > > >
> > > > >
> > > > > ---------------------------(end of
> > > > > broadcast)---------------------------
> > > > > TIP 9: the planner will ignore your desire to choose an
> > index scan if your
> > > > >       joining column's datatypes do not match
> > > >
> > > > --
> > > >   Bruce Momjian                        |  http://candle.pha.pa.us
> > > >   pgman@candle.pha.pa.us               |  (610) 359-1001
> > > >   +  If your life is a hard drive,     |  13 Roberts Road
> > > >   +  Christ can be your backup.        |  Newtown Square,
> > Pennsylvania 19073
> > >
> > > --
> > >   Bruce Momjian                        |  http://candle.pha.pa.us
> > >   pgman@candle.pha.pa.us               |  (610) 359-1001
> > >   +  If your life is a hard drive,     |  13 Roberts Road
> > >   +  Christ can be your backup.        |  Newtown Square,
> > Pennsylvania 19073
> >
> > > ===================================================================
> > > RCS file: /projects/cvsroot/pgsql/src/backend/utils/mb/wchar.c,v
> > > retrieving revision 1.38
> > > diff -c -r1.38 wchar.c
> > > *** src/backend/utils/mb/wchar.c    17 Sep 2004 21:59:57
> > -0000    1.38
> > > --- src/backend/utils/mb/wchar.c    21 Nov 2004 09:58:36 -0000
> > > ***************
> > > *** 343,348 ****
> > > --- 343,373 ----
> > >       return (pg_euc_dsplen(s));
> > >   }
> > >
> > > + bool isLegalUTF8(const UTF8 *source, int len) {
> > > +         UTF8 a;
> > > +         const UTF8 *srcptr = source+len;
> > > +         if(!source || (pg_utf_mblen(source) != len)) return false;
> > > +         switch (len) {
> > > +             default: return false;
> > > +             /* Everything else falls through when "true"... */
> > > +             case 6: if ((a = (*--srcptr)) < 0x80 || a >
> > 0xBF) return false;
> > > +             case 5: if ((a = (*--srcptr)) < 0x80 || a >
> > 0xBF) return false;
> > > +             case 4: if ((a = (*--srcptr)) < 0x80 || a >
> > 0xBF) return false;
> > > +             case 3: if ((a = (*--srcptr)) < 0x80 || a >
> > 0xBF) return false;
> > > +             case 2: if ((a = (*--srcptr)) > 0xBF) return false;
> > > +             switch (*source) {
> > > +                     /* no fall-through in this inner switch */
> > > +                     case 0xE0: if (a < 0xA0) return false; break;
> > > +                     case 0xF0: if (a < 0x90) return false; break;
> > > +                     case 0xF4: if (a > 0x8F) return false; break;
> > > +                     default:  if (a < 0x80) return false;
> > > +             }
> > > +             case 1: if (*source >= 0x80 && *source <
> > 0xC2) return false;
> > > +             if (*source > 0xFD) return false;
> > > +         }
> > > +         return true;
> > > + }
> > > +
> > >   /*
> > >    * convert UTF-8 string to pg_wchar (UCS-2)
> > >    * caller should allocate enough space for "to"
> > > ***************
> > > *** 398,404 ****
> > >    * returns the byte length of a UTF-8 word pointed to by s
> > >    */
> > >   int
> > > ! pg_utf_mblen(const unsigned char *s)
> > >   {
> > >       int            len = 1;
> > >
> > > --- 423,429 ----
> > >    * returns the byte length of a UTF-8 word pointed to by s
> > >    */
> > >   int
> > > ! pg_utf_mblen(const UTF8 *s)
> > >   {
> > >       int            len = 1;
> > >
> > > ***************
> > > *** 406,418 ****
> > >           len = 1;
> > >       else if ((*s & 0xe0) == 0xc0)
> > >           len = 2;
> > > !     else if ((*s & 0xe0) == 0xe0)
> > > !         len = 3;
> > >       return (len);
> > >   }
> > >
> > >   static int
> > > ! pg_utf_dsplen(const unsigned char *s)
> > >   {
> > >       return 1;                    /* XXX
> > fix me! */
> > >   }
> > > --- 431,449 ----
> > >           len = 1;
> > >       else if ((*s & 0xe0) == 0xc0)
> > >           len = 2;
> > > !         else if ((*s & 0xf0) == 0xe0)
> > > !                 len = 3;
> > > !         else if ((*s & 0xf8) == 0xf0)
> > > !                 len = 4;
> > > !         else if ((*s & 0xfc) == 0xf8)
> > > !                 len = 5;
> > > !         else if ((*s & 0xfe) == 0xfc)
> > > !                 len = 6;
> > >       return (len);
> > >   }
> > >
> > >   static int
> > > ! pg_utf_dsplen(const UTF8 *s)
> > >   {
> > >       return 1;                    /* XXX
> > fix me! */
> > >   }
> > > ***************
> > > *** 721,728 ****
> > >       {pg_euckr2wchar_with_len, pg_euckr_mblen,
> > pg_euckr_dsplen, 3},        /* 3; PG_EUC_KR */
> > >       {pg_euctw2wchar_with_len, pg_euctw_mblen,
> > pg_euctw_dsplen, 3},        /* 4; PG_EUC_TW */
> > >       {pg_johab2wchar_with_len, pg_johab_mblen,
> > pg_johab_dsplen, 3},        /* 5; PG_JOHAB */
> > > !     {pg_utf2wchar_with_len, pg_utf_mblen, pg_utf_dsplen,
> > 3},    /* 6; PG_UNICODE */
> > > !     {pg_mule2wchar_with_len, pg_mule_mblen, pg_mule_dsplen,
> > 3}, /* 7; PG_MULE_INTERNAL */
> > >       {pg_latin12wchar_with_len, pg_latin1_mblen,
> > pg_latin1_dsplen, 1},    /* 8; PG_LATIN1 */
> > >       {pg_latin12wchar_with_len, pg_latin1_mblen,
> > pg_latin1_dsplen, 1},    /* 9; PG_LATIN2 */
> > >       {pg_latin12wchar_with_len, pg_latin1_mblen,
> > pg_latin1_dsplen, 1},    /* 10; PG_LATIN3 */
> > > --- 752,759 ----
> > >       {pg_euckr2wchar_with_len, pg_euckr_mblen,
> > pg_euckr_dsplen, 3},        /* 3; PG_EUC_KR */
> > >       {pg_euctw2wchar_with_len, pg_euctw_mblen,
> > pg_euctw_dsplen, 3},        /* 4; PG_EUC_TW */
> > >       {pg_johab2wchar_with_len, pg_johab_mblen,
> > pg_johab_dsplen, 3},        /* 5; PG_JOHAB */
> > > !     {pg_utf2wchar_with_len, pg_utf_mblen, pg_utf_dsplen,
> > 6},        /* 6; PG_UNICODE */
> > > !     {pg_mule2wchar_with_len, pg_mule_mblen, pg_mule_dsplen,
> > 3},        /* 7; PG_MULE_INTERNAL */
> > >       {pg_latin12wchar_with_len, pg_latin1_mblen,
> > pg_latin1_dsplen, 1},    /* 8; PG_LATIN1 */
> > >       {pg_latin12wchar_with_len, pg_latin1_mblen,
> > pg_latin1_dsplen, 1},    /* 9; PG_LATIN2 */
> > >       {pg_latin12wchar_with_len, pg_latin1_mblen,
> > pg_latin1_dsplen, 1},    /* 10; PG_LATIN3 */
> > > ***************
> > > *** 744,754 ****
> > >       {pg_latin12wchar_with_len, pg_latin1_mblen,
> > pg_latin1_dsplen, 1},    /* 26; ISO-8859-7 */
> > >       {pg_latin12wchar_with_len, pg_latin1_mblen,
> > pg_latin1_dsplen, 1},    /* 27; ISO-8859-8 */
> > >       {pg_latin12wchar_with_len, pg_latin1_mblen,
> > pg_latin1_dsplen, 1},    /* 28; PG_WIN1250 */
> > > !     {0, pg_sjis_mblen, pg_sjis_dsplen, 2},        /* 29;
> > PG_SJIS */
> > > !     {0, pg_big5_mblen, pg_big5_dsplen, 2},        /* 30;
> > PG_BIG5 */
> > > !     {0, pg_gbk_mblen, pg_gbk_dsplen, 2},        /* 31; PG_GBK */
> > > !     {0, pg_uhc_mblen, pg_uhc_dsplen, 2},        /* 32; PG_UHC */
> > > !     {0, pg_gb18030_mblen, pg_gb18030_dsplen, 2} /* 33; PG_GB18030 */
> > >   };
> > >
> > >   /* returns the byte length of a word for mule internal code */
> > > --- 775,785 ----
> > >       {pg_latin12wchar_with_len, pg_latin1_mblen,
> > pg_latin1_dsplen, 1},    /* 26; ISO-8859-7 */
> > >       {pg_latin12wchar_with_len, pg_latin1_mblen,
> > pg_latin1_dsplen, 1},    /* 27; ISO-8859-8 */
> > >       {pg_latin12wchar_with_len, pg_latin1_mblen,
> > pg_latin1_dsplen, 1},    /* 28; PG_WIN1250 */
> > > !     {0, pg_sjis_mblen, pg_sjis_dsplen, 2},
> >         /* 29; PG_SJIS */
> > > !     {0, pg_big5_mblen, pg_big5_dsplen, 2},
> >         /* 30; PG_BIG5 */
> > > !     {0, pg_gbk_mblen, pg_gbk_dsplen, 2},
> >         /* 31; PG_GBK */
> > > !     {0, pg_uhc_mblen, pg_uhc_dsplen, 2},
> >         /* 32; PG_UHC */
> > > !     {0, pg_gb18030_mblen, pg_gb18030_dsplen, 2}
> >         /* 33; PG_GB18030 */
> > >   };
> > >
> > >   /* returns the byte length of a word for mule internal code */
> > > ***************
> > > *** 822,872 ****
> > >
> > >       while (len > 0 && *mbstr)
> > >       {
> > > -         /* special UTF-8 check */
> > > -         if (encoding == PG_UTF8 && (*mbstr & 0xf8) == 0xf0)
> > > -         {
> > > -             if (noError)
> > > -                 return false;
> > > -             ereport(ERROR,
> > > -
> > (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
> > > -                      errmsg("Unicode
> > characters greater than or equal to 0x10000 are not supported")));
> > > -         }
> > > -
> > >           l = pg_mblen(mbstr);
> > >
> > > !         for (i = 1; i < l; i++)
> > > !         {
> > > !             /*
> > > !              * we expect that every multibyte char
> > consists of bytes
> > > !              * having the 8th bit set
> > > !              */
> > > !             if (i >= len || (mbstr[i] & 0x80) == 0)
> > >               {
> > > !                 char        buf[8 * 2 + 1];
> > > !                 char       *p = buf;
> > > !                 int            j,
> > >                               jlimit;
> > >
> > > !                 if (noError)
> > > !                     return false;
> > >
> > > !                 jlimit = Min(l, len);
> > > !                 jlimit = Min(jlimit, 8);
> >     /* prevent buffer overrun */
> > >
> > > !                 for (j = 0; j < jlimit; j++)
> > > !                     p += sprintf(p, "%02x",
> > mbstr[j]);
> > >
> > > !                 ereport(ERROR,
> > > !
> > (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
> > > !                 errmsg("invalid byte sequence
> > for encoding \"%s\": 0x%s",
> > > !
> > GetDatabaseEncodingName(), buf)));
> > >               }
> > > -         }
> > >
> > >           len -= l;
> > >           mbstr += l;
> > >       }
> > > -
> > >       return true;
> > >   }
> > >
> > > --- 853,900 ----
> > >
> > >       while (len > 0 && *mbstr)
> > >       {
> > >           l = pg_mblen(mbstr);
> > >
> > > !         /* special UTF-8 check */
> > > !         if (encoding == PG_UTF8) {
> > > !             if(!isLegalUTF8(mbstr,l)) {
> > > !                 if (noError) return false;
> > > !                 ereport(ERROR,(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),errmsg("Invalid UNICODE byte sequence detected near character %c",*mbstr)));
> > > !             }
> > > !         } else {
> > > !             for (i = 1; i < l; i++)
> > >               {
> > > !                 /*
> > > !                  * we expect that every
> > multibyte char consists of bytes
> > > !                  * having the 8th bit set
> > > !                  */
> > > !                 if (i >= len || (mbstr[i] & 0x80) == 0)
> > > !                 {
> > > !                     char        buf[8 * 2 + 1];
> > > !                     char       *p = buf;
> > > !                     int            j,
> > >                               jlimit;
> > >
> > > !                     if (noError)
> > > !                         return false;
> > >
> > > !                     jlimit = Min(l, len);
> > > !                     jlimit = Min(jlimit,
> > 8);        /* prevent buffer overrun */
> > >
> > > !                     for (j = 0; j < jlimit; j++)
> > > !                         p += sprintf(p,
> > "%02x", mbstr[j]);
> > >
> > > !                     ereport(ERROR,
> > > !
> > (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
> > > !                     errmsg("invalid byte
> > sequence for encoding \"%s\": 0x%s",
> > > !
> > GetDatabaseEncodingName(), buf)));
> > > !                 }
> > >               }
> > >
> > > +         }
> > >           len -= l;
> > >           mbstr += l;
> > >       }
> > >       return true;
> > >   }
> > >
> > > Index: src/include/mb/pg_wchar.h
> > > ===================================================================
> > > RCS file: /projects/cvsroot/pgsql/src/include/mb/pg_wchar.h,v
> > > retrieving revision 1.52
> > > diff -c -r1.52 pg_wchar.h
> > > *** src/include/mb/pg_wchar.h    17 Sep 2004 21:59:57
> > -0000    1.52
> > > --- src/include/mb/pg_wchar.h    21 Nov 2004 09:58:36 -0000
> > > ***************
> > > *** 17,22 ****
> > > --- 17,30 ----
> > >    */
> > >   typedef unsigned int pg_wchar;
> > >
> > > +
> > > + /*
> > > +  * The UTF types
> > > +  */
> > > + typedef unsigned int    UTF32;  /* at least 32 bits */
> > > + typedef unsigned short    UTF16;  /* at least 16 bits */
> > > + typedef unsigned char    UTF8;   /* typically 8 bits */
> > > +
> > >   /*
> > >    * various definitions for EUC
> > >    */
> > > ***************
> > > *** 339,342 ****
> > > --- 347,352 ----
> > >   extern void latin2mic_with_table(unsigned char *l,
> > unsigned char *p, int len, int lc, unsigned char *tab);
> > >   extern void mic2latin_with_table(unsigned char *mic,
> > unsigned char
> > > *p, int len, int lc, unsigned char *tab);
> > >
> > > + extern bool isLegalUTF8(const UTF8 *source, int len);
> > > +
> > >   #endif   /* PG_WCHAR_H */
> > >
> > >
> >
> > --
> >   Bruce Momjian                        |  http://candle.pha.pa.us
> >   pgman@candle.pha.pa.us               |  (610) 359-1001
> >   +  If your life is a hard drive,     |  13 Roberts Road
> >   +  Christ can be your backup.        |  Newtown Square,
> > Pennsylvania 19073
> >
> >

Content-Description: unicode.diff

[ Attachment, skipping... ]

--
  Bruce Momjian                        |  http://candle.pha.pa.us
  pgman@candle.pha.pa.us               |  (610) 359-1001
  +  If your life is a hard drive,     |  13 Roberts Road
  +  Christ can be your backup.        |  Newtown Square, Pennsylvania 19073