Re: Win32 unicode vs ICU - Mailing list pgsql-patches

From Tom Lane
Subject Re: Win32 unicode vs ICU
Date
Msg-id 3649.1124817370@sss.pgh.pa.us
Whole thread Raw
List pgsql-patches
I looked over the proposed patch a bit and found some problems --- in
particular, if I read M$'s documentation about MultiByteToWideChar
correctly, they chose an API that fails for zero-length input, and
so you gotta program around that.  Also, varstr_cmp() cannot assume
it gets null-terminated input.

I cannot test the attached revised patch; please check it out.

            regards, tom lane

Index: oracle_compat.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/utils/adt/oracle_compat.c,v
retrieving revision 1.60
diff -c -r1.60 oracle_compat.c
*** oracle_compat.c    7 May 2005 15:18:17 -0000    1.60
--- oracle_compat.c    23 Aug 2005 17:13:11 -0000
***************
*** 149,154 ****
--- 149,265 ----
  #endif   /* USE_WIDE_UPPER_LOWER */


+ /*
+  * On Windows, the "Unicode" locales assume UTF16 not UTF8 encoding.
+  * To make use of the upper/lower functionality, we need to map UTF8 to
+  * UTF16, which for some reason mbstowcs and wcstombs won't do for us.
+  * This conversion layer takes care of it.
+  */
+
+ #ifdef WIN32
+
+ /* texttowcs for the case of UTF8 to UTF16; result is palloc'd and zero-terminated */
+ static wchar_t *
+ win32_utf8_texttowcs(const text *txt)
+ {
+     int            nbytes = VARSIZE(txt) - VARHDRSZ;
+     wchar_t    *result;
+     int         r;
+
+     /* Overflow paranoia: keep (nbytes + 1) * sizeof(wchar_t) within int range */
+     if (nbytes < 0 ||
+         nbytes > (int) (INT_MAX / sizeof(wchar_t)) -1)
+         ereport(ERROR,
+                 (errcode(ERRCODE_OUT_OF_MEMORY),
+                  errmsg("out of memory")));
+
+     /* Output workspace cannot have more codes than input bytes (+1 for terminator) */
+     result = (wchar_t *) palloc((nbytes + 1) * sizeof(wchar_t));
+
+     /* stupid Microsloth API does not work for zero-length input */
+     if (nbytes == 0)
+         r = 0;
+     else
+     {
+         /* Do the conversion; r = number of wide codes stored (0 on failure) */
+         r = MultiByteToWideChar(CP_UTF8, 0, VARDATA(txt), nbytes,
+                                 result, nbytes);
+
+         if (!r)                    /* assume it's NO_UNICODE_TRANSLATION */
+         {
+             /* see notes above about error reporting */
+             pg_verifymbstr(VARDATA(txt), nbytes, false);
+             ereport(ERROR,
+                     (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+                      errmsg("invalid multibyte character for locale"),
+                      errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));
+         }
+     }
+     /* r codes were stored; the extra array slot holds the terminator */
+     Assert(r <= nbytes);
+     result[r] = 0;
+
+     return result;
+ }
+
+ /* wcstotext for the case of UTF16 to UTF8; str must be zero-terminated */
+ static text *
+ win32_utf8_wcstotext(const wchar_t *str)
+ {
+     text        *result;
+     int             nbytes;
+     int             r;
+     /* First call passes no output buffer, just to learn the size needed */
+     nbytes = WideCharToMultiByte(CP_UTF8, 0, str, -1, NULL, 0, NULL, NULL);
+     if (nbytes == 0)            /* shouldn't happen */
+         ereport(ERROR,
+                 (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+                  errmsg("UTF16 to UTF8 translation failed: %lu",
+                         GetLastError())));
+
+     result = palloc(nbytes+VARHDRSZ);    /* nbytes counts the trailing null */
+
+     r = WideCharToMultiByte(CP_UTF8, 0, str, -1, VARDATA(result), nbytes,
+                             NULL, NULL);
+     if (r == 0)                    /* shouldn't happen */
+         ereport(ERROR,
+                 (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+                  errmsg("UTF16 to UTF8 translation failed: %lu",
+                         GetLastError())));
+
+     VARATT_SIZEP(result) = nbytes + VARHDRSZ - 1; /* -1 to ignore null */
+
+     return result;
+ }
+
+ /* interface layer to check which encoding is in use */
+ /* UTF8 databases go through the UTF16 path; others use the stock texttowcs */
+ static wchar_t *
+ win32_texttowcs(const text *txt)
+ {
+     if (GetDatabaseEncoding() == PG_UTF8)
+         return win32_utf8_texttowcs(txt);
+     else
+         return texttowcs(txt);
+ }
+ /* Same dispatch for wide-to-text; ncodes is not used in the UTF8 case */
+ static text *
+ win32_wcstotext(const wchar_t *str, int ncodes)
+ {
+     if (GetDatabaseEncoding() == PG_UTF8)
+         return win32_utf8_wcstotext(str);
+     else
+         return wcstotext(str, ncodes);
+ }
+
+ /* use macros to cause routines below to call interface layer */
+ /* (the wrappers above precede these #defines, so they call the originals) */
+ #define texttowcs    win32_texttowcs
+ #define wcstotext    win32_wcstotext
+
+ #endif /* WIN32 */
+
+
  /********************************************************************
   *
   * lower
Index: varlena.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/utils/adt/varlena.c,v
retrieving revision 1.131
diff -c -r1.131 varlena.c
*** varlena.c    2 Aug 2005 16:11:57 -0000    1.131
--- varlena.c    23 Aug 2005 17:13:12 -0000
***************
*** 849,854 ****
--- 849,856 ----
          char       *a1p,
                     *a2p;

+ #ifndef WIN32
+
          if (len1 >= STACKBUFLEN)
              a1p = (char *) palloc(len1 + 1);
          else
***************
*** 865,874 ****

          result = strcoll(a1p, a2p);

!         if (len1 >= STACKBUFLEN)
              pfree(a1p);
!         if (len2 >= STACKBUFLEN)
              pfree(a2p);
      }
      else
      {
--- 867,953 ----

          result = strcoll(a1p, a2p);

!         if (a1p != a1buf)
              pfree(a1p);
!         if (a2p != a2buf)
              pfree(a2p);
+
+ #else /* WIN32 */
+
+         /* Win32 does not have UTF-8, so we need to map to UTF-16 */
+         if (GetDatabaseEncoding() == PG_UTF8)
+         {
+             int a1len;
+             int a2len;
+             int r;
+
+             if (len1 >= STACKBUFLEN/2)
+             {
+                 a1len = len1 * 2 + 2;
+                 a1p = palloc(a1len);
+             }
+             else
+             {
+                 a1len = STACKBUFLEN;
+                 a1p = a1buf;
+             }
+             if (len2 >= STACKBUFLEN/2)
+             {
+                 a2len = len2 * 2 + 2;
+                 a2p = palloc(a2len);
+             }
+             else
+             {
+                 a2len = STACKBUFLEN;
+                 a2p = a2buf;
+             }
+
+             /* stupid Microsloth API does not work for zero-length input */
+             if (len1 == 0)
+                 r = 0;
+             else
+             {
+                 r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1,
+                                         (LPWSTR) a1p, a1len/2);
+                 if (!r)
+                     ereport(ERROR,
+                             (errmsg("could not convert string to UTF16: %lu",
+                                     GetLastError())));
+             }
+             ((LPWSTR) a1p)[r] = 0;
+
+             if (len2 == 0)
+                 r = 0;
+             else
+             {
+                 r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2,
+                                         (LPWSTR) a2p, a2len/2);
+                 if (!r)
+                     ereport(ERROR,
+                             (errmsg("could not convert string to UTF16: %lu",
+                                     GetLastError())));
+             }
+             ((LPWSTR) a2p)[r] = 0;
+
+             errno = 0;
+             result = wcscoll((LPWSTR) a1p, (LPWSTR) a2p);
+             if (result == 2147483647) /* _NLSCMPERROR; missing from mingw headers */
+                 ereport(ERROR,
+                         (errmsg("could not compare unicode strings: %d",
+                                 errno)));
+
+             if (a1p != a1buf)
+                 pfree(a1p);
+             if (a2p != a2buf)
+                 pfree(a2p);
+
+             return result;
+         }
+
+         /* Win32 has strncoll(), so use it to avoid copying */
+         return _strncoll(arg1, arg2, Min(len1, len2));
+
+ #endif /* WIN32 */
      }
      else
      {

pgsql-patches by date:

Previous
From: Tom Lane
Date:
Subject: Re: [pgsql-hackers-win32] win32 random number generator
Next
From: Bruce Momjian
Date:
Subject: Re: Win32 Thread Safety