Thread: Re: [GENERAL] trouble with to_char('L')

Re: [GENERAL] trouble with to_char('L')

From
Bruce Momjian
Date:
Hiroshi Inoue wrote:
> Bruce Momjian wrote:
> > Hiroshi Inoue wrote:
> >> Bruce Momjian wrote:
> >>> Where are we on this issue?
> >> Oops I forgot it completely.
> >> I have a little improved version and would post it tonight.
> >
> > Ah, very good.  Thanks.
>
> Attached is an improved version.

I spent many hours on this patch and am attaching an updated version.
I have restructured the code and added many comments, but this is the
main one:

    *  Ideally, the server encoding and locale settings would
    *  always match.  Unfortunately, WIN32 does not support UTF-8
    *  values for setlocale(), even though PostgreSQL runs fine with
    *  a UTF-8 encoding on Windows:
    *
    *      http://msdn.microsoft.com/en-us/library/x99tb11d.aspx
    *
    *  Therefore, we must set LC_CTYPE to match LC_NUMERIC and
    *  LC_MONETARY, call localeconv(), and use mbstowcs() to
    *  convert the locale-aware string, e.g. Euro symbol, which
    *  is not in UTF-8 to the server encoding.

I need someone with WIN32 experience to review and test this patch.

--
  Bruce Momjian  <bruce@momjian.us>        http://momjian.us
  EnterpriseDB                             http://enterprisedb.com

  PG East:  http://www.enterprisedb.com/community/nav-pg-east-2010.do
Index: src/backend/utils/adt/pg_locale.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/utils/adt/pg_locale.c,v
retrieving revision 1.53
diff -c -c -r1.53 pg_locale.c
*** src/backend/utils/adt/pg_locale.c    27 Feb 2010 20:20:44 -0000    1.53
--- src/backend/utils/adt/pg_locale.c    28 Feb 2010 03:59:14 -0000
***************
*** 4,10 ****
   *
   * Portions Copyright (c) 2002-2010, PostgreSQL Global Development Group
   *
!  * $PostgreSQL: pgsql/src/backend/utils/adt/pg_locale.c,v 1.53 2010/02/27 20:20:44 momjian Exp $
   *
   *-----------------------------------------------------------------------
   */
--- 4,10 ----
   *
   * Portions Copyright (c) 2002-2010, PostgreSQL Global Development Group
   *
!  * $PostgreSQL: pgsql/src/backend/utils/adt/pg_locale.c,v 1.51 2010/01/02 16:57:54 momjian Exp $
   *
   *-----------------------------------------------------------------------
   */
***************
*** 386,391 ****
--- 386,459 ----
          free(s->positive_sign);
  }

+ #ifdef    WIN32
+ /*
+  *    This converts the LC_CTYPE-encoded string returned from the
+  *    locale routines to the database encoding.
+  */
+ static char *db_encoding_strdup(const char *item, const char *str)
+ {
+     int    db_encoding = GetDatabaseEncoding();
+     size_t    wchars, ilen, wclen, dstlen;
+     int    utflen, bytes_per_char;
+     wchar_t    *wbuf;
+     char    *dst;
+
+     if (!str[0])
+         return strdup(str);
+     ilen = strlen(str) + 1;
+     wclen = ilen * sizeof(wchar_t);
+     wbuf = (wchar_t *) palloc(wclen);
+
+     /* Convert multi-byte string using current LC_CTYPE to a wide-character string */
+     wchars = mbstowcs(wbuf, str, ilen);
+     if (wchars == (size_t) -1)
+         elog(ERROR,
+             "could not convert string to wide characters: error %lu", GetLastError());
+
+     /* allocate target string */
+     bytes_per_char = pg_encoding_max_length(PG_UTF8);
+     if (pg_encoding_max_length(db_encoding) > bytes_per_char)
+         bytes_per_char = pg_encoding_max_length(db_encoding);
+     dstlen = wchars * bytes_per_char + 1;
+     if ((dst = malloc(dstlen)) == NULL)
+         elog(ERROR, "could not allocate a destination buffer");
+
+     /* Convert wide string to UTF8 */
+     utflen = WideCharToMultiByte(CP_UTF8, 0, wbuf, wchars, dst, dstlen, NULL, NULL);
+     if (utflen == 0)
+         elog(ERROR,
+             "could not convert string %04x to UTF-8: error %lu", wbuf[0], GetLastError());
+     pfree(wbuf);
+
+     dst[utflen] = '\0';
+     if (db_encoding != PG_UTF8)
+     {
+         PG_TRY();
+         {
+             char *convstr = pg_do_encoding_conversion(dst, utflen, PG_UTF8, db_encoding);
+             if (dst != convstr)
+             {
+                 strlcpy(dst, convstr, dstlen);
+                 pfree(convstr);
+             }
+         }
+         PG_CATCH();
+         {
+             FlushErrorState();
+             dst[0] = '\0';
+         }
+         PG_END_TRY();
+     }
+
+     return dst;
+ }
+ #else
+ static char *db_encoding_strdup(const char *item, const char *str)
+ {
+     return strdup(str);
+ }
+ #endif /* WIN32 */

  /*
   * Return the POSIX lconv struct (contains number/money formatting
***************
*** 398,403 ****
--- 466,475 ----
      struct lconv *extlconv;
      char       *save_lc_monetary;
      char       *save_lc_numeric;
+ #ifdef    WIN32
+     char       *save_lc_ctype = NULL;
+     bool        lc_ctype_was_null = false;
+ #endif

      /* Did we do it already? */
      if (CurrentLocaleConvValid)
***************
*** 413,442 ****
      if (save_lc_numeric)
          save_lc_numeric = pstrdup(save_lc_numeric);

      setlocale(LC_MONETARY, locale_monetary);
      setlocale(LC_NUMERIC, locale_numeric);
!
!     /* Get formatting information */
      extlconv = localeconv();

      /*
!      * Must copy all values since restoring internal settings may overwrite
       * localeconv()'s results.
       */
      CurrentLocaleConv = *extlconv;
!     CurrentLocaleConv.currency_symbol = strdup(extlconv->currency_symbol);
!     CurrentLocaleConv.decimal_point = strdup(extlconv->decimal_point);
!     CurrentLocaleConv.grouping = strdup(extlconv->grouping);
!     CurrentLocaleConv.thousands_sep = strdup(extlconv->thousands_sep);
!     CurrentLocaleConv.int_curr_symbol = strdup(extlconv->int_curr_symbol);
!     CurrentLocaleConv.mon_decimal_point = strdup(extlconv->mon_decimal_point);
      CurrentLocaleConv.mon_grouping = strdup(extlconv->mon_grouping);
!     CurrentLocaleConv.mon_thousands_sep = strdup(extlconv->mon_thousands_sep);
!     CurrentLocaleConv.negative_sign = strdup(extlconv->negative_sign);
!     CurrentLocaleConv.positive_sign = strdup(extlconv->positive_sign);
      CurrentLocaleConv.n_sign_posn = extlconv->n_sign_posn;

!     /* Try to restore internal settings */
      if (save_lc_monetary)
      {
          setlocale(LC_MONETARY, save_lc_monetary);
--- 485,564 ----
      if (save_lc_numeric)
          save_lc_numeric = pstrdup(save_lc_numeric);

+ #ifdef    WIN32
+     /*
+      *    Ideally, the server encoding and locale settings would
+      *    always match.  Unfortunately, WIN32 does not support UTF-8
+      *    values for setlocale(), even though PostgreSQL runs fine with
+      *    a UTF-8 encoding on Windows:
+      *
+      *        http://msdn.microsoft.com/en-us/library/x99tb11d.aspx
+      *
+      *    Therefore, we must set LC_CTYPE to match LC_NUMERIC and
+      *    LC_MONETARY, call localeconv(), and use mbstowcs() to
+      *    convert the locale-aware string, e.g. Euro symbol, which
+      *    is not in UTF-8 to the server encoding.
+      */
+
+     /*
+      *    We unconditionally restore LC_CTYPE because we are setting it
+      *    to an unusual value.
+      */
+     if ((save_lc_ctype = setlocale(LC_CTYPE, NULL)) != NULL)
+         save_lc_ctype = pstrdup(save_lc_ctype);
+     else
+         /* This is actually the C locale */
+         save_lc_ctype = pstrdup("");
+
+     /* Set LC_CTYPE to match LC_MONETARY? */
+     if (pg_strcasecmp(save_lc_ctype, locale_monetary) != 0)
+         setlocale(LC_CTYPE, locale_monetary);
+ #endif
+
      setlocale(LC_MONETARY, locale_monetary);
      setlocale(LC_NUMERIC, locale_numeric);
!     /*
!      *    Get formatting information for LC_MONETARY, and LC_NUMERIC if they
!      *    are the same.
!      */
      extlconv = localeconv();

      /*
!      * Must copy all values since restoring internal settings might overwrite
       * localeconv()'s results.
       */
      CurrentLocaleConv = *extlconv;
!
!     /* The first argument of db_encoding_strdup() is only used on WIN32 */
!     CurrentLocaleConv.currency_symbol = db_encoding_strdup("currency_symbol", extlconv->currency_symbol);
!     CurrentLocaleConv.int_curr_symbol = db_encoding_strdup("int_curr_symbol", extlconv->int_curr_symbol);
!     CurrentLocaleConv.mon_decimal_point = db_encoding_strdup("mon_decimal_point", extlconv->mon_decimal_point);
      CurrentLocaleConv.mon_grouping = strdup(extlconv->mon_grouping);
!     CurrentLocaleConv.mon_thousands_sep = db_encoding_strdup("mon_thousands_sep", extlconv->mon_thousands_sep);
!     CurrentLocaleConv.negative_sign = db_encoding_strdup("negative_sign", extlconv->negative_sign);
!     CurrentLocaleConv.positive_sign = db_encoding_strdup("positive_sign", extlconv->positive_sign);
      CurrentLocaleConv.n_sign_posn = extlconv->n_sign_posn;

! #ifdef    WIN32
!     if (pg_strcasecmp(locale_numeric, locale_monetary) != 0)
!     {
!         setlocale(LC_CTYPE, locale_numeric);
!         /* Get formatting information for LC_NUMERIC with matching LC_CTYPE */
!         extlconv = localeconv();
!     }
! #endif
!
!     CurrentLocaleConv.decimal_point = db_encoding_strdup("decimal_point", extlconv->decimal_point);
!     CurrentLocaleConv.grouping = strdup(extlconv->grouping);
!     CurrentLocaleConv.thousands_sep = db_encoding_strdup("thousands_sep", extlconv->thousands_sep);
!
!     /*
!      *    Restore internal settings
!      */
! #ifdef    WIN32
!     setlocale(LC_CTYPE, save_lc_ctype);
!     pfree(save_lc_ctype);
! #endif
      if (save_lc_monetary)
      {
          setlocale(LC_MONETARY, save_lc_monetary);
***************
*** 533,542 ****
      elog(DEBUG3, "cache_locale_time() executed; locale: \"%s\"", locale_time);

  #ifdef WIN32
!     /* set user's value of ctype locale */
      save_lc_ctype = setlocale(LC_CTYPE, NULL);
      if (save_lc_ctype)
          save_lc_ctype = pstrdup(save_lc_ctype);

      setlocale(LC_CTYPE, locale_time);
  #endif
--- 655,666 ----
      elog(DEBUG3, "cache_locale_time() executed; locale: \"%s\"", locale_time);

  #ifdef WIN32
!     /* See the WIN32 comment near the top of PGLC_localeconv() */
      save_lc_ctype = setlocale(LC_CTYPE, NULL);
      if (save_lc_ctype)
          save_lc_ctype = pstrdup(save_lc_ctype);
+     else
+         save_lc_ctype = pstrdup("");

      setlocale(LC_CTYPE, locale_time);
  #endif

Re: [GENERAL] trouble with to_char('L')

From
Bruce Momjian
Date:
Bruce Momjian wrote:
> Hiroshi Inoue wrote:
> > Bruce Momjian wrote:
> > > Hiroshi Inoue wrote:
> > >> Bruce Momjian wrote:
> > >>> Where are we on this issue?
> > >> Oops I forgot it completely.
> > >> I have a little improved version and would post it tonight.
> > > 
> > > Ah, very good.  Thanks.
> > 
> > Attached is an improved version.
> 
> I spent many hours on this patch and am attaching an updated version.
> I have restructured the code and added many comments, but this is the
> main one:
> 
>     *  Ideally, the server encoding and locale settings would
>     *  always match.  Unfortunately, WIN32 does not support UTF-8
>     *  values for setlocale(), even though PostgreSQL runs fine with
>     *  a UTF-8 encoding on Windows:
>     *
>     *      http://msdn.microsoft.com/en-us/library/x99tb11d.aspx
>     *
>     *  Therefore, we must set LC_CTYPE to match LC_NUMERIC and
>     *  LC_MONETARY, call localeconv(), and use mbstowcs() to
>     *  convert the locale-aware string, e.g. Euro symbol, which
>     *  is not in UTF-8 to the server encoding.
> 
> I need someone with WIN32 experience to review and test this patch.

I don't understand why cache_locale_time() works on Windows.  It sets
the LC_CTYPE but does not do any encoding coversion.  Do month and
day-of-week names not work either, or do they work and the encoding
conversion for numeric/money, e.g. Euro, it not necessary?

--  Bruce Momjian  <bruce@momjian.us>        http://momjian.us EnterpriseDB
http://enterprisedb.com
 PG East:  http://www.enterprisedb.com/community/nav-pg-east-2010.do


Re: [GENERAL] trouble with to_char('L')

From
Hiroshi Inoue
Date:
Bruce Momjian wrote:
> Bruce Momjian wrote:
>> Hiroshi Inoue wrote:
>>> Bruce Momjian wrote:
>>>> Hiroshi Inoue wrote:
>>>>> Bruce Momjian wrote:
>>>>>> Where are we on this issue?
>>>>> Oops I forgot it completely.
>>>>> I have a little improved version and would post it tonight.
>>>> Ah, very good.  Thanks.
>>> Attached is an improved version.
>> I spent many hours on this patch and am attaching an updated version.
>> I have restructured the code and added many comments, but this is the
>> main one:
>>
>>     *  Ideally, the server encoding and locale settings would
>>     *  always match.  Unfortunately, WIN32 does not support UTF-8
>>     *  values for setlocale(), even though PostgreSQL runs fine with
>>     *  a UTF-8 encoding on Windows:
>>     *
>>     *      http://msdn.microsoft.com/en-us/library/x99tb11d.aspx
>>     *
>>     *  Therefore, we must set LC_CTYPE to match LC_NUMERIC and
>>     *  LC_MONETARY, call localeconv(), and use mbstowcs() to
>>     *  convert the locale-aware string, e.g. Euro symbol, which
>>     *  is not in UTF-8 to the server encoding.
>>
>> I need someone with WIN32 experience to review and test this patch.
> 
> I don't understand why cache_locale_time() works on Windows.  It sets
> the LC_CTYPE but does not do any encoding coversion.

Doesn't strftime_win32 do the conversion?

> Do month and
> day-of-week names not work either, or do they work and the encoding
> conversion for numeric/money, e.g. Euro, it not necessary?

db_strdup does the conversion.

regards,
Hiroshi Inoue


Re: [GENERAL] trouble with to_char('L')

From
Bruce Momjian
Date:
Hiroshi Inoue wrote:
> Bruce Momjian wrote:
> > Bruce Momjian wrote:
> >> Hiroshi Inoue wrote:
> >>> Bruce Momjian wrote:
> >>>> Hiroshi Inoue wrote:
> >>>>> Bruce Momjian wrote:
> >>>>>> Where are we on this issue?
> >>>>> Oops I forgot it completely.
> >>>>> I have a little improved version and would post it tonight.
> >>>> Ah, very good.  Thanks.
> >>> Attached is an improved version.
> >> I spent many hours on this patch and am attaching an updated version.
> >> I have restructured the code and added many comments, but this is the
> >> main one:
> >>
> >>     *  Ideally, the server encoding and locale settings would
> >>     *  always match.  Unfortunately, WIN32 does not support UTF-8
> >>     *  values for setlocale(), even though PostgreSQL runs fine with
> >>     *  a UTF-8 encoding on Windows:
> >>     *
> >>     *      http://msdn.microsoft.com/en-us/library/x99tb11d.aspx
> >>     *
> >>     *  Therefore, we must set LC_CTYPE to match LC_NUMERIC and
> >>     *  LC_MONETARY, call localeconv(), and use mbstowcs() to
> >>     *  convert the locale-aware string, e.g. Euro symbol, which
> >>     *  is not in UTF-8 to the server encoding.
> >>
> >> I need someone with WIN32 experience to review and test this patch.
> > 
> > I don't understand why cache_locale_time() works on Windows.  It sets
> > the LC_CTYPE but does not do any encoding coversion.
> 
> Doesn't strftime_win32 do the conversion?

Oh, I now see strftime is redefined as a macro in that C files.  Thanks.

> > Do month and
> > day-of-week names not work either, or do they work and the encoding
> > conversion for numeric/money, e.g. Euro, it not necessary?
> 
> db_strdup does the conversion.

Should we pull the encoding conversion into a separate function and have
strftime_win32() and db_strdup() both call it?

--  Bruce Momjian  <bruce@momjian.us>        http://momjian.us EnterpriseDB
http://enterprisedb.com
 PG East:  http://www.enterprisedb.com/community/nav-pg-east-2010.do


Re: [GENERAL] trouble with to_char('L')

From
Hiroshi Inoue
Date:
Bruce Momjian wrote:
> Hiroshi Inoue wrote:
>> Bruce Momjian wrote:
>>> Bruce Momjian wrote:
>>>> Hiroshi Inoue wrote:
>>>>> Bruce Momjian wrote:
>>>>>> Hiroshi Inoue wrote:
>>>>>>> Bruce Momjian wrote:
>>>>>>>> Where are we on this issue?
>>>>>>> Oops I forgot it completely.
>>>>>>> I have a little improved version and would post it tonight.
>>>>>> Ah, very good.  Thanks.
>>>>> Attached is an improved version.
>>>> I spent many hours on this patch and am attaching an updated version.
>>>> I have restructured the code and added many comments, but this is the
>>>> main one:
>>>>
>>>>     *  Ideally, the server encoding and locale settings would
>>>>     *  always match.  Unfortunately, WIN32 does not support UTF-8
>>>>     *  values for setlocale(), even though PostgreSQL runs fine with
>>>>     *  a UTF-8 encoding on Windows:
>>>>     *
>>>>     *      http://msdn.microsoft.com/en-us/library/x99tb11d.aspx
>>>>     *
>>>>     *  Therefore, we must set LC_CTYPE to match LC_NUMERIC and
>>>>     *  LC_MONETARY, call localeconv(), and use mbstowcs() to
>>>>     *  convert the locale-aware string, e.g. Euro symbol, which
>>>>     *  is not in UTF-8 to the server encoding.
>>>>
>>>> I need someone with WIN32 experience to review and test this patch.
>>> I don't understand why cache_locale_time() works on Windows.  It sets
>>> the LC_CTYPE but does not do any encoding coversion.
>> Doesn't strftime_win32 do the conversion?
> 
> Oh, I now see strftime is redefined as a macro in that C files.  Thanks.
> 
>>> Do month and
>>> day-of-week names not work either, or do they work and the encoding
>>> conversion for numeric/money, e.g. Euro, it not necessary?
>> db_strdup does the conversion.
> 
> Should we pull the encoding conversion into a separate function and have
> strftime_win32() and db_strdup() both call it?

We may be able to pull the conversion WideChars => UTF8 =>
a PG encoding into an function.

BTW both PGLC_localeconv() and cache_locale_time() save the current LC_CTYPE first and restore them just before
returningthe functions.
 
I'm suspicious if it's OK when errors occur in middle of the functions.

regards,
Hiroshi Inoue


Re: [GENERAL] trouble with to_char('L')

From
Bruce Momjian
Date:
Hiroshi Inoue wrote:
> >>>> I need someone with WIN32 experience to review and test this patch.
> >>> I don't understand why cache_locale_time() works on Windows.  It sets
> >>> the LC_CTYPE but does not do any encoding coversion.
> >> Doesn't strftime_win32 do the conversion?
> >
> > Oh, I now see strftime is redefined as a macro in that C files.  Thanks.
> >
> >>> Do month and
> >>> day-of-week names not work either, or do they work and the encoding
> >>> conversion for numeric/money, e.g. Euro, it not necessary?
> >> db_strdup does the conversion.
> >
> > Should we pull the encoding conversion into a separate function and have
> > strftime_win32() and db_strdup() both call it?
>
> We may be able to pull the conversion WideChars => UTF8 =>
> a PG encoding into an function.

OK, I have created a new function, win32_wchar_to_db_encoding(), to
share the conversion from wide characters to the database encoding.
New patch attached.

> BTW both PGLC_localeconv() and cache_locale_time() save the current
>   LC_CTYPE first and restore them just before returning the functions.
> I'm suspicious if it's OK when errors occur in middle of the functions.

Yea, I added a comment questioning if that is a problem.

--
  Bruce Momjian  <bruce@momjian.us>        http://momjian.us
  EnterpriseDB                             http://enterprisedb.com

  PG East:  http://www.enterprisedb.com/community/nav-pg-east-2010.do
Index: src/backend/utils/adt/pg_locale.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/utils/adt/pg_locale.c,v
retrieving revision 1.53
diff -c -c -r1.53 pg_locale.c
*** src/backend/utils/adt/pg_locale.c    27 Feb 2010 20:20:44 -0000    1.53
--- src/backend/utils/adt/pg_locale.c    2 Mar 2010 18:11:41 -0000
***************
*** 4,10 ****
   *
   * Portions Copyright (c) 2002-2010, PostgreSQL Global Development Group
   *
!  * $PostgreSQL: pgsql/src/backend/utils/adt/pg_locale.c,v 1.53 2010/02/27 20:20:44 momjian Exp $
   *
   *-----------------------------------------------------------------------
   */
--- 4,10 ----
   *
   * Portions Copyright (c) 2002-2010, PostgreSQL Global Development Group
   *
!  * $PostgreSQL: pgsql/src/backend/utils/adt/pg_locale.c,v 1.51 2010/01/02 16:57:54 momjian Exp $
   *
   *-----------------------------------------------------------------------
   */
***************
*** 96,101 ****
--- 96,109 ----
  static char *IsoLocaleName(const char *);        /* MSVC specific */
  #endif

+ #ifdef WIN32
+ static size_t win32_wchar_to_db_encoding(const wchar_t *wbuf,
+                                 const size_t wchars, char *dst, size_t dstlen);
+ static char *db_encoding_strdup(const char *item, const char *str);
+ static size_t strftime_win32(char *dst, size_t dstlen, const wchar_t *format,
+                              const struct tm *tm);
+ #endif
+

  /*
   * pg_perm_setlocale
***************
*** 387,392 ****
--- 395,488 ----
  }


+ #ifdef    WIN32
+ /*
+  *    Convert wide character string (UTF16 on Win32) to UTF8, and then
+  *    optionally to the db encoding.
+  */
+ static size_t win32_wchar_to_db_encoding(const wchar_t *wbuf,
+                                 const size_t wchars, char *dst, size_t dstlen)
+ {
+     int    db_encoding = GetDatabaseEncoding();
+     int    utf8len;
+
+     /* Convert wide string (UTF16) to UTF8 */
+     utf8len = WideCharToMultiByte(CP_UTF8, 0, wbuf, wchars, dst, dstlen, NULL, NULL);
+     if (utf8len == 0)
+         /* Does this leave LC_CTYPE set incorrectly? */
+         elog(ERROR,
+             "could not convert string %04x to UTF-8: error %lu", wbuf[0], GetLastError());
+     pfree(wbuf);
+
+     dst[utf8len] = '\0';
+     if (db_encoding != PG_UTF8)
+     {
+         PG_TRY();
+         {
+             char *convstr = pg_do_encoding_conversion(dst, utf8len, PG_UTF8, db_encoding);
+             if (dst != convstr)
+             {
+                 strlcpy(dst, convstr, dstlen);
+                 pfree(convstr);
+             }
+         }
+         PG_CATCH();
+         {
+             FlushErrorState();
+             dst[0] = '\0';
+         }
+         PG_END_TRY();
+     }
+
+     return pg_mbstrlen(dst);
+ }
+
+ /*
+  *    This converts the LC_CTYPE-encoded string returned from the
+  *    locale routines to the database encoding.
+  */
+ static char *db_encoding_strdup(const char *item, const char *str)
+ {
+     int    db_encoding = GetDatabaseEncoding();
+     size_t    wchars, ilen, wclen, dstlen;
+     int    bytes_per_char;
+     wchar_t    *wbuf;
+     char    *dst;
+
+     if (!str[0])
+         return strdup(str);
+
+     /* allocate wide character string */
+     ilen = strlen(str) + 1;
+     wclen = ilen * sizeof(wchar_t);
+     wbuf = (wchar_t *) palloc(wclen);
+
+     /* Convert multi-byte string using current LC_CTYPE to a wide-character string */
+     wchars = mbstowcs(wbuf, str, ilen);
+     if (wchars == (size_t) -1)
+         elog(ERROR,
+             "could not convert string to wide characters: error %lu", GetLastError());
+
+     /* allocate target string */
+     bytes_per_char = pg_encoding_max_length(PG_UTF8);
+     if (pg_encoding_max_length(db_encoding) > bytes_per_char)
+         bytes_per_char = pg_encoding_max_length(db_encoding);
+     dstlen = wchars * bytes_per_char + 1;
+     if ((dst = malloc(dstlen)) == NULL)
+         elog(ERROR, "could not allocate a destination buffer");
+
+     /* Convert wide string (UTF16) to db encoding */
+     win32_wchar_to_db_encoding(wbuf, wchars, dst, dstlen);
+
+     return dst;
+ }
+ #else
+ static char *db_encoding_strdup(const char *item, const char *str)
+ {
+     return strdup(str);
+ }
+ #endif /* WIN32 */
+
  /*
   * Return the POSIX lconv struct (contains number/money formatting
   * information) with locale information for all categories.
***************
*** 398,403 ****
--- 494,502 ----
      struct lconv *extlconv;
      char       *save_lc_monetary;
      char       *save_lc_numeric;
+ #ifdef    WIN32
+     char       *save_lc_ctype = NULL;
+ #endif

      /* Did we do it already? */
      if (CurrentLocaleConvValid)
***************
*** 413,442 ****
      if (save_lc_numeric)
          save_lc_numeric = pstrdup(save_lc_numeric);

      setlocale(LC_MONETARY, locale_monetary);
      setlocale(LC_NUMERIC, locale_numeric);
!
!     /* Get formatting information */
      extlconv = localeconv();

      /*
!      * Must copy all values since restoring internal settings may overwrite
       * localeconv()'s results.
       */
      CurrentLocaleConv = *extlconv;
!     CurrentLocaleConv.currency_symbol = strdup(extlconv->currency_symbol);
!     CurrentLocaleConv.decimal_point = strdup(extlconv->decimal_point);
!     CurrentLocaleConv.grouping = strdup(extlconv->grouping);
!     CurrentLocaleConv.thousands_sep = strdup(extlconv->thousands_sep);
!     CurrentLocaleConv.int_curr_symbol = strdup(extlconv->int_curr_symbol);
!     CurrentLocaleConv.mon_decimal_point = strdup(extlconv->mon_decimal_point);
      CurrentLocaleConv.mon_grouping = strdup(extlconv->mon_grouping);
!     CurrentLocaleConv.mon_thousands_sep = strdup(extlconv->mon_thousands_sep);
!     CurrentLocaleConv.negative_sign = strdup(extlconv->negative_sign);
!     CurrentLocaleConv.positive_sign = strdup(extlconv->positive_sign);
      CurrentLocaleConv.n_sign_posn = extlconv->n_sign_posn;

!     /* Try to restore internal settings */
      if (save_lc_monetary)
      {
          setlocale(LC_MONETARY, save_lc_monetary);
--- 512,588 ----
      if (save_lc_numeric)
          save_lc_numeric = pstrdup(save_lc_numeric);

+ #ifdef    WIN32
+     /*
+      *    Ideally, the db server encoding and locale settings would
+      *    always match.  Unfortunately, WIN32 does not support UTF-8
+      *    values for setlocale(), even though PostgreSQL runs fine with
+      *    a UTF-8 encoding on Windows:
+      *
+      *        http://msdn.microsoft.com/en-us/library/x99tb11d.aspx
+      *
+      *    Therefore, we must set LC_CTYPE to match LC_NUMERIC and
+      *    LC_MONETARY, call localeconv(), and use mbstowcs() to
+      *    convert the locale-aware string, e.g. Euro symbol, which
+      *    is not in UTF-8 to the server encoding.
+      */
+
+     if ((save_lc_ctype = setlocale(LC_CTYPE, NULL)) != NULL)
+     {
+         save_lc_ctype = pstrdup(save_lc_ctype);
+         /* Set LC_CTYPE to match LC_MONETARY? */
+         if (pg_strcasecmp(save_lc_ctype, locale_monetary) != 0)
+             setlocale(LC_CTYPE, locale_monetary);
+     }
+ #endif
+
      setlocale(LC_MONETARY, locale_monetary);
      setlocale(LC_NUMERIC, locale_numeric);
!     /*
!      *    Get formatting information for LC_MONETARY, and LC_NUMERIC if they
!      *    are the same.
!      */
      extlconv = localeconv();

      /*
!      * Must copy all values since restoring internal settings might overwrite
       * localeconv()'s results.
       */
      CurrentLocaleConv = *extlconv;
!
!     /* The first argument of db_encoding_strdup() is only used on WIN32 */
!     CurrentLocaleConv.currency_symbol = db_encoding_strdup("currency_symbol", extlconv->currency_symbol);
!     CurrentLocaleConv.int_curr_symbol = db_encoding_strdup("int_curr_symbol", extlconv->int_curr_symbol);
!     CurrentLocaleConv.mon_decimal_point = db_encoding_strdup("mon_decimal_point", extlconv->mon_decimal_point);
      CurrentLocaleConv.mon_grouping = strdup(extlconv->mon_grouping);
!     CurrentLocaleConv.mon_thousands_sep = db_encoding_strdup("mon_thousands_sep", extlconv->mon_thousands_sep);
!     CurrentLocaleConv.negative_sign = db_encoding_strdup("negative_sign", extlconv->negative_sign);
!     CurrentLocaleConv.positive_sign = db_encoding_strdup("positive_sign", extlconv->positive_sign);
      CurrentLocaleConv.n_sign_posn = extlconv->n_sign_posn;

! #ifdef    WIN32
!     if (save_lc_ctype && pg_strcasecmp(locale_numeric, locale_monetary) != 0)
!     {
!         setlocale(LC_CTYPE, locale_numeric);
!         /* Get formatting information for LC_NUMERIC with matching LC_CTYPE */
!         extlconv = localeconv();
!     }
! #endif
!
!     CurrentLocaleConv.decimal_point = db_encoding_strdup("decimal_point", extlconv->decimal_point);
!     CurrentLocaleConv.grouping = strdup(extlconv->grouping);
!     CurrentLocaleConv.thousands_sep = db_encoding_strdup("thousands_sep", extlconv->thousands_sep);
!
!     /*
!      *    Restore internal settings
!      */
! #ifdef    WIN32
!     if (save_lc_ctype)
!     {
!         setlocale(LC_CTYPE, save_lc_ctype);
!         pfree(save_lc_ctype);
!     }
! #endif
      if (save_lc_monetary)
      {
          setlocale(LC_MONETARY, save_lc_monetary);
***************
*** 455,483 ****

  #ifdef WIN32
  /*
!  * On win32, strftime() returns the encoding in CP_ACP, which is likely
!  * different from SERVER_ENCODING. This is especially important in Japanese
!  * versions of Windows which will use SJIS encoding, which we don't support
!  * as a server encoding.
!  *
!  * Replace strftime() with a version that gets the string in UTF16 and then
!  * converts it to the appropriate encoding as necessary.
   *
   * Note that this only affects the calls to strftime() in this file, which are
   * used to get the locale-aware strings. Other parts of the backend use
   * pg_strftime(), which isn't locale-aware and does not need to be replaced.
   */
  static size_t
! strftime_win32(char *dst, size_t dstlen, const wchar_t *format, const struct tm * tm)
  {
!     size_t        len;
      wchar_t        wbuf[MAX_L10N_DATA];
-     int            encoding;

!     encoding = GetDatabaseEncoding();
!
!     len = wcsftime(wbuf, MAX_L10N_DATA, format, tm);
!     if (len == 0)

          /*
           * strftime call failed - return 0 with the contents of dst
--- 601,628 ----

  #ifdef WIN32
  /*
!  * On WIN32, strftime() returns the encoding in CP_ACP (the default
!  * operating system codpage for that computer), which is likely different
!  * from SERVER_ENCODING.  This is especially important in Japanese versions
!  * of Windows which will use SJIS encoding, which we don't support as a
!  * server encoding.
!  *
!  * So, instead of using strftime(), use wcsftime() to return the value in
!  * wide characters (internally UTF16) and then convert it to the appropriate
!  * database encoding.
   *
   * Note that this only affects the calls to strftime() in this file, which are
   * used to get the locale-aware strings. Other parts of the backend use
   * pg_strftime(), which isn't locale-aware and does not need to be replaced.
   */
  static size_t
! strftime_win32(char *dst, size_t dstlen, const wchar_t *format, const struct tm *tm)
  {
!     size_t        wchars;
      wchar_t        wbuf[MAX_L10N_DATA];

!     wchars = wcsftime(wbuf, MAX_L10N_DATA, format, tm);
!     if (wchars == 0)

          /*
           * strftime call failed - return 0 with the contents of dst
***************
*** 485,511 ****
           */
          return 0;

!     len = WideCharToMultiByte(CP_UTF8, 0, wbuf, len, dst, dstlen, NULL, NULL);
!     if (len == 0)
!         elog(ERROR,
!              "could not convert string to UTF-8:error %lu", GetLastError());
!
!     dst[len] = '\0';
!     if (encoding != PG_UTF8)
!     {
!         char       *convstr = pg_do_encoding_conversion(dst, len, PG_UTF8, encoding);
!
!         if (dst != convstr)
!         {
!             strlcpy(dst, convstr, dstlen);
!             len = strlen(dst);
!         }
!     }
!
!     return len;
  }

  #define strftime(a,b,c,d) strftime_win32(a,b,L##c,d)
  #endif   /* WIN32 */


--- 630,641 ----
           */
          return 0;

!     return win32_wchar_to_db_encoding(wbuf, wchars, dst, dstlen);
  }

+ /* redefine strftime() */
  #define strftime(a,b,c,d) strftime_win32(a,b,L##c,d)
+
  #endif   /* WIN32 */


***************
*** 533,542 ****
      elog(DEBUG3, "cache_locale_time() executed; locale: \"%s\"", locale_time);

  #ifdef WIN32
!     /* set user's value of ctype locale */
      save_lc_ctype = setlocale(LC_CTYPE, NULL);
      if (save_lc_ctype)
          save_lc_ctype = pstrdup(save_lc_ctype);

      setlocale(LC_CTYPE, locale_time);
  #endif
--- 663,674 ----
      elog(DEBUG3, "cache_locale_time() executed; locale: \"%s\"", locale_time);

  #ifdef WIN32
!     /* See the WIN32 comment near the top of PGLC_localeconv() */
      save_lc_ctype = setlocale(LC_CTYPE, NULL);
      if (save_lc_ctype)
          save_lc_ctype = pstrdup(save_lc_ctype);
+     else
+         save_lc_ctype = pstrdup("");

      setlocale(LC_CTYPE, locale_time);
  #endif

Re: [GENERAL] trouble with to_char('L')

From
Takahiro Itagaki
Date:
Bruce Momjian <bruce@momjian.us> wrote:

> OK, I have created a new function, win32_wchar_to_db_encoding(), to
> share the conversion from wide characters to the database encoding.
> New patch attached.

Since 9.0 has GetPlatformEncoding() for the purpose, we could simplify
db_encoding_strdup() with the function. Like this:

static char *
db_encoding_strdup(const char *str)
{char   *pstr;char   *mstr;
/* convert the string to the database encoding */pstr = (char *) pg_do_encoding_conversion(
(unsignedchar *) str, strlen(str),                    GetPlatformEncoding(), GetDatabaseEncoding());mstr =
strdup(pstr);if(pstr != str)    pfree(pstr);
 
return mstr;
}

I beleive the code is harmless on all platforms and we can use it
instead of strdup() without any #ifdef WIN32 quotes.


BTW, I found we'd better to add "ANSI_X3.4-1968" as an alias for
PG_SQL_ASCII. My Fedora 12 returns the name when --no-locale is used.

Regards,
---
Takahiro Itagaki
NTT Open Source Software Center




Re: [GENERAL] trouble with to_char('L')

From
Bruce Momjian
Date:
Takahiro Itagaki wrote:
> 
> Bruce Momjian <bruce@momjian.us> wrote:
> 
> > OK, I have created a new function, win32_wchar_to_db_encoding(), to
> > share the conversion from wide characters to the database encoding.
> > New patch attached.
> 
> Since 9.0 has GetPlatformEncoding() for the purpose, we could simplify
> db_encoding_strdup() with the function. Like this:
> 
> static char *
> db_encoding_strdup(const char *str)
> {
>     char   *pstr;
>     char   *mstr;
> 
>     /* convert the string to the database encoding */
>     pstr = (char *) pg_do_encoding_conversion(
>                         (unsigned char *) str, strlen(str),
>                         GetPlatformEncoding(), GetDatabaseEncoding());
>     mstr = strdup(pstr);
>     if (pstr != str)
>         pfree(pstr);
> 
>     return mstr;
> }
> 
> I beleive the code is harmless on all platforms and we can use it
> instead of strdup() without any #ifdef WIN32 quotes.

OK, I don't have any Win32 people testing this patch so if we want this
fixed for 9.0 someone is going to have to test my patch to see that it
works.  Can you make the adjustments suggested above to my patch and
test it to see that it works so we can apply it for 9.0?

> BTW, I found we'd better to add "ANSI_X3.4-1968" as an alias for
> PG_SQL_ASCII. My Fedora 12 returns the name when --no-locale is used.

OK.

--  Bruce Momjian  <bruce@momjian.us>        http://momjian.us EnterpriseDB
http://enterprisedb.com
 PG East:  http://www.enterprisedb.com/community/nav-pg-east-2010.do


Re: [GENERAL] trouble with to_char('L')

From
Takahiro Itagaki
Date:
Bruce Momjian <bruce@momjian.us> wrote:

> Takahiro Itagaki wrote:
> > Since 9.0 has GetPlatformEncoding() for the purpose, we could simplify
> > db_encoding_strdup() with the function. Like this:
>
> OK, I don't have any Win32 people testing this patch so if we want this
> fixed for 9.0 someone is going to have to test my patch to see that it
> works.  Can you make the adjustments suggested above to my patch and
> test it to see that it works so we can apply it for 9.0?

Here is a full patch that can be applied cleanly to HEAD.
Can anyone test it on Windows?

I'm not sure why temporary changes of lc_ctype was required in the
original patch. The codes are not included in my patch, but please
notice me it is still needed.

Regards,
---
Takahiro Itagaki
NTT Open Source Software Center


Attachment

Re: [GENERAL] trouble with to_char('L')

From
Bruce Momjian
Date:
Takahiro Itagaki wrote:
> 
> Bruce Momjian <bruce@momjian.us> wrote:
> 
> > Takahiro Itagaki wrote:
> > > Since 9.0 has GetPlatformEncoding() for the purpose, we could simplify
> > > db_encoding_strdup() with the function. Like this:
> > 
> > OK, I don't have any Win32 people testing this patch so if we want this
> > fixed for 9.0 someone is going to have to test my patch to see that it
> > works.  Can you make the adjustments suggested above to my patch and
> > test it to see that it works so we can apply it for 9.0?
> 
> Here is a full patch that can be applied cleanly to HEAD.
> Can anyone test it on Windows?
> 
> I'm not sure why temporary changes of lc_ctype was required in the
> original patch. The codes are not included in my patch, but please
> notice me it is still needed.

Sorry for the delay in replying to you.

I considered your idea of using the existing Postgres encoding
conversion routines to do the conversion of localenv() strings, but
found two problems.

First, GetPlatformEncoding() caches its result, so it assumes the
LC_CTYPE never changes for the server, while fixing this issue actually
requires us to change LC_CTYPE.  We could avoid the caching but that
then involves complex table lookups, etc, which seems overly complex:

+       /* convert the string to the database encoding */
+       pstr = (char *) pg_do_encoding_conversion(
+                                               (unsigned char *) str, strlen(str),
+                                               GetPlatformEncoding(), GetDatabaseEncoding());

Second, having our backend routines do the conversion seems wrong
because it is possible for someone to set LC_MONETARY to an encoding
that our database does not understand, e.g. UTF16, but one that WIN32
can convert to a valid encoding.

The reason we are doing all this is because of this updated comment in
my patch:
ftp://momjian.us/pub/postgresql/mypatches/pg_locale

+    *  Ideally, monetary and numeric local symbols could be returned in
+    *  any server encoding.  Unfortunately, the WIN32 API does not allow
+    *  setlocale() to return values in a codepage/CTYPE that uses more
+    *  than two bytes per character, like UTF-8:
+    *
+    *      http://msdn.microsoft.com/en-us/library/x99tb11d.aspx
+    *
+    *  Evidently, LC_CTYPE allows us to control the encoding used
+    *  for strings returned by localeconv().  The Open Group
+    *  standard, mentioned at the top of this C file, doesn't
+    *  explicitly state this.
+    *
+    *  Therefore, we set LC_CTYPE to match LC_NUMERIC and
+    *  LC_MONETARY, call localeconv(), and use mbstowcs() to
+    *  convert the locale-aware string, e.g. Euro symbol (which
+    *  is not in UTF-8), to the server encoding.

One new idea would be to set LC_CTYPE to UTF16/widechars unconditionally
on Win32 and then just convert that always to the server encoding with
win32_wchar_to_db_encoding(), instead of using the encoding from
LC_MONETARY to set LC_CTYPE and having to do double-conversion.

--  Bruce Momjian  <bruce@momjian.us>        http://momjian.us EnterpriseDB
http://enterprisedb.com
 PG East:  http://www.enterprisedb.com/community/nav-pg-east-2010.do


Re: [GENERAL] trouble with to_char('L')

From
Magnus Hagander
Date:
On Mon, Mar 22, 2010 at 9:14 PM, Bruce Momjian <bruce@momjian.us> wrote:
> Takahiro Itagaki wrote:
>>
>> Bruce Momjian <bruce@momjian.us> wrote:
>>
>> > Takahiro Itagaki wrote:
>> > > Since 9.0 has GetPlatformEncoding() for the purpose, we could simplify
>> > > db_encoding_strdup() with the function. Like this:
>> >
>> > OK, I don't have any Win32 people testing this patch so if we want this
>> > fixed for 9.0 someone is going to have to test my patch to see that it
>> > works.  Can you make the adjustments suggested above to my patch and
>> > test it to see that it works so we can apply it for 9.0?
>>
>> Here is a full patch that can be applied cleanly to HEAD.
>> Can anyone test it on Windows?
>>
>> I'm not sure why temporary changes of lc_ctype was required in the
>> original patch. The codes are not included in my patch, but please
>> notice me it is still needed.
>
> Sorry for the delay in replying to you.
>
> I considered your idea of using the existing Postgres encoding
> conversion routines to do the conversion of localenv() strings, but
> found two problems.
>
> First, GetPlatformEncoding() caches its result, so it assumes the
> LC_CTYPE never changes for the server, while fixing this issue actually
> requires us to change LC_CTYPE.  We could avoid the caching but that
> then involves complex table lookups, etc, which seems overly complex:
>
> +       /* convert the string to the database encoding */
> +       pstr = (char *) pg_do_encoding_conversion(
> +                                               (unsigned char *) str, strlen(str),
> +                                               GetPlatformEncoding(), GetDatabaseEncoding());
>
> Second, having our backend routines do the conversion seems wrong
> because it is possible for someone to set LC_MONETARY to an encoding
> that our database does not understand, e.g. UTF16, but one that WIN32
> can convert to a valid encoding.
>
> The reason we are doing all this is because of this updated comment in
> my patch:
>
>        ftp://momjian.us/pub/postgresql/mypatches/pg_locale
>
> +    *  Ideally, monetary and numeric local symbols could be returned in
> +    *  any server encoding.  Unfortunately, the WIN32 API does not allow
> +    *  setlocale() to return values in a codepage/CTYPE that uses more
> +    *  than two bytes per character, like UTF-8:
> +    *
> +    *      http://msdn.microsoft.com/en-us/library/x99tb11d.aspx
> +    *
> +    *  Evidently, LC_CTYPE allows us to control the encoding used
> +    *  for strings returned by localeconv().  The Open Group
> +    *  standard, mentioned at the top of this C file, doesn't
> +    *  explicitly state this.
> +    *
> +    *  Therefore, we set LC_CTYPE to match LC_NUMERIC and
> +    *  LC_MONETARY, call localeconv(), and use mbstowcs() to
> +    *  convert the locale-aware string, e.g. Euro symbol (which
> +    *  is not in UTF-8), to the server encoding.
>
> One new idea would be to set LC_CTYPE to UTF16/widechars unconditionally
> on Win32 and then just convert that always to the server encoding with
> win32_wchar_to_db_encoding(), instead of using the encoding from
> LC_MONETARY to set LC_CTYPE and having to do double-conversion.

So, hugely late, reviving this thread.

Ideally, we should definitely consider doing that. Internally, Windows
will do it in UTF16 anyway. So we're basically doing
UTF16->db->UTF16->UTF8->db or something like that with this patch.

But I'm unsure how that would work. We're talking about the output of
localeconv(), right? I don't see a version of localeconv() that does
wide chars anywhere. (You can't just set LC_CTYPE and use the regular
function - Windows has a separate set of functions for dealing with
UTF16).

Looking at the patch, you're passing "item" to db_encoding_strdup()
but it doesn't seem to be used anywhere. Leftover from previous
experiments, or forgot to use it? Perhaps you intended for it to be in
the error messages?

Also, won't this need special-casing for UTF8? Per comment in
mbutils.c, wcstombs() doesn't work for UTF8 encodings - you need to
use MultiByteToWideChar().

I also note that we have char2wchar() already - we should perhaps just
call that? Or will that use the wrong locale?

-- Magnus HaganderMe: http://www.hagander.net/Work: http://www.redpill-linpro.com/


Re: [GENERAL] trouble with to_char('L')

From
Takahiro Itagaki
Date:
Magnus Hagander <magnus@hagander.net> wrote:

> But I'm unsure how that would work. We're talking about the output of
> localeconv(), right? I don't see a version of localeconv() that does
> wide chars anywhere. (You can't just set LC_CTYPE and use the regular
> function - Windows has a separate set of functions for dealing with
> UTF16).

Yeah, msvcrt doesn't have wlocaleconv :-( . Since localeconv() returns
characters in the encoding specified in LC_TYPE, we need to hande the
issue with codes something like:
   1. setlocale(LC_CTYPE, lc_monetary)   2. setlocale(LC_MONETARY, lc_monetary)   3. lc = localeconv()   4.
pg_do_encoding_conversion(lc->xxx,        FROM pg_get_encoding_from_locale(lc_monetary),         TO
GetDatabaseEncoding())  5. Revert LC_CTYPE and LC_MONETARY.
 


Another idea is to use GetLocaleInfoW() [1], that is win32 native locale
functions, instead of the libc one. It returns locale characters in wide
chars, so we can safely convert them as UTF16->UTF8->db. But it requires
an additional branch in our locale codes only for Windows.

[1] http://msdn.microsoft.com/en-us/library/dd318101

Regards,
---
Takahiro Itagaki
NTT Open Source Software Center




Re: [GENERAL] trouble with to_char('L')

From
Magnus Hagander
Date:
On Mon, Apr 19, 2010 at 03:59, Takahiro Itagaki
<itagaki.takahiro@oss.ntt.co.jp> wrote:
>
> Magnus Hagander <magnus@hagander.net> wrote:
>
>> But I'm unsure how that would work. We're talking about the output of
>> localeconv(), right? I don't see a version of localeconv() that does
>> wide chars anywhere. (You can't just set LC_CTYPE and use the regular
>> function - Windows has a separate set of functions for dealing with
>> UTF16).
>
> Yeah, msvcrt doesn't have wlocaleconv :-( . Since localeconv() returns
> characters in the encoding specified in LC_TYPE, we need to hande the
> issue with codes something like:
>
>    1. setlocale(LC_CTYPE, lc_monetary)
>    2. setlocale(LC_MONETARY, lc_monetary)
>    3. lc = localeconv()
>    4. pg_do_encoding_conversion(lc->xxx,
>          FROM pg_get_encoding_from_locale(lc_monetary),
>          TO GetDatabaseEncoding())
>    5. Revert LC_CTYPE and LC_MONETARY.
>
>
> Another idea is to use GetLocaleInfoW() [1], that is win32 native locale
> functions, instead of the libc one. It returns locale characters in wide
> chars, so we can safely convert them as UTF16->UTF8->db. But it requires
> an additional branch in our locale codes only for Windows.

If we can go UTF16->db directly, it might be a good idea. If we're
going via UTF8 anyway, I doubt it's going to be worth it.

Let's work off what we have now to start with at least. Bruce, can you
comment on that thing about the extra parameter? And UTF8?

-- Magnus HaganderMe: http://www.hagander.net/Work: http://www.redpill-linpro.com/


Re: [GENERAL] trouble with to_char('L')

From
Takahiro Itagaki
Date:
Magnus Hagander <magnus@hagander.net> wrote:

> > 1. setlocale(LC_CTYPE, lc_monetary)
> > 2. setlocale(LC_MONETARY, lc_monetary)
> > 3. lc = localeconv()
> > 4. pg_do_encoding_conversion(lc->xxx,
> >      FROM pg_get_encoding_from_locale(lc_monetary),
> >      TO GetDatabaseEncoding())
> > 5. Revert LC_CTYPE and LC_MONETARY.

A patch attached for the above straightforwardly. Does this work?
Note that #ifdef WIN32 parts in the patch are harmless on other platforms
even if they are enabled.

> Let's work off what we have now to start with at least. Bruce, can you
> comment on that thing about the extra parameter? And UTF8?

Regards,
---
Takahiro Itagaki
NTT Open Source Software Center


Attachment

Re: [GENERAL] trouble with to_char('L')

From
Bruce Momjian
Date:
Magnus Hagander wrote:
> > One new idea would be to set LC_CTYPE to UTF16/widechars unconditionally
> > on Win32 and then just convert that always to the server encoding with
> > win32_wchar_to_db_encoding(), instead of using the encoding from
> > LC_MONETARY to set LC_CTYPE and having to do double-conversion.
> 
> So, hugely late, reviving this thread.
> 
> Ideally, we should definitely consider doing that. Internally, Windows
> will do it in UTF16 anyway. So we're basically doing
> UTF16->db->UTF16->UTF8->db or something like that with this patch.
> 
> But I'm unsure how that would work. We're talking about the output of
> localeconv(), right? I don't see a version of localeconv() that does
> wide chars anywhere. (You can't just set LC_CTYPE and use the regular
> function - Windows has a separate set of functions for dealing with
> UTF16).

I thought there was an LC_CTYPE for UTF16 that we could use without a
wide version of that function.  If not, forget that idea.

> Looking at the patch, you're passing "item" to db_encoding_strdup()
> but it doesn't seem to be used anywhere. Leftover from previous
> experiments, or forgot to use it? Perhaps you intended for it to be in
> the error messages?

It originally was in the error message but can be removed.  I have now
removed 'item' from my version of the patch.

> Also, won't this need special-casing for UTF8? Per comment in
> mbutils.c, wcstombs() doesn't work for UTF8 encodings - you need to
> use MultiByteToWideChar().

Well, we don't support UTF8 for any of the non-encoding locales, e.g.
monetary, numeric, so I never considered that we would support it.  If
we did support it, we would have to _pick_ a locale that is <= 2 bytes
per character and use that, and then convert to UTF8, but what locale
would we pick?  They could use a LC_TYPE that is <= 2 bytes and a
numeric that is UTF8, but I never suspected we would want to support
that, and we would need some logic to detect that case.

> I also note that we have char2wchar() already - we should perhaps just
> call that? Or will that use the wrong locale?

I see char2wchar() calling GetDatabaseEncoding() right away, which does
use the cached value for the server encoding, so I don't think it will
work.  We can't use our existing routines to convert _from_ the current
encoding to wide characters (because our numeric encoding might not
match the server encoding).  However, we can use existing code that
converts from wide to the server encoding, perhaps replacing
win32_wchar_to_db_encoding().

--  Bruce Momjian  <bruce@momjian.us>        http://momjian.us EnterpriseDB
http://enterprisedb.com


Re: [GENERAL] trouble with to_char('L')

From
Bruce Momjian
Date:
Takahiro Itagaki wrote:
> 
> Magnus Hagander <magnus@hagander.net> wrote:
> 
> > > 1. setlocale(LC_CTYPE, lc_monetary)
> > > 2. setlocale(LC_MONETARY, lc_monetary)
> > > 3. lc = localeconv()
> > > 4. pg_do_encoding_conversion(lc->xxx,
> > >      FROM pg_get_encoding_from_locale(lc_monetary),
> > >      TO GetDatabaseEncoding())
> > > 5. Revert LC_CTYPE and LC_MONETARY.
> 
> A patch attached for the above straightforwardly. Does this work?
> Note that #ifdef WIN32 parts in the patch are harmless on other platforms
> even if they are enabled.

I like this patch.  Instead of having special code to convert from the
_current_ locale, you pass the encoding name to our routines.  This does
mean we are bound by supporting only the encodings PG supports, not the
full range of Win32 encodings, but that seems fine.

--  Bruce Momjian  <bruce@momjian.us>        http://momjian.us EnterpriseDB
http://enterprisedb.com


Re: [GENERAL] trouble with to_char('L')

From
Bruce Momjian
Date:
Magnus Hagander wrote:
> > Another idea is to use GetLocaleInfoW() [1], that is win32 native locale
> > functions, instead of the libc one. It returns locale characters in wide
> > chars, so we can safely convert them as UTF16->UTF8->db. But it requires
> > an additional branch in our locale codes only for Windows.
> 
> If we can go UTF16->db directly, it might be a good idea. If we're
> going via UTF8 anyway, I doubt it's going to be worth it.
> 
> Let's work off what we have now to start with at least. Bruce, can you
> comment on that thing about the extra parameter? And UTF8?

I do like the idea of using UTF16 directly because that would eliminate
our need to even set LC_CTYPE for Win32 in this routine.  That would
also eliminate any need to refer to the encoding for numeric/monetary,
so we could get rid of the odd case where their encoding is UTF8 but
their numeric/monetary locale settings have to use a non-UTF8 encoding. 
For example, the original bug report has these locale settings:
http://archives.postgresql.org/pgsql-general/2009-04/msg00829.php
psql (PostgreSQL) 8.3.7server_version 8.3.7server_encoding UTF8client_encoding win1252lc_numeric Finnish,
Finlandlc_monetaryFinnish, Finland
 

but really needed to use "Finnish_Finland.1252":
http://archives.postgresql.org/pgsql-general/2009-04/msg00859.phpHowever, I noticed that both lc_collate and lc_ctype
areset toFinnish_Finland.1252 by the installer. Should I have just run initdbwith --locale fi_FI.UTF8 at the very
start?The to_char('L') worksfine with a database with win1252 encoding.
 

Of course, that still does not work with our current CVS code if the
database encoding is UTF8, which is what we are trying to fix now.

I am not even sure how users set these things properly but I assume the
installer does all that magic.  And, of course, if someone manually runs
initdb on Windows, they can easily set things wrong.

Magnus, if I remember correctly, all our non-UTF8 to UTF8 conversion
already has to pass through UTF16 as an intermediary case, so going to
UTF16 directly seems fine.

--  Bruce Momjian  <bruce@momjian.us>        http://momjian.us EnterpriseDB
http://enterprisedb.com


Re: [GENERAL] trouble with to_char('L')

From
Hiroshi Inoue
Date:
Takahiro Itagaki wrote:
> Magnus Hagander <magnus@hagander.net> wrote:
> 
>>> 1. setlocale(LC_CTYPE, lc_monetary)
>>> 2. setlocale(LC_MONETARY, lc_monetary)
>>> 3. lc = localeconv()
>>> 4. pg_do_encoding_conversion(lc->xxx,
>>>      FROM pg_get_encoding_from_locale(lc_monetary),
>>>      TO GetDatabaseEncoding())
>>> 5. Revert LC_CTYPE and LC_MONETARY.
> 
> A patch attached for the above straightforwardly. Does this work?

I have 2 questions about this patch.

1. How does it work when LC_MONETARY and LC_NUMERIC are different?
2. Calling db_encoding_strdup() for lconv->grouping is appropriate?

regards,
Hiroshi Inoue

> Note that #ifdef WIN32 parts in the patch are harmless on other platforms
> even if they are enabled.
> 
>> Let's work off what we have now to start with at least. Bruce, can you
>> comment on that thing about the extra parameter? And UTF8?
> 
> Regards,
> ---
> Takahiro Itagaki
> NTT Open Source Software Center


Re: [GENERAL] trouble with to_char('L')

From
Takahiro Itagaki
Date:
Hiroshi Inoue <inoue@tpf.co.jp> wrote:

> 1. How does it work when LC_MONETARY and LC_NUMERIC are different?

I think it is rarely used, but possible. Fixed.

> 2. Calling db_encoding_strdup() for lconv->grouping is appropriate?

Ah, we didn't need it. Removed.

Revised patch attached. Please test it.

Regards,
---
Takahiro Itagaki
NTT Open Source Software Center


Attachment

Re: [GENERAL] trouble with to_char('L')

From
Takahiro Itagaki
Date:
Takahiro Itagaki <itagaki.takahiro@oss.ntt.co.jp> wrote:

> Revised patch attached. Please test it.

I applied this version of the patch.
Please check wheter the bug is fixed and any buildfarm failures.

Regards,
---
Takahiro Itagaki
NTT Open Source Software Center




Re: [GENERAL] trouble with to_char('L')

From
Bruce Momjian
Date:
Takahiro Itagaki wrote:
>
> Takahiro Itagaki <itagaki.takahiro@oss.ntt.co.jp> wrote:
>
> > Revised patch attached. Please test it.
>
> I applied this version of the patch.
> Please check wheter the bug is fixed and any buildfarm failures.

Great.  I have merged in my C comments into the code with the attached
patch so we remember why the code is setup as it is.

One thing I am confused about is that, for Win32, our numeric/monetary
handling sets lc_ctype to match numeric/monetary, while our time code in
the same file uses that method _and_ uses wcsftime() to return the value
in wide characters.  So, why do we do both for time?  Is there any value
to that?

Seems we should do the same for both numeric/monetary and time.

--
  Bruce Momjian  <bruce@momjian.us>        http://momjian.us
  EnterpriseDB                             http://enterprisedb.com
Index: src/backend/utils/adt/pg_locale.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/utils/adt/pg_locale.c,v
retrieving revision 1.54
diff -c -c -r1.54 pg_locale.c
*** src/backend/utils/adt/pg_locale.c    22 Apr 2010 01:55:52 -0000    1.54
--- src/backend/utils/adt/pg_locale.c    24 Apr 2010 22:43:53 -0000
***************
*** 41,46 ****
--- 41,50 ----
   * DOES NOT WORK RELIABLY: on some platforms the second setlocale() call
   * will change the memory save is pointing at.    To do this sort of thing
   * safely, you *must* pstrdup what setlocale returns the first time.
+  *
+  * FYI, The Open Group locale standard is defined here:
+  *
+  *  http://www.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap07.html
   *----------
   */

***************
*** 424,430 ****
      char       *grouping;
      char       *thousands_sep;
      int            encoding;
-
  #ifdef WIN32
      char       *save_lc_ctype;
  #endif
--- 428,433 ----
***************
*** 435,459 ****

      free_struct_lconv(&CurrentLocaleConv);

!     /* Set user's values of monetary and numeric locales */
      save_lc_monetary = setlocale(LC_MONETARY, NULL);
      if (save_lc_monetary)
          save_lc_monetary = pstrdup(save_lc_monetary);
      save_lc_numeric = setlocale(LC_NUMERIC, NULL);
      if (save_lc_numeric)
          save_lc_numeric = pstrdup(save_lc_numeric);

  #ifdef WIN32
!     /* set user's value of ctype locale */
      save_lc_ctype = setlocale(LC_CTYPE, NULL);
      if (save_lc_ctype)
          save_lc_ctype = pstrdup(save_lc_ctype);
- #endif

!     /* Get formatting information for numeric */
! #ifdef WIN32
      setlocale(LC_CTYPE, locale_numeric);
  #endif
      setlocale(LC_NUMERIC, locale_numeric);
      extlconv = localeconv();
      encoding = pg_get_encoding_from_locale(locale_numeric);
--- 438,485 ----

      free_struct_lconv(&CurrentLocaleConv);

!     /* Save user's values of monetary and numeric locales */
      save_lc_monetary = setlocale(LC_MONETARY, NULL);
      if (save_lc_monetary)
          save_lc_monetary = pstrdup(save_lc_monetary);
+
      save_lc_numeric = setlocale(LC_NUMERIC, NULL);
      if (save_lc_numeric)
          save_lc_numeric = pstrdup(save_lc_numeric);

  #ifdef WIN32
!    /*
!     *  Ideally, monetary and numeric local symbols could be returned in
!     *  any server encoding.  Unfortunately, the WIN32 API does not allow
!     *  setlocale() to return values in a codepage/CTYPE that uses more
!     *  than two bytes per character, like UTF-8:
!     *
!     *      http://msdn.microsoft.com/en-us/library/x99tb11d.aspx
!     *
!     *  Evidently, LC_CTYPE allows us to control the encoding used
!     *  for strings returned by localeconv().  The Open Group
!     *  standard, mentioned at the top of this C file, doesn't
!     *  explicitly state this.
!     *
!     *  Therefore, we set LC_CTYPE to match LC_NUMERIC or LC_MONETARY
!     *  (which cannot be UTF8), call localeconv(), and then convert from
!     *  the numeric/monitary LC_CTYPE to the server encoding.  One
!     *  example use of this is for the Euro symbol.
!     *
!     *  Perhaps someday we will use GetLocaleInfoW() which returns values
!     *  in UTF16 and convert from that.
!     */
!
!     /* save user's value of ctype locale */
      save_lc_ctype = setlocale(LC_CTYPE, NULL);
      if (save_lc_ctype)
          save_lc_ctype = pstrdup(save_lc_ctype);

!     /* use numeric to set the ctype */
      setlocale(LC_CTYPE, locale_numeric);
  #endif
+
+     /* Get formatting information for numeric */
      setlocale(LC_NUMERIC, locale_numeric);
      extlconv = localeconv();
      encoding = pg_get_encoding_from_locale(locale_numeric);
***************
*** 462,471 ****
      thousands_sep = db_encoding_strdup(encoding, extlconv->thousands_sep);
      grouping = strdup(extlconv->grouping);

-     /* Get formatting information for monetary */
  #ifdef WIN32
      setlocale(LC_CTYPE, locale_monetary);
  #endif
      setlocale(LC_MONETARY, locale_monetary);
      extlconv = localeconv();
      encoding = pg_get_encoding_from_locale(locale_monetary);
--- 488,499 ----
      thousands_sep = db_encoding_strdup(encoding, extlconv->thousands_sep);
      grouping = strdup(extlconv->grouping);

  #ifdef WIN32
+     /* use monetary to set the ctype */
      setlocale(LC_CTYPE, locale_monetary);
  #endif
+
+     /* Get formatting information for monetary */
      setlocale(LC_MONETARY, locale_monetary);
      extlconv = localeconv();
      encoding = pg_get_encoding_from_locale(locale_monetary);
***************
*** 500,506 ****
      }

  #ifdef WIN32
!     /* try to restore internal ctype settings */
      if (save_lc_ctype)
      {
          setlocale(LC_CTYPE, save_lc_ctype);
--- 528,534 ----
      }

  #ifdef WIN32
!     /* Try to restore internal ctype settings */
      if (save_lc_ctype)
      {
          setlocale(LC_CTYPE, save_lc_ctype);
***************
*** 514,526 ****

  #ifdef WIN32
  /*
!  * On win32, strftime() returns the encoding in CP_ACP, which is likely
!  * different from SERVER_ENCODING. This is especially important in Japanese
!  * versions of Windows which will use SJIS encoding, which we don't support
!  * as a server encoding.
   *
!  * Replace strftime() with a version that gets the string in UTF16 and then
!  * converts it to the appropriate encoding as necessary.
   *
   * Note that this only affects the calls to strftime() in this file, which are
   * used to get the locale-aware strings. Other parts of the backend use
--- 542,556 ----

  #ifdef WIN32
  /*
!  * On WIN32, strftime() returns the encoding in CP_ACP (the default
!  * operating system codpage for that computer), which is likely different
!  * from SERVER_ENCODING.  This is especially important in Japanese versions
!  * of Windows which will use SJIS encoding, which we don't support as a
!  * server encoding.
   *
!  * So, instead of using strftime(), use wcsftime() to return the value in
!  * wide characters (internally UTF16) and then convert it to the appropriate
!  * database encoding.
   *
   * Note that this only affects the calls to strftime() in this file, which are
   * used to get the locale-aware strings. Other parts of the backend use
***************
*** 537,543 ****

      len = wcsftime(wbuf, MAX_L10N_DATA, format, tm);
      if (len == 0)
-
          /*
           * strftime call failed - return 0 with the contents of dst
           * unspecified
--- 567,572 ----
***************
*** 564,570 ****
--- 593,601 ----
      return len;
  }

+ /* redefine strftime() */
  #define strftime(a,b,c,d) strftime_win32(a,b,L##c,d)
+
  #endif   /* WIN32 */


***************
*** 580,586 ****
      char        buf[MAX_L10N_DATA];
      char       *ptr;
      int            i;
-
  #ifdef WIN32
      char       *save_lc_ctype;
  #endif
--- 611,616 ----
***************
*** 591,610 ****

      elog(DEBUG3, "cache_locale_time() executed; locale: \"%s\"", locale_time);

  #ifdef WIN32
!     /* set user's value of ctype locale */
      save_lc_ctype = setlocale(LC_CTYPE, NULL);
      if (save_lc_ctype)
          save_lc_ctype = pstrdup(save_lc_ctype);

      setlocale(LC_CTYPE, locale_time);
  #endif

-     /* set user's value of time locale */
-     save_lc_time = setlocale(LC_TIME, NULL);
-     if (save_lc_time)
-         save_lc_time = pstrdup(save_lc_time);
-
      setlocale(LC_TIME, locale_time);

      timenow = time(NULL);
--- 621,642 ----

      elog(DEBUG3, "cache_locale_time() executed; locale: \"%s\"", locale_time);

+     /* save user's value of time locale */
+     save_lc_time = setlocale(LC_TIME, NULL);
+     if (save_lc_time)
+         save_lc_time = pstrdup(save_lc_time);
+
  #ifdef WIN32
!     /* See the WIN32 comment near the top of PGLC_localeconv() */
!     /* save user's value of ctype locale */
      save_lc_ctype = setlocale(LC_CTYPE, NULL);
      if (save_lc_ctype)
          save_lc_ctype = pstrdup(save_lc_ctype);

+     /* use lc_time to set the ctype */
      setlocale(LC_CTYPE, locale_time);
  #endif

      setlocale(LC_TIME, locale_time);

      timenow = time(NULL);

Re: [GENERAL] trouble with to_char('L')

From
Hiroshi Inoue
Date:
Bruce Momjian wrote:
> Takahiro Itagaki wrote:
>> Takahiro Itagaki <itagaki.takahiro@oss.ntt.co.jp> wrote:
>>
>>> Revised patch attached. Please test it.
>> I applied this version of the patch.
>> Please check wheter the bug is fixed and any buildfarm failures.
> 
> Great.  I have merged in my C comments into the code with the attached
> patch so we remember why the code is setup as it is.
> 
> One thing I am confused about is that, for Win32, our numeric/monetary
> handling sets lc_ctype to match numeric/monetary, while our time code in
> the same file uses that method _and_ uses wcsftime() to return the value
> in wide characters.  So, why do we do both for time?  Is there any value
> to that?

Unfortunately wcsftime() is a halfway conveniece function which uses
ANSI version of functionalities internally.
AFAIC the only way to remove the dependency to LC_CTYPE is to call GeLocaleInfoW() directly.

regards,
Hiroshi Inoue




Re: [GENERAL] trouble with to_char('L')

From
Bruce Momjian
Date:
Hiroshi Inoue wrote:
> Bruce Momjian wrote:
> > Takahiro Itagaki wrote:
> >> Takahiro Itagaki <itagaki.takahiro@oss.ntt.co.jp> wrote:
> >>
> >>> Revised patch attached. Please test it.
> >> I applied this version of the patch.
> >> Please check wheter the bug is fixed and any buildfarm failures.
> >
> > Great.  I have merged in my C comments into the code with the attached
> > patch so we remember why the code is setup as it is.
> >
> > One thing I am confused about is that, for Win32, our numeric/monetary
> > handling sets lc_ctype to match numeric/monetary, while our time code in
> > the same file uses that method _and_ uses wcsftime() to return the value
> > in wide characters.  So, why do we do both for time?  Is there any value
> > to that?
>
> Unfortunately wcsftime() is a halfway conveniece function which uses
> ANSI version of functionalities internally.
> AFAIC the only way to remove the dependency to LC_CTYPE is to call
>   GeLocaleInfoW() directly.

Thanks.  I have documented this fact in a C comment;  patch attached.

--
  Bruce Momjian  <bruce@momjian.us>        http://momjian.us
  EnterpriseDB                             http://enterprisedb.com
Index: src/backend/utils/adt/pg_locale.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/utils/adt/pg_locale.c,v
retrieving revision 1.55
diff -c -c -r1.55 pg_locale.c
*** src/backend/utils/adt/pg_locale.c    24 Apr 2010 22:54:56 -0000    1.55
--- src/backend/utils/adt/pg_locale.c    26 Apr 2010 13:30:03 -0000
***************
*** 627,633 ****
          save_lc_time = pstrdup(save_lc_time);

  #ifdef WIN32
!     /* See the WIN32 comment near the top of PGLC_localeconv() */
      /* save user's value of ctype locale */
      save_lc_ctype = setlocale(LC_CTYPE, NULL);
      if (save_lc_ctype)
--- 627,641 ----
          save_lc_time = pstrdup(save_lc_time);

  #ifdef WIN32
!     /*
!      * On WIN32, there is no way to get locale-specific time values in a
!      * specified locale, like we do for monetary/numeric.  We can only get
!      * CP_ACP (see strftime_win32) or UTF16.  Therefore, we get UTF16 and
!      * convert it to the database locale.  However, wcsftime() internally
!      * uses LC_CTYPE, so we set it here.  See the WIN32 comment near the
!      * top of PGLC_localeconv().
!      */
!
      /* save user's value of ctype locale */
      save_lc_ctype = setlocale(LC_CTYPE, NULL);
      if (save_lc_ctype)