Re: More message encoding woes - Mailing list pgsql-hackers

From Heikki Linnakangas
Subject Re: More message encoding woes
Date
Msg-id 49D35095.5020900@enterprisedb.com
Whole thread Raw
In response to Re: More message encoding woes  (Tom Lane <tgl@sss.pgh.pa.us>)
Responses Re: More message encoding woes  (Hiroshi Inoue <inoue@tpf.co.jp>)
Re: More message encoding woes  (Hiroshi Inoue <inoue@tpf.co.jp>)
List pgsql-hackers
Tom Lane wrote:
> Heikki Linnakangas <heikki.linnakangas@enterprisedb.com> writes:
>> Tom Lane wrote:
>>> Maybe use a special string "Translate Me First" that
>>> doesn't actually need to be end-user-visible, just so no one sweats over
>>> getting it right in context.
>
>> Yep, something like that. There seems to be a magic empty string
>> translation at the beginning of every po file that returns the
>> meta-information about the translation, like translation author and
>> date. Assuming that works reliably, I'll use that.
>
> At first that sounded like an ideal answer, but I can see a gotcha:
> suppose the translation's author's name contains some characters that
> don't convert to the database encoding.  I suppose that would result in
> failure, when we'd prefer it not to.  A single-purpose string could be
> documented as "whatever you translate this to should be pure ASCII,
> never mind if it's sensible".

I just tried that, and it seems that gettext() does transliteration, so
any characters that have no counterpart in the database encoding will be
replaced with something similar, or question marks. Assuming that's
universal across platforms, and I think it is, using the empty string
should work.

It also means that you can use lc_messages='ja' with
server_encoding='latin1', but it will be unreadable because all the
non-ascii characters are replaced with question marks. For something
like lc_messages='es_ES' and server_encoding='koi8-r', it will still
look quite nice.

Attached is a patch I've been testing. Seems to work quite well. It
would be nice if someone could test it on Windows, which seems to be a
bit special in this regard.

--
   Heikki Linnakangas
   EnterpriseDB   http://www.enterprisedb.com
diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index 118a6fe..390a7cf 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -290,6 +290,7 @@ locale_messages_assign(const char *value, bool doit, GucSource source)
         if (!pg_perm_setlocale(LC_MESSAGES, value))
             if (source != PGC_S_DEFAULT)
                 return NULL;
+        pg_init_gettext_codeset();
     }
 #ifndef WIN32
     else
diff --git a/src/backend/utils/init/miscinit.c b/src/backend/utils/init/miscinit.c
index 03d86ca..47ebe1b 100644
--- a/src/backend/utils/init/miscinit.c
+++ b/src/backend/utils/init/miscinit.c
@@ -1242,7 +1242,7 @@ pg_bindtextdomain(const char *domain)

         get_locale_path(my_exec_path, locale_path);
         bindtextdomain(domain, locale_path);
-        pg_bind_textdomain_codeset(domain, GetDatabaseEncoding());
+        pg_register_textdomain(domain);
     }
 #endif
 }
diff --git a/src/backend/utils/mb/mbutils.c b/src/backend/utils/mb/mbutils.c
index bf66321..970cb83 100644
--- a/src/backend/utils/mb/mbutils.c
+++ b/src/backend/utils/mb/mbutils.c
@@ -842,46 +842,6 @@ cliplen(const char *str, int len, int limit)
     return l;
 }

-#if defined(ENABLE_NLS) && defined(WIN32)
-static const struct codeset_map {
-    int    encoding;
-    const char *codeset;
-} codeset_map_array[] = {
-    {PG_UTF8, "UTF-8"},
-    {PG_LATIN1, "LATIN1"},
-    {PG_LATIN2, "LATIN2"},
-    {PG_LATIN3, "LATIN3"},
-    {PG_LATIN4, "LATIN4"},
-    {PG_ISO_8859_5, "ISO-8859-5"},
-    {PG_ISO_8859_6, "ISO_8859-6"},
-    {PG_ISO_8859_7, "ISO-8859-7"},
-    {PG_ISO_8859_8, "ISO-8859-8"},
-    {PG_LATIN5, "LATIN5"},
-    {PG_LATIN6, "LATIN6"},
-    {PG_LATIN7, "LATIN7"},
-    {PG_LATIN8, "LATIN8"},
-    {PG_LATIN9, "LATIN-9"},
-    {PG_LATIN10, "LATIN10"},
-    {PG_KOI8R, "KOI8-R"},
-    {PG_WIN1250, "CP1250"},
-    {PG_WIN1251, "CP1251"},
-    {PG_WIN1252, "CP1252"},
-    {PG_WIN1253, "CP1253"},
-    {PG_WIN1254, "CP1254"},
-    {PG_WIN1255, "CP1255"},
-    {PG_WIN1256, "CP1256"},
-    {PG_WIN1257, "CP1257"},
-    {PG_WIN1258, "CP1258"},
-    {PG_WIN866, "CP866"},
-    {PG_WIN874, "CP874"},
-    {PG_EUC_CN, "EUC-CN"},
-    {PG_EUC_JP, "EUC-JP"},
-    {PG_EUC_KR, "EUC-KR"},
-    {PG_EUC_TW, "EUC-TW"},
-    {PG_EUC_JIS_2004, "EUC-JP"}
-};
-#endif /* WIN32 */
-
 void
 SetDatabaseEncoding(int encoding)
 {
@@ -892,28 +852,132 @@ SetDatabaseEncoding(int encoding)
     Assert(DatabaseEncoding->encoding == encoding);

 #ifdef ENABLE_NLS
-    pg_bind_textdomain_codeset(textdomain(NULL), encoding);
+    pg_init_gettext_codeset();
+    pg_register_textdomain(textdomain(NULL));
 #endif
 }

+static char **registered_textdomains = NULL;
+static const char *system_codeset = "invalid";
+
 /*
- * On Windows, we need to explicitly bind gettext to the correct
- * encoding, because gettext() tends to get confused.
+ * Register a gettext textdomain with the backend. We will call
+ * bind_textdomain_codeset() for it to ensure that translated strings
+ * are returned in the right encoding.
  */
 void
-pg_bind_textdomain_codeset(const char *domainname, int encoding)
+pg_register_textdomain(const char *domainname)
 {
-#if defined(ENABLE_NLS) && defined(WIN32)
+#if defined(ENABLE_NLS)
     int     i;
+    MemoryContext old_cxt;
+
+    old_cxt = MemoryContextSwitchTo(TopMemoryContext);
+    if (registered_textdomains == NULL)
+    {
+        registered_textdomains = palloc(sizeof(char *) * 1);
+        registered_textdomains[0] = NULL;
+    }

-    for (i = 0; i < lengthof(codeset_map_array); i++)
+    for (i = 0; registered_textdomains[i] != NULL; i++)
     {
-        if (codeset_map_array[i].encoding == encoding)
+        /* Ignore if already bound */
+        if (strcmp(registered_textdomains[i], domainname) == 0)
+            return;
+    }
+    registered_textdomains = repalloc(registered_textdomains,
+                                      (i + 2) * sizeof(char *));
+    registered_textdomains[i] = pstrdup(domainname);
+    registered_textdomains[i + 1] = NULL;
+
+    MemoryContextSwitchTo(old_cxt);
+
+    if (GetDatabaseEncoding() != PG_SQL_ASCII)
+    {
+        if (bind_textdomain_codeset(domainname,    system_codeset) == NULL)
+            elog(LOG, "bind_textdomain_codeset failed");
+    }
+#endif
+}
+
+/*
+ * Set the codeset used for strings returned by gettext() to match the
+ * database encoding.
+ *
+ * In theory this should only depend on the database encoding, but because
+ * of the way we use gettext() to find the corresponding OS codeset name, we
+ * also need LC_MESSAGES to be set correctly for this to work. Because of
+ * that, pg_init_gettext_codeset() should be called after any changes to
+ * LC_MESSAGES.
+ */
+void
+pg_init_gettext_codeset(void)
+{
+#if defined(ENABLE_NLS)
+    int        i;
+
+    /*
+     * SQL_ASCII encoding is special. In that case we do nothing, and let
+     * gettext() pick the codeset from LC_CTYPE.
+     */
+    if (GetDatabaseEncoding() == PG_SQL_ASCII)
+        return;
+
+    /*
+     * Find a codeset name for the database encoding that
+     * bind_textdomain_codeset() recognizes.
+     *
+     * Unfortunately there's no handy interface to list all the codesets
+     * in the system. 'locale -m' or 'iconv --list' do that, but we don't
+     * want to call external programs here. So we try every alias for the
+     * encoding that we know until we find one that works.
+     *
+     * Unfortunately bind_textdomain_codeset() doesn't return any error code
+     * when given an invalid codeset name, so we have to work a bit harder
+     * to check if a codeset name works. We call gettext("") after
+     * bind_textdomain_codeset(), and check that it returned a translated
+     * string other than "". Empty string is a special value in .po files
+     * that is present in all translations: it translates into a string with
+     * meta-information about the translation, like author and creation date.
+     */
+    system_codeset = NULL;
+    for (i = 0; encoding_match_list[i].system_enc_name; i++)
+    {
+        if (encoding_match_list[i].pg_enc_code != GetDatabaseEncoding())
+            continue;
+
+        if (bind_textdomain_codeset(textdomain(NULL),
+                        encoding_match_list[i].system_enc_name) != NULL)
         {
-            if (bind_textdomain_codeset(domainname,
-                                        codeset_map_array[i].codeset) == NULL)
+            const char *str = gettext("");
+            if (strcmp(str, "") != 0)
+            {
+                /* great, it worked */
+                system_codeset = encoding_match_list[i].system_enc_name;
+                break;
+            }
+        }
+    }
+
+    if (system_codeset == NULL)
+    {
+        elog(DEBUG1, "failed to find a system codeset name for encoding \"%s\"",
+             GetDatabaseEncodingName());
+        system_codeset = "invalid";
+    }
+
+    /*
+     * Bind all textdomains in use to the new codeset. This is done even if
+     * no valid codeset name was found, to force gettext() to revert to
+     * ascii English.
+     */
+    if (registered_textdomains != NULL)
+    {
+        for (i = 0; registered_textdomains[i] != NULL; i++)
+        {
+            if (bind_textdomain_codeset(registered_textdomains[i],
+                                        system_codeset) == NULL)
                 elog(LOG, "bind_textdomain_codeset failed");
-            break;
         }
     }
 #endif
diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h
index 76322c9..8fcfa52 100644
--- a/src/include/mb/pg_wchar.h
+++ b/src/include/mb/pg_wchar.h
@@ -392,7 +392,8 @@ extern const char *pg_get_client_encoding_name(void);
 extern void SetDatabaseEncoding(int encoding);
 extern int    GetDatabaseEncoding(void);
 extern const char *GetDatabaseEncodingName(void);
-extern void pg_bind_textdomain_codeset(const char *domainname, int encoding);
+extern void pg_register_textdomain(const char *domainname);
+extern void pg_init_gettext_codeset(void);

 extern int    pg_valid_client_encoding(const char *name);
 extern int    pg_valid_server_encoding(const char *name);
diff --git a/src/include/port.h b/src/include/port.h
index 0557dd2..cbd72bd 100644
--- a/src/include/port.h
+++ b/src/include/port.h
@@ -422,6 +422,14 @@ extern void qsort_arg(void *base, size_t nel, size_t elsize,
           qsort_arg_comparator cmp, void *arg);

 /* port/chklocale.c */
+
+struct encoding_match
+{
+    int pg_enc_code;
+    const char *system_enc_name;
+};
+extern const struct encoding_match encoding_match_list[];
+
 extern int    pg_get_encoding_from_locale(const char *ctype);

 #endif   /* PG_PORT_H */
diff --git a/src/port/chklocale.c b/src/port/chklocale.c
index 78410df..4469e89 100644
--- a/src/port/chklocale.c
+++ b/src/port/chklocale.c
@@ -35,15 +35,12 @@
  * numbers (CPnnn).
  *
  * Note that we search the table with pg_strcasecmp(), so variant
- * capitalizations don't need their own entries.
+ * capitalizations don't need their own entries. XXX: Now that we also
+ * use this to map from pg encoding code to system name, do we need to
+ * include different capitalizations?
  */
-struct encoding_match
-{
-    enum pg_enc pg_enc_code;
-    const char *system_enc_name;
-};

-static const struct encoding_match encoding_match_list[] = {
+const struct encoding_match encoding_match_list[] = {
     {PG_EUC_JP, "EUC-JP"},
     {PG_EUC_JP, "eucJP"},
     {PG_EUC_JP, "IBM-eucJP"},

pgsql-hackers by date:

Previous
From: Hitoshi Harada
Date:
Subject: Re: Sort a column that does not exist
Next
From: Alvaro Herrera
Date:
Subject: Re: WIP: transformation hook modules and JSON support