On Wed, May 28, 2003 at 11:56:07PM +0200, Peter Eisentraut wrote:
> There is a standard interface (SUSv2) for detecting the character set
> based on the locale settings. I suggest we use this (if available) in
> applications like psql and pg_dump by default unless it is overridden by
> the usual mechanisms. If the character set name obtained this way is not
> recognized by PostgreSQL, we fall back to SQL_ASCII.
>
> Here's a piece of code that shows how this would work:
>
> #include <stdio.h>
> #include <locale.h>
> #include <langinfo.h>
>
> int
> main(int argc, char *argv[])
> {
> setlocale(LC_ALL, "");
> printf("%s\n", nl_langinfo(CODESET));
> return 0;
> }
>
> (LC_CTYPE is the governing category for this.)
>
> Comments?
That isn't enough on every OS. Please look at glib or libcharset, which both deal with this problem.
http://www.haible.de/bruno/packages-libcharset.html
In my project I use the following code, which is a simplification of libcharset (the main function is mp_locale_charset()).
Maybe it will help you :-)
/*
 * Determine a canonical name for the current locale's character encoding.
 *
 * mp_locale_charset() is inspired by libcharset:
 *
 *   Copyright (C) 2000-2002 Free Software Foundation, Inc.
 *   Written by Bruno Haible <bruno@clisp.org>.
 *
 * $Id: charset.c,v 1.2 2003/01/24 14:02:01 zakkr Exp $
 */
#include "mape.h"
#if HAVE_STDDEF_H
# include <stddef.h>
#endif
#include <stdio.h>
#if HAVE_STRING_H
# include <string.h>
#else
# include <strings.h>
#endif
#if HAVE_STDLIB_H
# include <stdlib.h>
#endif
#if defined _WIN32 || defined __WIN32__
# undef WIN32 /* avoid warning on mingw32 */
# define WIN32
#endif
#if defined __EMX__
/* Assume EMX program runs on OS/2, even if compiled under DOS. */
# define OS2
#endif
#if !defined WIN32
# if HAVE_LANGINFO_CODESET
# include <langinfo.h>
# else
# if HAVE_SETLOCALE
# include <locale.h>
# endif
# endif
#elif defined WIN32
# define WIN32_LEAN_AND_MEAN
# include <windows.h>
#endif
#if defined OS2
# define INCL_DOS
# include <os2.h>
#endif
/*
 * One row of a charset alias table.
 *
 * "alias" is the spelling that is looked up and "name" is the spelling
 * handed back in its place.  Tables built from this type are terminated
 * by a row whose "alias" is NULL.
 */
typedef struct MpCharsetAlias
{
	char	   *alias;		/* name to be matched (case-insensitively) */
	char	   *name;		/* replacement returned on a match */
} MpCharsetAlias;
extern mpbool mp_locale_charset (char **charset);
/*
 * mp_charset_aliases
 *
 * Map a platform-specific ("exotic") charset name to the name that should
 * be used instead.
 *
 * libcharset loads all of its aliases from an external text file, which is
 * a strange and slow solution; we rather use arrays compiled into the
 * source.  With a "good" libc (for example on Linux) no table is needed at
 * all, so none is compiled in there.
 *
 * Please put exotic aliases only into these tables.  The libc iconv knows
 * a lot of basic aliases already (check it first with "iconv -l").
 *
 * The platform is selected with compiler-predefined macros.  A preprocessor
 * test such as "#if PORTNAME == aix" cannot work: identifiers that are not
 * macros evaluate to 0 inside #if, so the test does not depend on the
 * platform at all.
 *
 * name:    charset name to look up; may be NULL.
 * returns: the replacement name if "name" matches an alias
 *          (case-insensitively), otherwise "name" itself, unchanged.
 */
static const char *
mp_charset_aliases (const char *name)
{
#if defined WIN32
	static const MpCharsetAlias aliases[] = {
		{ "CP936", "GBK" },
		{ "CP1361", "JOHAB" },
		{ "CP20127", "ASCII" },
		{ "CP20866", "KOI8-R" },
		{ "CP21866", "KOI8-RU" },
		{ "CP28591", "ISO-8859-1" },
		{ "CP28592", "ISO-8859-2" },
		{ "CP28593", "ISO-8859-3" },
		{ "CP28594", "ISO-8859-4" },
		{ "CP28595", "ISO-8859-5" },
		{ "CP28596", "ISO-8859-6" },
		{ "CP28597", "ISO-8859-7" },
		{ "CP28598", "ISO-8859-8" },
		{ "CP28599", "ISO-8859-9" },
		{ "CP28605", "ISO-8859-15" },
		{ NULL, NULL }
	};
#elif defined _AIX
	static const MpCharsetAlias aliases[] = {
		{ "IBM-850", "CP850" },
		{ "IBM-856", "CP856" },
		{ "IBM-921", "ISO-8859-13" },
		{ "IBM-922", "CP922" },
		{ "IBM-932", "CP932" },
		{ "IBM-943", "CP943" },
		{ "IBM-1046", "CP1046" },
		{ "IBM-1124", "CP1124" },
		{ "IBM-1129", "CP1129" },
		{ "IBM-1252", "CP1252" },
		{ "IBM-EUCCN", "GB2312" },
		{ "IBM-EUCJP", "EUC-JP" },
		{ "IBM-EUCKR", "EUC-KR" },
		{ "IBM-EUCTW", "EUC-TW" },
		{ NULL, NULL }
	};
#elif defined __hpux
	static const MpCharsetAlias aliases[] = {
		{ "ROMAN8", "HP-ROMAN8" },
		{ "ARABIC8", "HP-ARABIC8" },
		{ "GREEK8", "HP-GREEK8" },
		{ "HEBREW8", "HP-HEBREW8" },
		{ "TURKISH8", "HP-TURKISH8" },
		{ "KANA8", "HP-KANA8" },
		{ "HP15CN", "GB2312" },
		{ NULL, NULL }
	};
#elif defined __sgi
	static const MpCharsetAlias aliases[] = {
		{ "EUCCN", "GB2312" },
		{ NULL, NULL }
	};
#elif defined __osf__
	static const MpCharsetAlias aliases[] = {
		{ "KSC5601", "CP949" },
		{ "SDECKANJI", "EUC-JP" },
		{ "TACTIS", "TIS-620" },
		{ NULL, NULL }
	};
#elif defined __sun && defined __SVR4
	static const MpCharsetAlias aliases[] = {
		{ "646", "ASCII" },
		{ "CNS11643", "EUC-TW" },
		{ "5601", "EUC-KR" },
		{ "JOHAP92", "JOHAB" },
		{ "PCK", "SHIFT_JIS" },
		{ "2533", "TIS-620" },
		{ NULL, NULL }
	};
#elif defined __NetBSD__
	static const MpCharsetAlias aliases[] = {
		{ "646", "ASCII" },
		{ "EUCCN", "GB2312" },
		{ NULL, NULL }
	};
#else
	/* No exotic aliases are known for this platform. */
# define MP_NO_CHARSET_ALIASES 1
#endif

#ifndef MP_NO_CHARSET_ALIASES
	const MpCharsetAlias *a;

	/* strcasecmp() must not be handed a NULL pointer. */
	if (name == NULL)
		return NULL;

	for (a = aliases; a->alias != NULL; a++)
	{
		if (strcasecmp (a->alias, name) == 0)
			return a->name;
	}
#endif

	/*
	 * We return the original name because iconv() probably will know
	 * something better about the name if we don't know it :-)
	 */
	return name;
}
/*
 * mp_encoding_from_locale
 *
 * Returns the charset part of a "language_COUNTRY.charset@modifier" string.
 *
 * locale:  locale name, typically taken from the environment; may be NULL.
 * returns:
 *   - NULL if "locale" is NULL (the caller found no locale setting);
 *   - "locale" itself if it contains no '.', or if it has an "@modifier"
 *     trailer and the charset part does not fit into the internal buffer;
 *   - a pointer into "locale", just past the '.', if there is no
 *     "@modifier" trailer;
 *   - otherwise a pointer to a static buffer holding the charset with the
 *     trailer stripped.  That buffer is overwritten by the next such call.
 *
 * The function is needed whenever nl_langinfo(CODESET) is unavailable, and
 * on OS/2, where the locale environment variables are always parsed.
 */
#if !defined HAVE_LANGINFO_CODESET || defined OS2
static char *
mp_encoding_from_locale (const char *locale)
{
	static char buf[2 + 10 + 1];
	const char *dot;
	const char *modifier;
	size_t		len;

	/* Nothing to parse: avoid handing NULL to strchr(). */
	if (locale == NULL)
		return NULL;

	dot = strchr (locale, '.');
	if (dot == NULL)
		return (char *) locale;

	dot++;

	/* Look for the possible @... trailer and remove it, if any. */
	modifier = strchr (dot, '@');
	if (modifier == NULL)
		return (char *) dot;

	len = (size_t) (modifier - dot);
	if (len < sizeof (buf))
	{
		memcpy (buf, dot, len);
		buf[len] = '\0';
		return buf;
	}

	/* The charset part is too long for buf: hand back the whole string. */
	return (char *) locale;
}
#endif
mpbool
mp_locale_charset (char **charset)
{const char *codeset;
#if !(defined WIN32 || defined OS2)
# if HAVE_LANGINFO_CODESET/* Most systems support nl_langinfo (CODESET) nowadays. */codeset = nl_langinfo (CODESET);
# else/* On old systems which lack it, use setlocale or getenv. */const char *locale = NULL;
/* But most old systems don't have a complete set of locales. Some * (like SunOS 4 or DJGPP) have only the C locale.
Thereforewe don't * use setlocale here; it would return "C" when it doesn't support the * locale name the user has set.
*/
# if HAVE_SETLOCALE && 0locale = setlocale (LC_CTYPE, NULL);
# endifif (locale == NULL || locale[0] == '\0'){ locale = getenv ("LC_ALL"); if (locale == NULL || locale[0] ==
'\0') { locale = getenv ("LC_CTYPE"); if (locale == NULL || locale[0] == '\0') locale =
getenv("LANG"); }}
/* On some old systems, one used to set locale = "iso8859_1". On others, * you set it to "language_COUNTRY.charset". In
anycase, we resolve it * through the charset.alias file. */codeset = mp_encoding_from_locale(locale);
# endif /* HAVE_LANGINFO_CODESET */
#elif defined WIN32
static char buf[2 + 10 + 1];
/* Woe32 has a function returning the locale's codepage as a number. */sprintf (buf, "CP%u", GetACP ());codeset =
buf;
#elif defined OS2
const char *locale;static char buf[2 + 10 + 1];ULONG cp[3];ULONG cplen;
/* Allow user to override the codeset, as set in the operating system, * with standard language environment variables.
*/locale = getenv ("LC_ALL");if (locale == NULL || locale[0] == '\0'){ locale = getenv ("LC_CTYPE"); if (locale
==NULL || locale[0] == '\0') locale = getenv ("LANG");}if (locale != NULL && locale[0] != '\0') codeset =
mp_encoding_from_locale(locale);else{ /* OS/2 has a function returning the locale's codepage as a number. */ if
(DosQueryCp(sizeof (cp), cp, &cplen)) codeset = ""; else { sprintf (buf, "CP%u", cp[0]);
codeset= buf; } }
#endifif (codeset == NULL) /* The canonical name cannot be determined. */ codeset = "";else codeset =
mp_charset_aliases(codeset);/* Don't return an empty string. GNU libc and GNU libiconv interpret * the empty string as
denoting"the locale's character encoding", * thus GNU libiconv would call this function a second time. */if
(codeset[0]== '\0'){ /* * Last possibility is 'CHARSET' enviroment variable */ if (!(codeset = getenv
("CHARSET"))) codeset = "ASCII";}if (charset) *charset = (char *) codeset;
if (strcasecmp(codeset, "UTF8")==0 || strcasecmp(codeset, "UTF-8")==0) return TRUE;return FALSE;
}
autoconf part:
--------------
dnl jm_LANGINFO_CODESET
dnl
dnl Check for <langinfo.h> and nl_langinfo(), then test whether a program
dnl calling nl_langinfo(CODESET) links.  The result is cached in
dnl jm_cv_langinfo_codeset; on success HAVE_LANGINFO_CODESET is defined.
AC_DEFUN([jm_LANGINFO_CODESET],
[
  AC_CHECK_HEADERS(langinfo.h)
  AC_CHECK_FUNCS(nl_langinfo)

  AC_CACHE_CHECK([for nl_langinfo and CODESET], jm_cv_langinfo_codeset,
    [AC_TRY_LINK([#include <langinfo.h>],
      [char *cs = nl_langinfo(CODESET);],
      jm_cv_langinfo_codeset=yes,
      jm_cv_langinfo_codeset=no)
    ])
  if test $jm_cv_langinfo_codeset = yes; then
    AC_DEFINE(HAVE_LANGINFO_CODESET, 1,
      [Define if you have <langinfo.h> and nl_langinfo(CODESET).])
  fi
])
-- 
 Karel Zak <zakkr@zf.jcu.cz>
 http://home.zf.jcu.cz/~zakkr/