Re: Enforcing database encoding and locale match - Mailing list pgsql-hackers

From Zdenek Kotala
Subject Re: Enforcing database encoding and locale match
Date
Msg-id 46FD61AA.6010904@sun.com
Whole thread Raw
In response to Re: Enforcing database encoding and locale match  (Tom Lane <tgl@sss.pgh.pa.us>)
Responses Re: Enforcing database encoding and locale match
Re: Enforcing database encoding and locale match
Re: Enforcing database encoding and locale match
List pgsql-hackers
Tom Lane wrote:
> Andrew Dunstan <andrew@dunslane.net> writes:
>> Gregory Stark wrote:
>>> "Tom Lane" <tgl@sss.pgh.pa.us> writes:
>>>> Another possibility is to treat the case as a WARNING if you're
>>>> superuser and an ERROR if you're not.  This would satisfy people
>>>> who are uncomfortable with the idea that CREATEDB privilege comes
>>>> with a built-in denial-of-service attack, while still leaving a
>>>> loophole for anyone for whom the test didn't work properly.
>>> That sounds like a good combination
>> +1
>
> After further experimentation I want to change the proposal a bit.
> AFAICS, if we recognize the nl_langinfo(CODESET) result, there is
> no reason not to trust the answer, so we might as well throw an
> error always.

Agreed. The code seems to be OK, and on a POSIX-compatible OS it should work.
  I have attached testing code. With the following command

  for LOCALE in `locale -a`; do ./a.out $LOCALE ; done

it should be possible to verify the status on all Unix OSes.

On Solaris I got the following problematic locales:

C                       ... 646        - NO MATCH
POSIX                   ... 646        - NO MATCH
cs                      ... 646        - NO MATCH
da                      ... 646        - NO MATCH
et                      ... 646        - NO MATCH
it                      ... 646        - NO MATCH
ja_JP.PCK               ... PCK        - NO MATCH
ko                      ... 646        - NO MATCH
no                      ... 646        - NO MATCH
ru                      ... 646        - NO MATCH
sl                      ... 646        - NO MATCH
sv                      ... 646        - NO MATCH
tr                      ... 646        - NO MATCH
zh.GBK                  ... GBK        - NO MATCH
zh_CN.GB18030           ... GB18030    - NO MATCH
zh_CN.GB18030@pinyin    ... GB18030    - NO MATCH
zh_CN.GB18030@radical   ... GB18030    - NO MATCH
zh_CN.GB18030@stroke    ... GB18030    - NO MATCH
zh_CN.GBK               ... GBK        - NO MATCH
zh_CN.GBK@pinyin        ... GBK        - NO MATCH
zh_CN.GBK@radical       ... GBK        - NO MATCH
zh_CN.GBK@stroke        ... GBK        - NO MATCH


> The case that is problematic is where we can get a
> CODESET string but we don't recognize it.  In this case it seems
> appropriate to do
>
>     ereport(WARNING,
>             (errmsg("could not determine encoding for locale \"%s\": codeset is \"%s\"",
>                     ctype, sys),
>              errdetail("Please report this to <pgsql-bugs@postgresql.org>.")));
>
> and then let the user do what he wants.

Another question is what to do when we know that this codeset/encoding
is not supported by Postgres. Maybe extend the encoding match structure to

/* Proposed extension of the match entry: adds a "supported" flag. */
struct encoding_match
{
    enum pg_enc pg_enc_code;        /* PostgreSQL encoding identifier */
    const char *system_enc_name;    /* codeset name as reported by the system */
    bool supported;                 /* proposal: false => codeset is known but
                                     * not supported, so raise an error */
};

and generate an error when the codeset is known to be unsupported. When the
codeset simply does not match any entry, generate only a warning.


        Zdenek
#include <locale.h>
#include <langinfo.h>
#include "postgres_fe.h"
//#include "miscadmin.h"
#include "mb/pg_wchar.h"

/*
 * Checks whether the encoding selected for PostgreSQL and the
 * encoding used by the system locale match.
 */

/*
 * One entry of the codeset-name lookup table: associates a codeset name
 * with a PostgreSQL encoding identifier.
 *
 * Field order matters: the table below uses positional initializers.
 */
struct encoding_match
{
    enum pg_enc pg_enc_code;        /* PostgreSQL encoding identifier */
    const char *system_enc_name;    /* codeset name, compared without regard
                                     * to case; NULL marks the end of the
                                     * table */
};

/*
 * Lookup table from system codeset names to PostgreSQL encodings.
 *
 * Several spellings are listed per encoding, one entry per spelling.
 * The table is scanned linearly with a case-insensitive comparison and
 * is terminated by an entry whose system_enc_name is NULL.
 */
static const struct encoding_match encoding_match_list[] = {
    {PG_EUC_JP, "EUC-JP"},
    {PG_EUC_JP, "eucJP"},
    {PG_EUC_JP, "IBM-eucJP"},
    {PG_EUC_JP, "sdeckanji"},

    {PG_EUC_CN, "EUC-CN"},
    {PG_EUC_CN, "eucCN"},
    {PG_EUC_CN, "IBM-eucCN"},
    {PG_EUC_CN, "GB2312"},
    {PG_EUC_CN, "dechanzi"},

    {PG_EUC_KR, "EUC-KR"},
    {PG_EUC_KR, "eucKR"},
    {PG_EUC_KR, "IBM-eucKR"},
    {PG_EUC_KR, "deckorean"},
    {PG_EUC_KR, "5601"},

    {PG_EUC_TW, "EUC-TW"},
    {PG_EUC_TW, "eucTW"},
    {PG_EUC_TW, "IBM-eucTW"},
    {PG_EUC_TW, "cns11643"},

    /* Compiled in only when NOT_VERIFIED is defined; "???" is presumably a
     * placeholder for a codeset name that has not been determined yet. */
#ifdef NOT_VERIFIED
    {PG_JOHAB, "???"},
#endif

    {PG_UTF8, "UTF-8"},
    {PG_UTF8, "utf8"},

    {PG_LATIN1, "ISO-8859-1"},
    {PG_LATIN1, "ISO8859-1"},
    {PG_LATIN1, "iso88591"},

    {PG_LATIN2, "ISO-8859-2"},
    {PG_LATIN2, "ISO8859-2"},
    {PG_LATIN2, "iso88592"},

    {PG_LATIN3, "ISO-8859-3"},
    {PG_LATIN3, "ISO8859-3"},
    {PG_LATIN3, "iso88593"},

    {PG_LATIN4, "ISO-8859-4"},
    {PG_LATIN4, "ISO8859-4"},
    {PG_LATIN4, "iso88594"},

    /* Note the numbering: PG_LATIN5 and up do not map to the ISO-8859 part
     * with the same number. */
    {PG_LATIN5, "ISO-8859-9"},
    {PG_LATIN5, "ISO8859-9"},
    {PG_LATIN5, "iso88599"},

    {PG_LATIN6, "ISO-8859-10"},
    {PG_LATIN6, "ISO8859-10"},
    {PG_LATIN6, "iso885910"},

    {PG_LATIN7, "ISO-8859-13"},
    {PG_LATIN7, "ISO8859-13"},
    {PG_LATIN7, "iso885913"},

    {PG_LATIN8, "ISO-8859-14"},
    {PG_LATIN8, "ISO8859-14"},
    {PG_LATIN8, "iso885914"},

    {PG_LATIN9, "ISO-8859-15"},
    {PG_LATIN9, "ISO8859-15"},
    {PG_LATIN9, "iso885915"},

    {PG_LATIN10, "ISO-8859-16"},
    {PG_LATIN10, "ISO8859-16"},
    {PG_LATIN10, "iso885916"},

    {PG_WIN1252, "CP1252"},
    {PG_WIN1253, "CP1253"},
    {PG_WIN1254, "CP1254"},
    {PG_WIN1255, "CP1255"},
    {PG_WIN1256, "CP1256"},
    {PG_WIN1257, "CP1257"},
    {PG_WIN1258, "CP1258"},
    /* As above: only present when NOT_VERIFIED is defined. */
#ifdef NOT_VERIFIED
    {PG_WIN874, "???"},
#endif
    {PG_KOI8R, "KOI8-R"},
    {PG_WIN1251, "CP1251"},
    {PG_WIN866, "CP866"},

    {PG_ISO_8859_5, "ISO-8859-5"},
    {PG_ISO_8859_5, "ISO8859-5"},
    {PG_ISO_8859_5, "iso88595"},

    {PG_ISO_8859_6, "ISO-8859-6"},
    {PG_ISO_8859_6, "ISO8859-6"},
    {PG_ISO_8859_6, "iso88596"},

    {PG_ISO_8859_7, "ISO-8859-7"},
    {PG_ISO_8859_7, "ISO8859-7"},
    {PG_ISO_8859_7, "iso88597"},

    {PG_ISO_8859_8, "ISO-8859-8"},
    {PG_ISO_8859_8, "ISO8859-8"},
    {PG_ISO_8859_8, "iso88598"},

    /* Terminator: only the NULL name is tested by the scan loop. */
    {PG_SQL_ASCII, NULL}        /* end marker */
};

/*
 * Return the codeset name that nl_langinfo(CODESET) reports for the
 * locale named by "ctype".
 *
 * LC_CTYPE is switched to the requested locale only for the duration of
 * the query and is restored before returning.
 *
 * The result is a strdup'ed string that the caller must free().  NULL is
 * returned when the current locale cannot be queried, when "ctype" cannot
 * be activated (e.g. unknown locale name), or when memory runs out.
 */
static char *
get_encoding_from_locale(const char *ctype)
{
    char       *save;
    char       *sys;

    save = setlocale(LC_CTYPE, NULL);
    if (!save)
        return NULL;

    /* Copy it: later setlocale() calls may overwrite the returned string. */
    save = strdup(save);
    if (!save)
        return NULL;

    /*
     * If the requested locale cannot be installed, LC_CTYPE is left
     * unchanged and nl_langinfo() would describe the *old* locale, so
     * report failure instead of returning a misleading codeset.
     */
    if (!setlocale(LC_CTYPE, ctype))
    {
        free(save);
        return NULL;
    }

    /* nl_langinfo() returns a pointer to static storage; copy it as well. */
    sys = nl_langinfo(CODESET);
    if (sys)
        sys = strdup(sys);

    setlocale(LC_CTYPE, save);
    free(save);

    return sys;
}


/*
 * Look up the PostgreSQL encoding that corresponds to a system codeset.
 *
 * ctype: locale name; consulted only when "sys" is NULL.
 * sys:   codeset name to look up.  If NULL, the codeset is obtained from
 *        get_encoding_from_locale(ctype) and released again before
 *        returning.
 *
 * Returns the pg_enc_code of the first table entry whose name equals the
 * codeset (compared case-insensitively), or -1 if no entry matches or the
 * codeset could not be determined.
 */
static int
find_matching_encoding(const char *ctype, const char *sys)
{
    char       *looked_up = NULL;
    int         result = -1;
    int         i;

    /*
     * Use the caller's codeset when given.  Looking it up again
     * unconditionally would ignore the argument and leak the new string.
     */
    if (sys == NULL)
    {
        looked_up = get_encoding_from_locale(ctype);
        if (looked_up == NULL)
            return -1;
        sys = looked_up;
    }

    for (i = 0; encoding_match_list[i].system_enc_name; i++)
    {
        if (strcasecmp(sys, encoding_match_list[i].system_enc_name) == 0)
        {
            result = encoding_match_list[i].pg_enc_code;
            break;
        }
    }

    free(looked_up);            /* no-op when the caller supplied sys */
    return result;
}


/*
 * Test driver.  Takes exactly one argument, a locale name, and prints one
 * line of the form
 *
 *     <locale> ... <codeset> - OK | NO MATCH
 *
 * depending on whether the locale's codeset is found in the match table.
 *
 * Exit status: 0 after a normal report, 1 on a usage error or when the
 * codeset of the locale could not be determined.
 */
int main(int argc, char **argv)
{
    int         enc;
    char       *sys;

    if (argc != 2)
    {
        fprintf(stderr, "Invalid number of arguments.\n");
        return 1;
    }

    printf("%-23s ... ", argv[1]);

    sys = get_encoding_from_locale(argv[1]);
    if (sys == NULL)
    {
        /* Passing NULL for %s is undefined; finish the line explicitly. */
        printf("%-10s - NO MATCH\n", "(unknown)");
        return 1;
    }
    printf("%-10s - ", sys);

    enc = find_matching_encoding(argv[1], sys);
    if (enc != -1)
        printf("OK\n");
    else
        printf("NO MATCH\n");

    free(sys);
    return 0;
}


pgsql-hackers by date:

Previous
From: Tom Lane
Date:
Subject: Re: Enforcing database encoding and locale match
Next
From: Tom Lane
Date:
Subject: Re: Enforcing database encoding and locale match