Chinese GB18030 support is implemented! - Mailing list pgsql-patches
From | Bill Huang |
---|---|
Subject | Chinese GB18030 support is implemented! |
Date | |
Msg-id | 3CFF0AE1.10205@ybb.ne.jp Whole thread Raw |
Responses |
Re: Chinese GB18030 support is implemented!
Re: Chinese GB18030 support is implemented! |
List | pgsql-patches |
Hello, As PostgreSQL is widely used around the world, many Chinese users are looking forward to using such a high-performance database management system. However, since the new Chinese codepage standard GB18030 is not completely supported, PostgreSQL's use in China is limited. Now I have managed to implement GB18030 support on top of the latest version, so the following features are available once the patches are applied. -Chinese GB18030 encoding is available on the front-end side, while on the backend side, EUC_CN or MIC is used. -Encoding conversion between MIC and GB18030 is implemented. -GB18030 locale support is available on the front-end side. -A GB18030 locale test is added. Any help with testing these patches and suggestions for GB18030 support are greatly appreciated. Best Regards, Bill -- /---------------------------/ Bill Huang E-mail:bill_huanghb@ybb.ne.jp Cell phone:090-9979-4631 /---------------------------/ --- postgresql-7.2.1/src/backend/utils/mb/conv.c.org Thu Jun 6 11:52:24 2002 +++ postgresql-7.2.1/src/backend/utils/mb/conv.c Thu Jun 6 12:20:36 2002 @@ -502,6 +502,96 @@ } /* + * GB18030 ---> MIC + * Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp> + */ +static void +gb180302mic(unsigned char *gb18030, unsigned char *p, int len) +{ + int c1; + int c2; + + while (len > 0 && (c1 = *gb18030++)) + { + if (c1 < 0x80) + { /* should be ASCII */ + len--; + *p++ = c1; + } + else if(c1 >= 0x81 && c1 <= 0xfe) + { + c2 = *gb18030++; + + if(c2 >= 0x30 && c2 <= 0x69){ + len -= 4; + *p++ = c1; + *p++ = c2; + *p++ = *gb18030++; + *p++ = *gb18030++; + *p++ = *gb18030++; + } + else if ((c2 >=0x40 && c2 <= 0x7e) ||(c2 >=0x80 && c2 <= 0xfe)){ + len -= 2; + *p++ = c1; + *p++ = c2; + *p++ = *gb18030++; + } + else{ /*throw the strange code*/ + len--; + } + } + } + *p = '\0'; +} + +/* + * MIC ---> GB18030 + * Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp> + */ +static void +mic2gb18030(unsigned char *mic, unsigned char *p, int len) +{ + int c1; + int c2; + 
+ while (len > 0 && (c1 = *mic)) + { + len -= pg_mic_mblen(mic++); + + if (c1 <= 0x7f) /*ASCII*/ + { + *p++ = c1; + } + else if (c1 >= 0x81 && c1 <= 0xfe) + { + c2 = *mic++; + + if((c2 >= 0x40 && c2 <= 0x7e) || (c2 >= 0x80 && c2 <= 0xfe)){ + *p++ = c1; + *p++ = c2; + } + else if(c2 >= 0x30 && c2 <= 0x39){ + *p++ = c1; + *p++ = c2; + *p++ = *mic++; + *p++ = *mic++; + } + else{ + mic--; + printBogusChar(&mic, &p); + mic--; + printBogusChar(&mic, &p); + } + } + else{ + mic--; + printBogusChar(&mic, &p); + } + } + *p = '\0'; +} + +/* * EUC_TW ---> MIC */ static void @@ -1583,6 +1673,26 @@ } /* + * UTF-8 ---> GB18030 + */ +static void +utf_to_gb18030(unsigned char *utf, unsigned char *euc, int len) + +{ + utf_to_local(utf, euc, ULmapEUC_CN, + sizeof(ULmapEUC_CN) / sizeof(pg_utf_to_local), len); +} + +/* + * GB18030 ---> UTF-8 + */ +static void +gb18030_to_utf(unsigned char *euc, unsigned char *utf, int len) +{ + local_to_utf(euc, utf, LUmapEUC_CN, + sizeof(LUmapEUC_CN) / sizeof(pg_local_to_utf), PG_EUC_CN, len); +} +/* * UTF-8 ---> EUC_KR */ static void @@ -1754,6 +1864,9 @@ PG_BIG5, big52mic, mic2big5, big5_to_utf, utf_to_big5 }, { + PG_GB18030, gb180302mic, mic2gb18030, gb18030_to_utf, utf_to_gb18030 + }, + { PG_WIN1250, win12502mic, mic2win1250, 0, 0 }, }; @@ -1841,6 +1954,9 @@ PG_BIG5, big52mic, mic2big5, 0, 0 }, { + PG_GB18030, gb180302mic, mic2gb18030, 0, 0 + }, + { PG_WIN1250, win12502mic, mic2win1250, 0, 0 }, }; --- postgresql-7.2.1/src/backend/utils/mb/encnames.c.org Mon Jun 3 19:24:10 2002 +++ postgresql-7.2.1/src/backend/utils/mb/encnames.c Mon Jun 3 19:25:26 2002 @@ -173,6 +173,9 @@ { "windows1251", PG_WIN1251 }, /* Windows-1251; Microsoft */ + { + "gb18030", PG_GB18030 + }, /* GB18030; GB18030 */ { NULL, 0 @@ -268,6 +271,9 @@ "BIG5", PG_BIG5 }, { + "GB18030", PG_GB18030 + }, + { "WIN1250", PG_WIN1250 } }; --- postgresql-7.2.1/src/interfaces/odbc/multibyte.c.org Wed Jun 5 18:28:30 2002 +++ postgresql-7.2.1/src/interfaces/odbc/multibyte.c Wed Jun 5 19:48:01 
2002 @@ -48,6 +48,28 @@ mb_st = 0; } break; + /* Chinese GB18030 support + * By Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp> + * */ + case GB18030: + { + if (mb_st < 2 && s[i] > 0x81) + mb_st = 2; + else if (mb_st == 2) + if(s[i] >= 0x30 && s[i] <= 0x39) + mb_st = 3; + else + mb_st = 1; + else if (mb_st == 3) + if(s[i] >= 0x30 && s[i] <= 0x39) + mb_st = 1; + else + mb_st = 3; + else + mb_st = 0; + } + break; + default: mb_st = 0; } @@ -87,6 +109,16 @@ { multibyte_client_encoding = BIG5; return ("BIG5"); + }/* Chinese GB18030 support. + * Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp> + */ + if (strstr(str, "%27GB18030%27") || + strstr(str, "%27gb18030%27") || + strstr(str, "'GB18030'") || + strstr(str, "'gb18030'") ) + { + multibyte_client_encoding = GB18030; + return ("GB18030"); } return ("OTHER"); } @@ -127,6 +159,25 @@ else multibyte_status = 0; } + break; + /*Chinese GB18030 support.Added by Bill Huang <bhuang@redhat.com> <bill_huanghb@ybb.ne.jp>*/ + case GB18030: + { + if (multibyte_status < 2 && s > 0x80) + multibyte_status = 2; + else if (multibyte_status = 2) + if (s >= 0x30 && s <= 0x39) + multibyte_status = 3; + else + multibyte_status = 1; + else if (multibyte_status = 3) + if (s >= 0x30 && s <= 0x39) + multibyte_status = 1; + else + multibyte_status = 3; + else + multibyte_status = 0; + } break; default: multibyte_status = 0; --- postgresql-7.2.1/src/interfaces/odbc/multibyte.h.org Wed Jun 5 19:51:20 2002 +++ postgresql-7.2.1/src/interfaces/odbc/multibyte.h Wed Jun 5 19:51:35 2002 @@ -28,6 +28,7 @@ #define SJIS 32 /* Shift JIS */ #define BIG5 33 /* Big5 */ #define WIN1250 34 /* windows-1250 */ +#define GB18030 35 /* GB18030 */ extern int multibyte_client_encoding; /* Multibyte client encoding. */ extern int multibyte_status; /* Multibyte charcter status. 
*/ --- postgresql-7.2.1/src/include/mb/pg_wchar.h.org Mon May 27 20:07:58 2002 +++ postgresql-7.2.1/src/include/mb/pg_wchar.h Mon May 27 20:08:59 2002 @@ -182,6 +182,7 @@ /* followings are for client encoding only */ PG_SJIS, /* Shift JIS */ PG_BIG5, /* Big5 */ + PG_GB18030, /* GB18030 */ PG_WIN1250, /* windows-1250 */ _PG_LAST_ENCODING_ /* mark only */ --- postgresql-7.2.1/src/backend/utils/mb/wchar.c.org Mon May 27 20:02:44 2002 +++ postgresql-7.2.1/src/backend/utils/mb/wchar.c Mon May 27 20:03:12 2002 @@ -457,6 +457,33 @@ return (len); } +/* + * GB18030 + * Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp> + */ +static int +pg_gb18030_mblen(const unsigned char *s) +{ + int len; + + if (*s <= 0x7f) + { /* kanji? */ + len = 1; + } + else + { /* should be ASCII */ + + if((*(s+1) >0x40 && *(s+1) <= 0x7e) + || (*(s+1) >= 0x80 && *(s+1) <= 0xfe)) + len = 2; + else if(*(s+1) >0x30 && *(s+1) <= 0x39) + len = 4; + else + len = 2; + } + return (len); +} + pg_wchar_tbl pg_wchar_table[] = { {pg_ascii2wchar_with_len, pg_ascii_mblen, 1}, /* 0; PG_SQL_ASCII */ {pg_eucjp2wchar_with_len, pg_eucjp_mblen, 3}, /* 1; PG_EUC_JP */ @@ -483,6 +510,7 @@ {pg_latin12wchar_with_len, pg_latin1_mblen, 1}, /* 22; ISO-8859-15 */ {pg_latin12wchar_with_len, pg_latin1_mblen, 1}, /* 23; ISO-8859-16 */ {0, pg_sjis_mblen, 2}, /* 24; PG_SJIS */ + {0, pg_gb18030_mblen, 2}, /* 25; PG_GB18030 */ {0, pg_big5_mblen, 2}, /* 25; PG_BIG5 */ {pg_latin12wchar_with_len, pg_latin1_mblen, 1} /* 26; PG_WIN1250 */ };
pgsql-patches by date: