Chinese GB18030 support is implemented! - Mailing list pgsql-patches

From Bill Huang
Subject Chinese GB18030 support is implemented!
Date
Msg-id 3CFF0AE1.10205@ybb.ne.jp
Whole thread Raw
Responses Re: Chinese GB18030 support is implemented!  (Tatsuo Ishii <t-ishii@sra.co.jp>)
Re: Chinese GB18030 support is implemented!  (Tatsuo Ishii <t-ishii@sra.co.jp>)
List pgsql-patches
Hello,

As PostgreSQL is widely used around the world, many Chinese users are looking
forward to using such a high-performance database management
system. However, since the new Chinese code page standard GB18030 is not
completely supported, PostgreSQL's use in China is limited.

Now I have managed to implement GB18030 support on top of the latest
version, so the following functions are added once the patches are applied.

-Chinese GB18030 encoding is available on the front-end side, while on
the backend side, EUC_CN or MIC is used.
-Encoding conversion between MIC and GB18030 is implemented.
-GB18030 locale support is available on the front-end side.
-GB18030 locale test is added.

Any help with testing these patches and any suggestions for GB18030
support are greatly appreciated.

Best Regards,
Bill

--
/---------------------------/
Bill Huang
E-mail:bill_huanghb@ybb.ne.jp
Cell phone:090-9979-4631
/---------------------------/

--- postgresql-7.2.1/src/backend/utils/mb/conv.c.org    Thu Jun  6 11:52:24 2002
+++ postgresql-7.2.1/src/backend/utils/mb/conv.c    Thu Jun  6 12:20:36 2002
@@ -502,6 +502,96 @@
 }

 /*
+ * GB18030 ---> MIC
+ * Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
+ */
+static void
+gb180302mic(unsigned char *gb18030, unsigned char *p, int len)
+{
+    int            c1;
+    int            c2;
+    /* Copies well-formed characters unchanged; len counts input bytes; output is NUL-terminated. */
+    while (len > 0 && (c1 = *gb18030++))
+    {
+        len--;                    /* lead byte consumed */
+        if (c1 < 0x80)
+        {                        /* should be ASCII */
+            *p++ = c1;
+        }
+        else if (c1 >= 0x81 && c1 <= 0xfe && len > 0)
+        {
+            c2 = *gb18030++;
+            len--;                /* second byte consumed */
+            if (c2 == '\0')        /* string ended inside a character */
+                break;
+            if (c2 >= 0x30 && c2 <= 0x39 && len >= 2)
+            {                    /* four-byte form: second byte is 0x30..0x39 */
+                len -= 2;
+                *p++ = c1;
+                *p++ = c2;
+                *p++ = *gb18030++;
+                *p++ = *gb18030++;
+            }
+            else if ((c2 >= 0x40 && c2 <= 0x7e) || (c2 >= 0x80 && c2 <= 0xfe))
+            {                    /* two-byte form */
+                *p++ = c1;
+                *p++ = c2;
+            }
+            /* any other second byte: throw the strange code */
+        }
+        /* lead bytes 0x80/0xff, or a lead byte with nothing after it, are dropped */
+    }
+    *p = '\0';
+}
+
+/*
+ * MIC ---> GB18030
+ * Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
+ */
+static void
+mic2gb18030(unsigned char *mic, unsigned char *p, int len)
+{
+    int            c1;
+    int            c2;
+    /* Copies bytes from mic to p until len runs out or a NUL byte is met, then NUL-terminates p. */
+    while (len > 0 && (c1 = *mic))
+    {
+        len -= pg_mic_mblen(mic++);    /* NOTE(review): the branches below consume 1, 2 or 4 bytes themselves -- confirm this agrees with pg_mic_mblen */
+
+        if (c1 <= 0x7f) /* ASCII: copied as-is */
+        {
+            *p++ = c1;
+        }
+        else if (c1 >= 0x81 && c1 <= 0xfe)
+        {
+            c2 = *mic++;    /* second byte decides the form */
+
+            if((c2 >= 0x40 && c2 <= 0x7e) || (c2 >= 0x80 && c2 <= 0xfe)){    /* two bytes copied */
+                *p++ = c1;
+                *p++ = c2;
+            }
+            else if(c2 >= 0x30 && c2 <= 0x39){    /* four bytes copied */
+                *p++ = c1;
+                *p++ = c2;
+                *p++ = *mic++;
+                *p++ = *mic++;
+            }
+            else{    /* unexpected second byte */
+                mic--;    /* NOTE(review): backs up to c2 only; whether c1 is reported depends on how far printBogusChar advances mic -- confirm */
+                printBogusChar(&mic, &p);
+                mic--;
+                printBogusChar(&mic, &p);
+            }
+        }
+        else{    /* c1 is 0x80 or 0xff */
+            mic--;    /* back up to c1 before reporting it */
+            printBogusChar(&mic, &p);
+        }
+    }
+    *p = '\0';
+}
+
+/*
  * EUC_TW ---> MIC
  */
 static void
@@ -1583,6 +1673,26 @@
 }

 /*
+ * UTF-8 ---> GB18030
+ */
+static void
+utf_to_gb18030(unsigned char *utf, unsigned char *euc, int len)
+{
+    /* NOTE(review): looks up ULmapEUC_CN rather than a GB18030-specific map -- confirm coverage. */
+    int            nmap = sizeof(ULmapEUC_CN) / sizeof(pg_utf_to_local);
+    utf_to_local(utf, euc, ULmapEUC_CN, nmap, len);
+}
+
+/*
+ * GB18030 ---> UTF-8
+ */
+static void
+gb18030_to_utf(unsigned char *euc, unsigned char *utf, int len)
+{
+    int            nmap = sizeof(LUmapEUC_CN) / sizeof(pg_local_to_utf);    /* NOTE(review): EUC_CN map and PG_EUC_CN id are reused here -- confirm */
+    local_to_utf(euc, utf, LUmapEUC_CN, nmap, PG_EUC_CN, len);
+}
+/*
  * UTF-8 ---> EUC_KR
  */
 static void
@@ -1754,6 +1864,9 @@
         PG_BIG5, big52mic, mic2big5, big5_to_utf, utf_to_big5
     },
     {
+        PG_GB18030, gb180302mic, mic2gb18030, gb18030_to_utf, utf_to_gb18030
+    },
+    {
         PG_WIN1250, win12502mic, mic2win1250, 0, 0
     },
 };
@@ -1841,6 +1954,9 @@
         PG_BIG5, big52mic, mic2big5, 0, 0
     },
     {
+        PG_GB18030, gb180302mic, mic2gb18030, 0, 0
+    },
+    {
         PG_WIN1250, win12502mic, mic2win1250, 0, 0
     },
 };
--- postgresql-7.2.1/src/backend/utils/mb/encnames.c.org    Mon Jun  3 19:24:10 2002
+++ postgresql-7.2.1/src/backend/utils/mb/encnames.c    Mon Jun  3 19:25:26 2002
@@ -173,6 +173,9 @@
     {
         "windows1251", PG_WIN1251
     },                            /* Windows-1251; Microsoft */
+    {
+        "gb18030", PG_GB18030
+    },                            /* GB18030; GB18030 */

     {
         NULL, 0
@@ -268,6 +271,9 @@
         "BIG5", PG_BIG5
     },
     {
+        "GB18030", PG_GB18030
+    },
+    {
         "WIN1250", PG_WIN1250
     }
 };
--- postgresql-7.2.1/src/interfaces/odbc/multibyte.c.org    Wed Jun  5 18:28:30 2002
+++ postgresql-7.2.1/src/interfaces/odbc/multibyte.c    Wed Jun  5 19:48:01 2002
@@ -48,6 +48,28 @@
                         mb_st = 0;
                 }
                 break;
+                /* Chinese GB18030 support
+                 * By Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
+                 * */
+            case GB18030:
+                {
+                    /*
+                     * mb_st after each byte: 2 = lead byte seen,
+                     * 3 = inside a four-byte character, 1 = a character
+                     * just ended, 0 = none of these.  Lead bytes start at
+                     * 0x81; a digit (0x30..0x39) as second byte selects
+                     * the four-byte form.
+                     */
+                    if (mb_st < 2 && s[i] >= 0x81)
+                        mb_st = 2;
+                    else if (mb_st == 2)    /* second byte: digit => four-byte form */
+                        mb_st = (s[i] >= 0x30 && s[i] <= 0x39) ? 3 : 1;
+                    else if (mb_st == 3)    /* stays 3 until the closing digit byte */
+                        mb_st = (s[i] >= 0x30 && s[i] <= 0x39) ? 1 : 3;
+                    else
+                        mb_st = 0;
+                }
+                break;
+
             default:
                 mb_st = 0;
         }
@@ -87,6 +109,16 @@
     {
         multibyte_client_encoding = BIG5;
         return ("BIG5");
+    }/* Chinese GB18030 support.
+      * Added by Bill Huang  <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
+      */
+    if (strstr(str, "%27GB18030%27") ||
+        strstr(str, "%27gb18030%27") ||
+        strstr(str, "'GB18030'") ||
+        strstr(str, "'gb18030'") )
+    {
+        multibyte_client_encoding = GB18030;
+        return ("GB18030");
     }
     return ("OTHER");
 }
@@ -127,6 +159,25 @@
                 else
                     multibyte_status = 0;
             }
+            break;
+            /*Chinese GB18030 support.Added by Bill Huang <bhuang@redhat.com> <bill_huanghb@ybb.ne.jp>*/
+        case GB18030:
+            {
+                /*
+                 * multibyte_status after each byte: 2 = lead byte seen,
+                 * 3 = inside a four-byte character, 1 = a character just
+                 * ended, 0 = none of these.  The state tests compare
+                 * with ==, so the current state is only read here and
+                 * written by the assignments below.
+                 */
+                if (multibyte_status < 2 && s >= 0x81)
+                    multibyte_status = 2;
+                else if (multibyte_status == 2)    /* second byte: digit => four-byte form */
+                    multibyte_status = (s >= 0x30 && s <= 0x39) ? 3 : 1;
+                else if (multibyte_status == 3)    /* stays 3 until the closing digit byte */
+                    multibyte_status = (s >= 0x30 && s <= 0x39) ? 1 : 3;
+                else
+                    multibyte_status = 0;
+            }
             break;
         default:
             multibyte_status = 0;
--- postgresql-7.2.1/src/interfaces/odbc/multibyte.h.org    Wed Jun  5 19:51:20 2002
+++ postgresql-7.2.1/src/interfaces/odbc/multibyte.h    Wed Jun  5 19:51:35 2002
@@ -28,6 +28,7 @@
 #define SJIS                32    /* Shift JIS */
 #define BIG5                33    /* Big5 */
 #define WIN1250                34    /* windows-1250 */
+#define GB18030                35    /* GB18030 */

 extern int    multibyte_client_encoding;    /* Multibyte client encoding. */
 extern int    multibyte_status;    /* Multibyte charcter status. */
--- postgresql-7.2.1/src/include/mb/pg_wchar.h.org    Mon May 27 20:07:58 2002
+++ postgresql-7.2.1/src/include/mb/pg_wchar.h    Mon May 27 20:08:59 2002
@@ -182,6 +182,7 @@
     /* followings are for client encoding only */
     PG_SJIS,                    /* Shift JIS */
     PG_BIG5,                    /* Big5 */
+    PG_GB18030,                    /* GB18030 */
     PG_WIN1250,                    /* windows-1250 */

     _PG_LAST_ENCODING_            /* mark only */
--- postgresql-7.2.1/src/backend/utils/mb/wchar.c.org    Mon May 27 20:02:44 2002
+++ postgresql-7.2.1/src/backend/utils/mb/wchar.c    Mon May 27 20:03:12 2002
@@ -457,6 +457,33 @@
     return (len);
 }

+/*
+ * GB18030
+ * Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
+ */
+static int
+pg_gb18030_mblen(const unsigned char *s)
+{
+    int            len;
+    /* Returns the byte length (1, 2 or 4) of the character starting at s. */
+    if (*s <= 0x7f)
+    {                            /* should be ASCII */
+        len = 1;
+    }
+    else
+    {                            /* multibyte: the second byte decides */
+
+        if((*(s+1) >= 0x40 && *(s+1) <= 0x7e)
+                || (*(s+1) >= 0x80 && *(s+1) <= 0xfe))
+            len = 2;
+        else if(*(s+1) >= 0x30 && *(s+1) <= 0x39)    /* 0x30..0x39 inclusive */
+            len = 4;
+        else
+            len = 2;            /* unexpected second byte: still stepped over as two */
+    }
+    return (len);
+}
+
 pg_wchar_tbl pg_wchar_table[] = {
     {pg_ascii2wchar_with_len, pg_ascii_mblen, 1},        /* 0; PG_SQL_ASCII    */
     {pg_eucjp2wchar_with_len, pg_eucjp_mblen, 3},        /* 1; PG_EUC_JP */
@@ -483,6 +510,7 @@
     {pg_latin12wchar_with_len, pg_latin1_mblen, 1},        /* 22; ISO-8859-15 */
     {pg_latin12wchar_with_len, pg_latin1_mblen, 1},        /* 23; ISO-8859-16 */
     {0, pg_sjis_mblen, 2},        /* 24; PG_SJIS */
+    {0, pg_gb18030_mblen, 2},    /* 25; PG_GB18030 */
     {0, pg_big5_mblen, 2},        /* 25; PG_BIG5 */
     {pg_latin12wchar_with_len, pg_latin1_mblen, 1}        /* 26; PG_WIN1250 */
 };

pgsql-patches by date:

Previous
From: Bruce Momjian
Date:
Subject: Re: Cygwin InstallXLogFileSegment() rename() patch
Next
From: "Dave Page"
Date:
Subject: FW: Patch for current_schemas to optionally include implicit schemas