From 9564aa018c7eae9d66589b2be4303f8aede94f77 Mon Sep 17 00:00:00 2001 From: Arjen Nienhuis Date: Sun, 3 May 2015 22:28:26 +0200 Subject: [PATCH] Have GB18030 handle more than 2-byte Unicode code points BUG #12845: The GB18030 encoding doesn't support Unicode characters over 0xFFFF SELECT convert_to(chr(128512), 'GB18030'); expected result: convert_to ------------ \x9439fc36 (1 row) --- .../utf8_and_gb18030/utf8_and_gb18030.c | 279 ++++++++++++++++++++- src/backend/utils/mb/wchar.c | 2 +- src/include/mb/pg_wchar.h | 1 + src/test/regress/expected/conversion.out | 16 +- src/test/regress/sql/conversion.sql | 4 +- 5 files changed, 287 insertions(+), 15 deletions(-) diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c b/src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c index 4427fea..c645831 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c @@ -25,6 +25,16 @@ PG_FUNCTION_INFO_V1(utf8_to_gb18030); extern Datum gb18030_to_utf8(PG_FUNCTION_ARGS); extern Datum utf8_to_gb18030(PG_FUNCTION_ARGS); +static uint32 utf8_to_gb18030_hi(uint32 utf8); +static uint32 gb18030_to_utf8_hi(uint32 gb); +static int compare1(const void *p1, const void *p2); +static int compare2(const void *p1, const void *p2); + +/* All Unicode codepoints over U+FFFF are mapped to one range in the GB18030 encoding */ +static const uint32 UTF32_FIRST = 0x10000; +static const uint32 GB18030_FIRST = 0x90308130; +static const uint32 GB18030_LAST = 0xe3329a35; + /* ---------- * conv_proc( * INTEGER, -- source encoding id @@ -41,11 +51,84 @@ gb18030_to_utf8(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + unsigned int iiso; + unsigned int outf; + int l; + pg_local_to_utf *p; CHECK_ENCODING_CONVERSION_ARGS(PG_GB18030, 
PG_UTF8); - LocalToUtf(src, dest, LUmapGB18030, NULL, - sizeof(LUmapGB18030) / sizeof(pg_local_to_utf), 0, PG_GB18030, len); + for (; len > 0; len -= l) + { + /* "break" cases all represent errors */ + if (*src == '\0') + break; + + if (!IS_HIGHBIT_SET(*src)) + { + /* ASCII case is easy */ + *dest++ = *src++; + l = 1; + continue; + } + + l = pg_gb18030_verifier(src, len); + if (l < 0) + break; + + if (l == 2) + { + iiso = *src++ << 8; + iiso |= *src++; + } + else if (l == 4) + { + iiso = *src++ << 24; + iiso |= *src++ << 16; + iiso |= *src++ << 8; + iiso |= *src++; + + if (iiso >= GB18030_FIRST && iiso <= GB18030_LAST) + { + outf = gb18030_to_utf8_hi(iiso); + *dest++ = outf >> 24; + *dest++ = (outf & 0x00ff0000) >> 16; + *dest++ = (outf & 0x0000ff00) >> 8; + *dest++ = outf & 0x000000ff; + continue; + } + } + else + { + elog(ERROR, "unsupported character length %d", l); + iiso = 0; /* keep compiler quiet */ + } + + p = bsearch(&iiso, LUmapGB18030, sizeof(LUmapGB18030) / sizeof(pg_local_to_utf), + sizeof(pg_local_to_utf), compare2); + + if (p == NULL) + { + report_untranslatable_char(PG_GB18030, PG_UTF8, + (const char *) (src - l), len); + } + else + { + if (p->utf & 0xff000000) + *dest++ = p->utf >> 24; + if (p->utf & 0x00ff0000) + *dest++ = (p->utf & 0x00ff0000) >> 16; + if (p->utf & 0x0000ff00) + *dest++ = (p->utf & 0x0000ff00) >> 8; + if (p->utf & 0x000000ff) + *dest++ = p->utf & 0x000000ff; + } + } + + if (len > 0) + report_invalid_encoding(PG_GB18030, (const char *) src, len); + + *dest = '\0'; PG_RETURN_VOID(); } @@ -56,11 +139,199 @@ utf8_to_gb18030(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + uint32 iutf; + uint32 code; + pg_utf_to_local *p; + int l; CHECK_ENCODING_CONVERSION_ARGS(PG_UTF8, PG_GB18030); - UtfToLocal(src, dest, ULmapGB18030, NULL, - sizeof(ULmapGB18030) / sizeof(pg_utf_to_local), 0, PG_GB18030, len); + for (; len > 0; 
len -= l) + { + /* "break" cases all represent errors */ + if (*src == '\0') + break; + + l = pg_utf_mblen(src); + + if (len < l) + break; + + if (!pg_utf8_islegal(src, l)) + break; + + if (l == 1) + { + /* ASCII case is easy */ + *dest++ = *src++; + continue; + } + else if (l == 2) + { + iutf = *src++ << 8; + iutf |= *src++; + } + else if (l == 3) + { + iutf = *src++ << 16; + iutf |= *src++ << 8; + iutf |= *src++; + } + else if (l == 4) + { + iutf = *src++ << 24; + iutf |= *src++ << 16; + iutf |= *src++ << 8; + iutf |= *src++; + /* 4 byte codes all map to the linear range */ + code = utf8_to_gb18030_hi(iutf); + *dest++ = code >> 24; + *dest++ = (code & 0x00ff0000) >> 16; + *dest++ = (code & 0x0000ff00) >> 8; + *dest++ = code & 0x000000ff; + continue; + } + else + { + elog(ERROR, "unsupported character length %d", l); + iutf = 0; /* keep compiler quiet */ + } + + p = bsearch(&iutf, ULmapGB18030, sizeof(ULmapGB18030) / sizeof(pg_utf_to_local), + sizeof(pg_utf_to_local), compare1); + if (p == NULL) + report_untranslatable_char(PG_UTF8, PG_GB18030, + (const char *) (src - l), len); + code = p->code; + /* GB18030 is always 1, 2 or 4 bytes. 1 byte is handled above */ + if (code & 0xffff0000) + { + *dest++ = code >> 24; + *dest++ = (code & 0x00ff0000) >> 16; + } + *dest++ = (code & 0x0000ff00) >> 8; + *dest++ = code & 0x000000ff; + } + + if (len > 0) + report_invalid_encoding(PG_UTF8, (const char *) src, len); + + *dest = '\0'; PG_RETURN_VOID(); } + +/* + * comparison routine for bsearch() + * this routine is intended for UTF8 -> local code + */ +static int +compare1(const void *p1, const void *p2) +{ + uint32 v1, + v2; + + v1 = *(const uint32 *) p1; + v2 = ((const pg_utf_to_local *) p2)->utf; + return (v1 > v2) ? 1 : ((v1 == v2) ? 
0 : -1); +} + +/* + * comparison routine for bsearch() + * this routine is intended for local code -> UTF8 + */ +static int +compare2(const void *p1, const void *p2) +{ + uint32 v1, + v2; + + v1 = *(const uint32 *) p1; + v2 = ((const pg_local_to_utf *) p2)->code; + return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1); +} + +/* + * Convert UTF32 to UTF-8 + * Works only for >= U+10000 + */ +static uint32 +utf32_to_utf8_hi(uint32 utf32) { + uint32 b1 = (utf32 >> 18) | 0xF0; + uint32 b2 = ((utf32 >> 12) & 0x3F) | 0x80; + uint32 b3 = ((utf32 >> 6) & 0x3F) | 0x80; + uint32 b4 = (utf32 & 0x3F) | 0x80; + return (b1 << 24) | (b2 << 16) | (b3 << 8) | (b4 << 0); +} + +/* + * Convert UTF-8 to UTF32 + * Works only for >= U+10000 + */ +static uint32 +utf8_to_utf32_hi(uint32 utf8) { + /* assert(utf8 > 0xffffff); */ + uint32 b1 = (utf8 & 0x07000000) >> 6; + uint32 b2 = (utf8 & 0x003f0000) >> 4; + uint32 b3 = (utf8 & 0x00003f00) >> 2; + uint32 b4 = (utf8 & 0x0000003f) >> 0; + return b1 | b2 | b3 | b4; +} + +static uint32 +gb_linear(uint32 gb) { + uint32 b0 = (gb & 0xff000000) >> 24; + uint32 b1 = (gb & 0x00ff0000) >> 16; + uint32 b2 = (gb & 0x0000ff00) >> 8; + uint32 b3 = (gb & 0x000000ff) >> 0; + return b0 * 12600 + b1 * 1260 + b2 * 10 + b3; +} + +static uint32 +gb_unlinear(uint32 lin) { + uint32 zlin = lin - gb_linear(0x81308130); + uint32 r3 = 0x30 + zlin % 10; + uint32 r2 = 0x81 + (zlin / 10) % 126; + uint32 r1 = 0x30 + (zlin / 1260) % 10; + uint32 r0 = 0x81 + zlin / 12600; + return (r0 << 24) | (r1 << 16) | (r2 << 8) | (r3 << 0); +} + +/* + * Convert GB18030 to UTF32 + * Works only for >= U+10000 + */ +static uint32 +gb_to_utf32_hi(uint32 gb) +{ + return UTF32_FIRST + (gb_linear(gb) - gb_linear(GB18030_FIRST)); +} + +/* + * Convert UTF32 to GB18030 + * Works only for >= U+10000 + */ +static uint32 +utf32_to_gb18030_hi(uint32 utf32) { + return gb_unlinear(gb_linear(GB18030_FIRST) + utf32 - UTF32_FIRST); +} + +/* + * Convert UTF-8 to GB18030 + * Works only for >= U+10000 + */ +static 
uint32 +utf8_to_gb18030_hi(uint32 utf8) { + uint32 utf32 = utf8_to_utf32_hi(utf8); + return utf32_to_gb18030_hi(utf32); +} + +/* + * Convert GB18030 to UTF-8 + * Works only for >= U+10000 + */ +static uint32 +gb18030_to_utf8_hi(uint32 gb) { + uint32 utf32 = gb_to_utf32_hi(gb); + return utf32_to_utf8_hi(utf32); +} diff --git a/src/backend/utils/mb/wchar.c b/src/backend/utils/mb/wchar.c index 0cc753e..f19a19c 100644 --- a/src/backend/utils/mb/wchar.c +++ b/src/backend/utils/mb/wchar.c @@ -1400,7 +1400,7 @@ pg_uhc_verifier(const unsigned char *s, int len) return mbl; } -static int +int pg_gb18030_verifier(const unsigned char *s, int len) { int l, diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h index f7222fc..ce757b9 100644 --- a/src/include/mb/pg_wchar.h +++ b/src/include/mb/pg_wchar.h @@ -531,6 +531,7 @@ extern void mic2latin_with_table(const unsigned char *mic, unsigned char *p, int len, int lc, int encoding, const unsigned char *tab); +extern int pg_gb18030_verifier(const unsigned char *s, int len); extern bool pg_utf8_islegal(const unsigned char *source, int length); #ifdef WIN32 diff --git a/src/test/regress/expected/conversion.out b/src/test/regress/expected/conversion.out index 82eca26..13f1cf3 100644 --- a/src/test/regress/expected/conversion.out +++ b/src/test/regress/expected/conversion.out @@ -523,17 +523,17 @@ SELECT CONVERT('foo', 'UTF8', 'EUC_TW'); (1 row) -- GB18030 --> UTF8 -SELECT CONVERT('foo', 'GB18030', 'UTF8'); - convert ---------- - foo +SELECT CONVERT('Postgres \247\343\247\335\247\340\247\337 \2249\3138 \317\363 \250\246le\2010\2747phant', 'GB18030', 'UTF8'); + convert +------------------------------------------------------------------------------------------------- + Postgres \321\201\320\273\320\276\320\275 \360\237\220\230 \350\261\241 \303\251le\314\201phant (1 row) -- UTF8 --> GB18030 -SELECT CONVERT('foo', 'UTF8', 'GB18030'); - convert ---------- - foo +SELECT CONVERT('Postgres \321\201\320\273\320\276\320\275 
\360\237\220\230 \350\261\241 \303\251le\314\201phant', 'UTF-8', 'GB18030'); + convert +----------------------------------------------------------------------------------------- + Postgres \247\343\247\335\247\340\247\337 \2249\3138 \317\363 \250\246le\2010\2747phant (1 row) -- GBK --> UTF8 diff --git a/src/test/regress/sql/conversion.sql b/src/test/regress/sql/conversion.sql index be194ee..e27f06f 100644 --- a/src/test/regress/sql/conversion.sql +++ b/src/test/regress/sql/conversion.sql @@ -171,9 +171,9 @@ SELECT CONVERT('foo', 'EUC_TW', 'UTF8'); -- UTF8 --> EUC_TW SELECT CONVERT('foo', 'UTF8', 'EUC_TW'); -- GB18030 --> UTF8 -SELECT CONVERT('foo', 'GB18030', 'UTF8'); +SELECT CONVERT('Postgres \247\343\247\335\247\340\247\337 \2249\3138 \317\363 \250\246le\2010\2747phant', 'GB18030', 'UTF8'); -- UTF8 --> GB18030 -SELECT CONVERT('foo', 'UTF8', 'GB18030'); +SELECT CONVERT('Postgres \321\201\320\273\320\276\320\275 \360\237\220\230 \350\261\241 \303\251le\314\201phant', 'UTF-8', 'GB18030'); -- GBK --> UTF8 SELECT CONVERT('foo', 'GBK', 'UTF8'); -- UTF8 --> GBK -- 2.1.0