[v9.2] make_greater_string() does not return a string in some cases - Mailing list pgsql-hackers
From | Kyotaro HORIGUCHI |
---|---|
Subject | [v9.2] make_greater_string() does not return a string in some cases |
Date | |
Msg-id | 20110914.111320.18404009.horiguchi.kyotaro@oss.ntt.co.jp Whole thread Raw |
In response to | Re: make_greater_string() does not return a string in some cases (Kyotaro HORIGUCHI <horiguchi.kyotaro@oss.ntt.co.jp>) |
Responses |
Re: [v9.2] make_greater_string() does not return a string in some cases
Re: [v9.2] make_greater_string() does not return a string in some cases |
List | pgsql-hackers |
This is a rebased version of the patch `Allow encoding specific character incrementer' (https://commitfest.postgresql.org/action/patch_view?id=602). In addition to the patch, sanity-check programs for the new increment functions pg_generic_charinc and pg_utf8_increment are attached. -- Kyotaro Horiguchi NTT Open Source Software Center diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index 5d999e6..b7f1922 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -5652,6 +5652,18 @@ pattern_selectivity(Const *patt, Pattern_Type ptype)/* + * This function is "character increment" function for bytea used in + * make_greater_string() that has same interface with pg_wchar_tbl.charinc. + */ +static bool byte_increment(unsigned char *ptr, int len) +{ + if (*ptr >= 255) return false; + + (*ptr)++; + return true; +} + +/* * Try to generate a string greater than the given string or any * string it is a prefix of. If successful, return apalloc'd string * in the form of a Const node; else return NULL. @@ -5690,6 +5702,7 @@ make_greater_string(const Const *str_const, FmgrInfo *ltproc, Oid collation) int len; Datum cmpstr; text *cmptxt = NULL; + character_incrementer charincfunc; /* * Get a modifiable copy of the prefix string in C-string format, and set @@ -5751,27 +5764,38 @@ make_greater_string(const Const *str_const, FmgrInfo *ltproc, Oid collation) } } + if (datatype != BYTEAOID) + charincfunc = pg_database_encoding_character_incrementer(); + else + charincfunc = &byte_increment; + while (len > 0) { - unsigned char *lastchar = (unsigned char *) (workstr + len - 1); - unsigned char savelastchar = *lastchar; + int charlen; + unsigned char *lastchar; + unsigned char savelastbyte; + Const *workstr_const; + + if (datatype == BYTEAOID) + charlen = 1; + else + charlen = len - pg_mbcliplen(workstr, len, len - 1); + + lastchar = (unsigned char *) (workstr + len - charlen); /* - * Try to generate a larger string by incrementing the last byte. 
+ * savelastbyte has meaning only for datatype == BYTEAOID */ - while (*lastchar < (unsigned char) 255) - { - Const *workstr_const; + savelastbyte = *lastchar; - (*lastchar)++; + /* + * Try to generate a larger string by incrementing the last byte or + * character. + */ + if (charincfunc(lastchar, charlen)) { if (datatype != BYTEAOID) - { - /* do not generate invalid encoding sequences */ - if (!pg_verifymbstr(workstr, len, true)) - continue; workstr_const = string_to_const(workstr, datatype); - } else workstr_const = string_to_bytea_const(workstr, len); @@ -5786,26 +5810,17 @@ make_greater_string(const Const *str_const, FmgrInfo *ltproc, Oid collation) pfree(workstr); return workstr_const; } - + /* No good, release unusable value and try again */ pfree(DatumGetPointer(workstr_const->constvalue)); pfree(workstr_const); } - /* restore last byte so we don't confuse pg_mbcliplen */ - *lastchar = savelastchar; - /* - * Truncate off the last character, which might be more than 1 byte, - * depending on the character encoding. + * Truncate off the last character or restore last byte for BYTEA. */ - if (datatype != BYTEAOID && pg_database_encoding_max_length() > 1) - len = pg_mbcliplen(workstr, len, len - 1); - else - len -= 1; - - if (datatype != BYTEAOID) - workstr[len] = '\0'; + len -= charlen; + workstr[len] = (datatype != BYTEAOID ? '\0' : savelastbyte); } /* Failed... */ diff --git a/src/backend/utils/mb/wchar.c b/src/backend/utils/mb/wchar.c index f23732f..00b3e2a 100644 --- a/src/backend/utils/mb/wchar.c +++ b/src/backend/utils/mb/wchar.c @@ -1,3 +1,4 @@ +/* * conversion functions between pg_wchar and multibyte streams. * Tatsuo Ishii @@ -1336,53 +1337,254 @@ pg_utf8_islegal(const unsigned char *source, int length)/* *------------------------------------------------------------------- + * character incrementer + * + * These functions accept "charptr", a pointer to the first byte of a + * maybe-multibyte character. 
Try `increment' the character and return true if + * successed. If these functions returns false, the character should be + * untouched. These functions must be implemented in correspondence with + * verifiers, in other words, the rewrited character by this function must pass + * the check by pg_*_verifier() if returns true. Returning the return value of + * pg_*_verifier() corresponding can finnaly avoid such a inconsistency when + * something wrong. + * ------------------------------------------------------------------- + */ + +#ifndef FRONTEND +static bool pg_generic_charinc(unsigned char *charptr, int len) +{ + unsigned char *lastchar = (unsigned char *) (charptr + len - 1); + unsigned char savelastchar = *lastchar; + const char *const_charptr = (const char *)charptr; + + while (*lastchar < (unsigned char) 255) + { + (*lastchar)++; + if (!pg_verifymbstr(const_charptr, len, true)) + continue; + return true; + } + + *lastchar = savelastchar; + return false; +} + +static bool pg_utf8_increment(unsigned char *charptr, int length) +{ + unsigned char a; + unsigned char bak[4]; + bool success; + + memcpy(bak, charptr, length); + switch (length) + { + default: + /* reject lengths 5 and 6 for now */ + return false; + case 4: + a = charptr[3]; + if (a < 0xBF) + { + charptr[3]++; + break; + } + charptr[3] = 0x80; + /* FALL THRU */ + case 3: + a = charptr[2]; + if (a < 0xBF) + { + charptr[2]++; + break; + } + charptr[2] = 0x80; + /* FALL THRU */ + case 2: + a = charptr[1]; + if ((*charptr == 0xed && a < 0x9F) || a < 0xBF) + { + charptr[1]++; + break; + } + charptr[1] = 0x80; + /* FALL THRU */ + case 1: + a = *charptr; + if (a == 0x7F || a == 0xDF || a == 0xEF || a == 0xF7) { + memcpy(charptr, bak, length); + return false; + } + charptr[0]++; + break; + } + + /* Check the result with pg_utf8_islegal as the last resort. 
*/ + success = pg_utf8_islegal(charptr, length); + if (!success) + memcpy(charptr, bak, length); + + return success; +} + +static bool pg_eucjp_increment(unsigned char *charptr, int length) { + unsigned char bak[3]; + bool success; + unsigned char c1, c2; + signed int i; + + memcpy(bak, charptr, length); + + c1 = *charptr; + + switch (c1) + { + case SS2: /* JIS X 0201 */ + if (length != 2) return false; + + c2 = charptr[1]; + + if (c2 > 0xde) + charptr[0] = charptr[1] = 0xa1; + else if (c2 < 0xa1) + charptr[1] = 0xa1; + else + charptr[1]++; + + break; + + case SS3: /* JIS X 0212 */ + if (length != 3) return false; + + for (i = 2 ; i > 1 ; i--) + { + c2 = charptr[i]; + if (c2 < 0xa1) + { + charptr[i] = 0xa1; + return true; + } + else if (c2 < 0xfe) + { + charptr[i]++; + break; + } + charptr[i] = 0xa1; + } + + + if (i == 0) /* Out of code region */ + { + memcpy(charptr, bak, length); + return false; + } + + break; + + default: + if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */ + { + if (length != 2) return false; + + for (i = 1 ; i >= 0 ; i--) /* i must be signed */ + { + c2 = charptr[i]; + if (c2 < 0xa1) + { + charptr[i] = 0xa1; + return true; + } + else if (c2 < 0xfe) + { + charptr[i]++; + break; + } + charptr[i] = 0xa1; + } + + if (i < 0) /* Out of 2 byte code region */ + { + memcpy(charptr, bak, length); + return false; + } + } + else + { /* ASCII */ + if (c1 > 0x7e) + return false; + (*charptr)++; + } + } + + + /* Check the result with pg_eucjp_verifier as the last resort. */ + success = (pg_eucjp_verifier(charptr, length) == length); + if (!success) + memcpy(charptr, bak, length); + + return success; +} +#else +/* + * Character increment functions are not available on frontend. Abort on call + * to prevent miseuse. 
+ */ +static bool pg_generic_charinc(unsigned char *charptr, int len) { + fputs(_("Character incrementer cannot be used in frontend.\n"), stderr); + abort(); +} +#define pg_utf8_increment pg_generic_charinc +#define pg_eucjp_increment pg_generic_charinc +#endif + +/* + *------------------------------------------------------------------- * encoding info table * XXX must be sorted by thesame order as enum pg_enc (in mb/pg_wchar.h) *------------------------------------------------------------------- */pg_wchar_tblpg_wchar_table[] = { - {pg_ascii2wchar_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifier, 1}, /* PG_SQL_ASCII */ - {pg_eucjp2wchar_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifier, 3}, /* PG_EUC_JP */ - {pg_euccn2wchar_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifier, 2}, /* PG_EUC_CN */ - {pg_euckr2wchar_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifier, 3}, /* PG_EUC_KR */ - {pg_euctw2wchar_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifier, 4}, /* PG_EUC_TW */ - {pg_eucjp2wchar_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifier, 3}, /* PG_EUC_JIS_2004 */ - {pg_utf2wchar_with_len, pg_utf_mblen, pg_utf_dsplen, pg_utf8_verifier, 4}, /* PG_UTF8 */ - {pg_mule2wchar_with_len, pg_mule_mblen, pg_mule_dsplen, pg_mule_verifier, 4}, /* PG_MULE_INTERNAL */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN1 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN2 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN3 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN4 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN5 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN6 */ - {pg_latin12wchar_with_len, 
pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN7 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN8 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN9 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN10 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1256 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1258 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN866 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN874 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_KOI8R */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1251 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1252 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-5 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-6 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-7 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-8 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1250 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1253 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1254 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1255 */ - {pg_latin12wchar_with_len, 
pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1257 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_KOI8U */ - {0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifier, 2}, /* PG_SJIS */ - {0, pg_big5_mblen, pg_big5_dsplen, pg_big5_verifier, 2}, /* PG_BIG5 */ - {0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifier, 2}, /* PG_GBK */ - {0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifier, 2}, /* PG_UHC */ - {0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifier, 4}, /* PG_GB18030 */ - {0, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifier, 3}, /* PG_JOHAB */ - {0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifier, 2} /* PG_SHIFT_JIS_2004 */ + {pg_ascii2wchar_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_generic_charinc, pg_ascii_verifier, 1}, /* PG_SQL_ASCII*/ + {pg_eucjp2wchar_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_increment, pg_eucjp_verifier, 3}, /* PG_EUC_JP*/ + {pg_euccn2wchar_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_generic_charinc, pg_euccn_verifier, 2}, /* PG_EUC_CN*/ + {pg_euckr2wchar_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_generic_charinc, pg_euckr_verifier, 3}, /* PG_EUC_KR*/ + {pg_euctw2wchar_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_generic_charinc, pg_euctw_verifier, 4}, /* PG_EUC_TW*/ + {pg_eucjp2wchar_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_increment, pg_eucjp_verifier, 3}, /* PG_EUC_JIS_2004*/ + {pg_utf2wchar_with_len, pg_utf_mblen, pg_utf_dsplen, pg_utf8_increment, pg_utf8_verifier, 4}, /* PG_UTF8 */ + {pg_mule2wchar_with_len, pg_mule_mblen, pg_mule_dsplen, pg_generic_charinc, pg_mule_verifier, 4}, /* PG_MULE_INTERNAL*/ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1}, /*PG_LATIN1 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1}, /*PG_LATIN2 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, 
pg_latin1_verifier, 1}, /*PG_LATIN3 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1}, /*PG_LATIN4 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1}, /*PG_LATIN5 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1}, /*PG_LATIN6 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1}, /*PG_LATIN7 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1}, /*PG_LATIN8 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1}, /*PG_LATIN9 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1}, /*PG_LATIN10 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1}, /*PG_WIN1256 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1}, /*PG_WIN1258 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1}, /*PG_WIN866 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1}, /*PG_WIN874 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1}, /*PG_KOI8R */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1}, /*PG_WIN1251 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1}, /*PG_WIN1252 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1}, /*ISO-8859-5 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1}, 
/*ISO-8859-6 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1}, /*ISO-8859-7 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1}, /*ISO-8859-8 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1}, /*PG_WIN1250 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1}, /*PG_WIN1253 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1}, /*PG_WIN1254 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1}, /*PG_WIN1255 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1}, /*PG_WIN1257 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1}, /*PG_KOI8U */ + {0, pg_sjis_mblen, pg_sjis_dsplen, pg_generic_charinc, pg_sjis_verifier, 2}, /* PG_SJIS */ + {0, pg_big5_mblen, pg_big5_dsplen, pg_generic_charinc, pg_big5_verifier, 2}, /* PG_BIG5 */ + {0, pg_gbk_mblen, pg_gbk_dsplen, pg_generic_charinc, pg_gbk_verifier, 2}, /* PG_GBK */ + {0, pg_uhc_mblen, pg_uhc_dsplen, pg_generic_charinc, pg_uhc_verifier, 2}, /* PG_UHC */ + {0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_generic_charinc, pg_gb18030_verifier, 4}, /* PG_GB18030 */ + {0, pg_johab_mblen, pg_johab_dsplen, pg_generic_charinc, pg_johab_verifier, 3}, /* PG_JOHAB */ + {0, pg_sjis_mblen, pg_sjis_dsplen, pg_generic_charinc, pg_sjis_verifier, 2} /* PG_SHIFT_JIS_2004 */};/* returnsthe byte length of a word for mule internal code */ @@ -1459,6 +1661,15 @@ pg_database_encoding_max_length(void)}/* + * give the character incrementer for the encoding for the current database + */ +character_incrementer +pg_database_encoding_character_incrementer(void) +{ + return 
pg_wchar_table[GetDatabaseEncoding()].charinc; +} + +/* * Verify mbstr to make sure that it is validly encoded in the current * database encoding. Otherwise same as pg_verify_mbstr().*/ diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h index 826c7af..356703a 100644 --- a/src/include/mb/pg_wchar.h +++ b/src/include/mb/pg_wchar.h @@ -284,6 +284,8 @@ typedef int (*mblen_converter) (const unsigned char *mbstr);typedef int (*mbdisplaylen_converter) (constunsigned char *mbstr); +typedef bool (*character_incrementer) (unsigned char *mbstr, int len); +typedef int (*mbverifier) (const unsigned char *mbstr, int len);typedef struct @@ -292,6 +294,7 @@ typedef struct * string to a wchar */ mblen_convertermblen; /* get byte length of a char */ mbdisplaylen_converter dsplen; /* get display widthof a char */ + character_incrementer charinc; /* Character code incrementer if not null */ mbverifier mbverify; /* verifymultibyte sequence */ int maxmblen; /* max bytes for a char in this encoding */} pg_wchar_tbl; @@ -389,6 +392,7 @@ extern int pg_encoding_mbcliplen(int encoding, const char *mbstr,extern int pg_mbcharcliplen(constchar *mbstr, int len, int imit);extern int pg_encoding_max_length(int encoding);extern int pg_database_encoding_max_length(void); +extern character_incrementer pg_database_encoding_character_incrementer(void);extern int PrepareClientEncoding(int encoding);externint SetClientEncoding(int encoding); // sanity test for utf specific character incrementer. // // -v displays status for invalid source code. // -m displays status for the result that the product of new // incrementer is match to the one of the generic incrementer. // show all status lines when both -v and -m are specified. // // `utftest | grep FAILED' shows remaining glitches using new // incrementer. (4 lines) // // CAUTION: this program yields so much lines. // // `utftest' yields 17375 lines. These lines are the saved by new func // and remaining glitches. 
// `utftest -m' yields 1112064 lines. // // // Sample of status lines: // src char src utf8 dst utf8 dest char result status // 000d7bf => ed9ebf => ed9f80 (000d7c0) successed - Don't match to generic inc(000d7bf) // 000d7ff => ed9fbf => ed9fbf (000d7ff) FAILED - Match to generic inc // 000d800 => eda080 Source not valid utf8 // // successed/FAILED in result status shows the return value of // character increment function. Following description says that the // result of the new incrementer was/was'nt identical to the generic // incrementer. #include <stdio.h> #include <stdarg.h> typedef int bool; static int true = 1; static int false = 0; static bool pg_utf8_increment(unsigned char *mbstr, int length); static bool pg_generic_charinc(unsigned char *charptr, int len); void uni2utf8(unsigned int unicode, unsigned char *utf8buf); unsigned int utf8tounicode(unsigned char *utf8buf); int scatf(char* buf, char* format, ...); int main(int argc, char** argv) { unsigned char buf[4], buf2[4]; char outbuf[1024]; unsigned int i; int dispinvalid = 0;int dispmatch = 0; for (i = 1 ; i < argc ; i++) {if (strcmp(argv[i], "-v") == 0) dispinvalid = 1;if (strcmp(argv[i], "-m") == 0) dispmatch= 1; } for(i = 0 ; i < 0x1010000 ; i++) {bool prechk, successed, gensuccess, match; uni2utf8(i, buf);uni2utf8(i, buf2);*outbuf = 0; scatf(outbuf, "%07x => ", i); int len = pg_utf_mblen(buf); int j = 0; while (j < len) scatf(outbuf, "%02x", buf[j++]); while (j < 4) { scatf(outbuf, " "); j++;} prechk = pg_utf8_islegal(buf, len);if (! prechk) { scatf(outbuf, "Source notvalid utf8"); if (dispinvalid) puts(outbuf); continue;} successed = pg_utf8_increment(buf, len);scatf(outbuf, " => ");j = 0;while (j < len) scatf(outbuf, "%02x", buf[j++]); while(j < 4) { scatf(outbuf, " "); j++;} gensuccess = pg_generic_charinc(buf2, len);match = (memcmp(buf, buf2, len) == 0); if (!gensuccess || !match || dispmatch) { scatf(outbuf, "(%07x) %s - %s", utf8tounicode(buf), (successed? "successed" : "FAILED"), (match ? 
"Match to generic inc" : "Don't match to generic inc")); if (!match){ scatf(outbuf, "(%07x)", utf8tounicode(buf2)); } puts(outbuf);} } } bool pg_utf8_islegal(const unsigned char *source, int length) {unsigned char a; switch (length){ default: /* reject lengths 5 and 6 for now */ return false; case 4: a = source[3]; if (a < 0x80 || a > 0xBF) return false; /* FALL THRU */ case 3: a = source[2]; if (a < 0x80 || a > 0xBF) return false; /* FALL THRU */ case 2: a = source[1]; switch (*source) { case 0xE0: if (a < 0xA0 || a > 0xBF) return false; break; case 0xED: if (a < 0x80 || a > 0x9F) return false; break; case 0xF0: if (a < 0x90 || a > 0xBF) returnfalse; break; case 0xF4: if (a < 0x80 || a > 0x8F) returnfalse; break; default: if (a < 0x80 || a > 0xBF) returnfalse; break; } /* FALL THRU */ case 1: a = *source; if (a >= 0x80 &&a < 0xC2) return false; if (a > 0xF4) return false; break;}return true; } int pg_utf_mblen(const unsigned char *s) {int len; if ((*s & 0x80) == 0) len = 1;else if ((*s & 0xe0) == 0xc0) len = 2;else if ((*s & 0xf0) == 0xe0) len = 3;else if((*s & 0xf8) == 0xf0) len = 4; #ifdef NOT_USEDelse if ((*s & 0xfc) == 0xf8) len = 5;else if ((*s & 0xfe) == 0xfc) len = 6; #endifelse len = 1;return len; } static bool pg_utf8_increment(unsigned char *charptr, int length) { unsigned char a; unsigned char bak[4]; bool success; memcpy(bak, charptr, length); switch (length) { default: /* reject lengths 5 and 6 for now */ return false; case 4: a = charptr[3]; if (a < 0xBF) { charptr[3]++; break; } charptr[3] = 0x80; /* FALL THRU */ case 3: a = charptr[2]; if (a < 0xBF) { charptr[2]++; break; } charptr[2] = 0x80; /* FALLTHRU */ case 2: a = charptr[1]; if ((*charptr == 0xed && a < 0x9F) || a < 0xBF) { charptr[1]++; break; } charptr[1] = 0x80; /* FALL THRU*/ case 1: a = *charptr; if (a == 0x7F || a == 0xDF || a == 0xEF || a == 0xF7) { memcpy(charptr, bak, length); return false; } charptr[0]++; break; } /* Check the result with pg_utf8_islegal as the last resort. 
*/ success = pg_utf8_islegal(charptr, length); if (!success) memcpy(charptr, bak, length); return success; } void uni2utf8(unsigned int unicode, unsigned char *utf8buf) { int i, len; if (unicode < 0x80) { len = 1; *utf8buf =0; } else if (unicode < 0x800) { len = 2; *utf8buf = 0xc0; } else if (unicode < 0x10000) { len = 3; *utf8buf = 0xe0;} else if (unicode < 0x110000) { len = 4; *utf8buf = 0xf0; } else { printf("Uunicode of of range: %x\n", unicode);exit(1);} for(i = len - 1 ; i > 0 ; i--) { utf8buf[i] = (0x80 | (unicode & 0x3f)); unicode >>= 6; } *utf8buf |= unicode; } unsigned int utf8tounicode(unsigned char *utf8buf) { unsigned int a = *utf8buf; if (a < 0x80) return a; if (a < 0xc0) return0xfffffff; if (a < 0xe0)return ((utf8buf[0] - 0xc0) << 6) + (utf8buf[1] - 0x80); if (a < 0xf0)return ((utf8buf[0]- 0xe0) << 12) + ((utf8buf[1] - 0x80) << 6) + utf8buf[2] - 0x80; if (a < 0xf8)return ((utf8buf[0] - 0xf0)<< 18) + ((utf8buf[1] - 0x80) << 12) + ((utf8buf[2] - 0x80) << 6) + utf8buf[3] - 0x80; return 0xfffffff; } static bool pg_generic_charinc(unsigned char *charptr, int len) { unsigned char *lastchar = (unsigned char *) (charptr + len - 1); unsigned char savelastchar = *lastchar; const char *const_charptr = (const char *)charptr; while (*lastchar < (unsigned char) 255) { (*lastchar)++; if (!pg_utf8_islegal(const_charptr, len)) // modified. continue; return true; } *lastchar = savelastchar; return false; } int scatf(char* buf, char* format, ...) { va_list args; int ret; va_start(args, format); ret = vsprintf(buf + strlen(buf), format, args); va_end(args); return ret; } // sanity test for euc-japan specific character incrementer. // // -v displays status for invalid source charcode. // -m displays status for the result that the product of new // incrementer is match to the one of the generic incrementer. // show all status lines when both -v and -m are specified. // // `euctest | grep FAILED' shows remaining glitches using new // incrementer. 
(2 lines) // // CAUTION: // `euctest' yields 190 lines. // `euctest -m' yields 17863 lines. // `euctest -m -v' yields 16843008 lines. // // Sample of output lines: // src => dest result - status // 7e => 7f successed - Match to generic inc // 7f => 7f FAILED - Match to generic inc // 8edf => a1a1 successed - Don't match to generic inc(8edf) // // successed/FAILED in result status shows the return value of // character increment function. Following description says that the // result of the new incrementer was/was'nt identical to the generic // incrementer. #include <stdio.h> #include <stdarg.h> #define SS2 0x8e /* single shift 2 (JIS0201) */ #define SS3 0x8f /* single shift 3 (JIS0212) */ #define HIGHBIT (0x80) #define IS_HIGHBIT_SET(ch) ((unsigned char)(ch) & HIGHBIT) typedef int bool; static int false = 0; static int true = 1; static bool pg_generic_charinc(unsigned char *charptr, int len); static bool pg_eucjp_increment(unsigned char *charptr, int length); static int pg_eucjp_verifier(const unsigned char *s, int len); void do_check(int len, unsigned char *buf, int dispinvalid, int dispmatch); int scatf(char* buf, char* format, ...); int main(int argc, char **argv) { unsigned int i, j, k; unsigned char buf[3]; int res; int dispinvalid = 0; int dispmatch= 0; for (i = 1 ; i < argc ; i++) {if (strcmp(argv[i], "-v") == 0) dispinvalid = 1;if (strcmp(argv[i], "-m") == 0) dispmatch= 1; } // single byte characters for (i = 0 ; i < 256 ; i++) {*buf = i;do_check(1, buf, dispinvalid, dispmatch); } // 2 byte characters for (i = 0 ; i < 256 ; i++) {for (j = 0 ; j < 256 ; j++) { *buf = i; buf[1] = j; do_check(2, buf,dispinvalid, dispmatch);} } // 3 byte characters for (i = 0 ; i < 256 ; i++) {for (j = 0 ; j < 256 ; j++) { for (k = 0 ; k < 256 ; k++) { *buf =i; buf[1] = j; buf[2] = k; do_check(3, buf, dispinvalid, dispmatch); } } } } void do_check(int len, unsigned char *buf, int dispinvalid, int dispmatch) { unsigned char buf2[3]; char outbuf[1024]; inti, src_is_valid, 
successed, gensuccessed, match; *outbuf = 0; src_is_valid = (pg_eucjp_verifier(buf, len) == len); if (!src_is_valid) {if (dispinvalid) { for (i = 0 ; i < len ; i++) scatf(outbuf, "%02x", buf[i]); strcat(outbuf, "- Src char is invalid."); puts(outbuf);}return; } memcpy(buf2, buf, len); for (i = 0 ; i < len ; i++)scatf(outbuf, "%02x", ((int)buf[i] & 0xff)); strcat(outbuf, " => "); successed = pg_eucjp_increment(buf, len); gensuccessed = pg_generic_charinc((char*)buf2, len); match = (memcmp(buf, buf2,len) == 0); if (!gensuccessed || !match || dispmatch) {for (i = 0 ; i < len ; i++) scatf(outbuf, "%02x", ((int)buf[i]& 0xff));scatf(outbuf, " %s - %s", (successed ? "successed" : "FAILED"), (match ? "Match to genericinc" : "Don't match to generic inc"));if (!match) { strcat(outbuf, "("); for (i = 0 ; i < len ; i++) scatf(outbuf,"%02x", ((int)buf2[i] & 0xff)); strcat(outbuf, ")");}puts(outbuf); } } static bool pg_eucjp_increment(unsigned char *charptr, int length) { unsigned char bak[3]; bool success; unsignedchar c1, c2; signed int i; memcpy(bak, charptr, length); c1 = *charptr; switch (c1) { caseSS2: /* JIS X 0201 */ if (length != 2) return false; c2 = charptr[1]; if (c2 > 0xde) charptr[0] = charptr[1] = 0xa1; else if (c2 < 0xa1) charptr[1] = 0xa1; else charptr[1]++; break; case SS3: /* JIS X 0212 */ if (length !=3) return false; for (i = 2 ; i > 1 ; i--) { c2 = charptr[i]; if (c2< 0xa1) { charptr[i] = 0xa1; return true; } else if (c2 < 0xfe) { charptr[i]++; break; } charptr[i] = 0xa1; } if (i == 0) /* Out of code region */ { memcpy(charptr, bak, length); return false; } break; default: if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? 
*/ { if (length != 2) return false; for (i = 1 ; i >= 0 ; i--) /* i must be signed */ { c2= charptr[i]; if (c2 < 0xa1) { charptr[i] = 0xa1; return true; } else if (c2 < 0xfe) { charptr[i]++; break; } charptr[i] = 0xa1; } if (i < 0) /* Out of 2 byte code region */ { memcpy(charptr,bak, length); return false; } } else { /* ASCII */ if (c1 > 0x7e) return false; (*charptr)++; } } /* Check the result with pg_eucjp_verifier as the last resort. */ success = (pg_eucjp_verifier(charptr, length)== length); if (!success) memcpy(charptr, bak, length); return success; } #define IS_EUC_RANGE_VALID(c) ((c) >= 0xa1 && (c) <= 0xfe) static int pg_eucjp_verifier(const unsigned char *s, int len) { int l; unsigned char c1,c2; c1 = *s++; switch (c1){case SS2: /* JIS X 0201 */ l = 2; if (l > len) return -1; c2 = *s++; if (c2 < 0xa1 ||c2 > 0xdf) return -1; break; case SS3: /* JIS X 0212 */ l = 3; if (l > len) return -1; c2 = *s++; if (!IS_EUC_RANGE_VALID(c2)) return -1; c2 = *s++; if (!IS_EUC_RANGE_VALID(c2)) return -1; break; default: if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */ { l = 2; if (l > len) return -1; if (!IS_EUC_RANGE_VALID(c1)) return -1; c2 = *s++; if (!IS_EUC_RANGE_VALID(c2)) return -1; } else /* must be ASCII */ { l = 1; } break;} return l; } static bool pg_generic_charinc(unsigned char *charptr, int len) { unsigned char *lastchar = (unsigned char *) (charptr + len - 1); unsigned char savelastchar = *lastchar; const char *const_charptr = (const char *)charptr; while (*lastchar < (unsigned char) 255) { (*lastchar)++; if (pg_eucjp_verifier(const_charptr, len) != len) // modified. continue; return true; } *lastchar = savelastchar; return false; } int scatf(char* buf, char* format, ...) { va_list args; int ret; va_start(args, format); ret = vsprintf(buf + strlen(buf), format, args); va_end(args); return ret; }
pgsql-hackers by date: