[v9.2] make_greater_string() does not return a string in some cases - Mailing list pgsql-hackers

From Kyotaro HORIGUCHI
Subject [v9.2] make_greater_string() does not return a string in some cases
Date
Msg-id 20110914.111320.18404009.horiguchi.kyotaro@oss.ntt.co.jp
Whole thread Raw
In response to Re: make_greater_string() does not return a string in some cases  (Kyotaro HORIGUCHI <horiguchi.kyotaro@oss.ntt.co.jp>)
Responses Re: [v9.2] make_greater_string() does not return a string in some cases
Re: [v9.2] make_greater_string() does not return a string in some cases
List pgsql-hackers
This is a rebased version of the patch `Allow encoding specific character
incrementer' (https://commitfest.postgresql.org/action/patch_view?id=602).

In addition to the patch, a sanity-check program for the new increment
functions pg_generic_charinc and pg_utf8_increment is attached.
-- 
Kyotaro Horiguchi
NTT Open Source Software Center
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index 5d999e6..b7f1922 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -5652,6 +5652,18 @@ pattern_selectivity(Const *patt, Pattern_Type ptype)/*
+ * This is the "character increment" function for bytea used in
+ * make_greater_string(); it has the same interface as pg_wchar_tbl.charinc.
+ */
+static bool byte_increment(unsigned char *ptr, int len)
+{
+    if (*ptr >= 255) return false;
+
+    (*ptr)++;
+    return true;
+}
+
+/* * Try to generate a string greater than the given string or any * string it is a prefix of.  If successful, return
apalloc'd string * in the form of a Const node; else return NULL.
 
@@ -5690,6 +5702,7 @@ make_greater_string(const Const *str_const, FmgrInfo *ltproc, Oid collation)    int
len;   Datum        cmpstr;    text       *cmptxt = NULL;
 
+    character_incrementer charincfunc;    /*     * Get a modifiable copy of the prefix string in C-string format, and
set
@@ -5751,27 +5764,38 @@ make_greater_string(const Const *str_const, FmgrInfo *ltproc, Oid collation)        }    }
+    if (datatype != BYTEAOID)
+        charincfunc = pg_database_encoding_character_incrementer();
+    else
+        charincfunc = &byte_increment;
+    while (len > 0)    {
-        unsigned char *lastchar = (unsigned char *) (workstr + len - 1);
-        unsigned char savelastchar = *lastchar;
+        int charlen;
+        unsigned char *lastchar;
+        unsigned char savelastbyte;
+        Const       *workstr_const;
+        
+        if (datatype == BYTEAOID)
+            charlen = 1;
+        else
+            charlen = len - pg_mbcliplen(workstr, len, len - 1);
+
+        lastchar = (unsigned char *) (workstr + len - charlen);        /*
-         * Try to generate a larger string by incrementing the last byte.
+         * savelastbyte has meaning only for datatype == BYTEAOID         */
-        while (*lastchar < (unsigned char) 255)
-        {
-            Const       *workstr_const;
+        savelastbyte = *lastchar;
-            (*lastchar)++;
+        /*
+         * Try to generate a larger string by incrementing the last byte or
+         * character.
+         */
+        if (charincfunc(lastchar, charlen)) {            if (datatype != BYTEAOID)
-            {
-                /* do not generate invalid encoding sequences */
-                if (!pg_verifymbstr(workstr, len, true))
-                    continue;                workstr_const = string_to_const(workstr, datatype);
-            }            else                workstr_const = string_to_bytea_const(workstr, len);
@@ -5786,26 +5810,17 @@ make_greater_string(const Const *str_const, FmgrInfo *ltproc, Oid collation)
pfree(workstr);               return workstr_const;            }
 
-
+                        /* No good, release unusable value and try again */
pfree(DatumGetPointer(workstr_const->constvalue));           pfree(workstr_const);        }
 
-        /* restore last byte so we don't confuse pg_mbcliplen */
-        *lastchar = savelastchar;
-        /*
-         * Truncate off the last character, which might be more than 1 byte,
-         * depending on the character encoding.
+         * Truncate off the last character or restore last byte for BYTEA.         */
-        if (datatype != BYTEAOID && pg_database_encoding_max_length() > 1)
-            len = pg_mbcliplen(workstr, len, len - 1);
-        else
-            len -= 1;
-
-        if (datatype != BYTEAOID)
-            workstr[len] = '\0';
+        len -= charlen;
+        workstr[len] = (datatype != BYTEAOID ? '\0' : savelastbyte);    }    /* Failed... */
diff --git a/src/backend/utils/mb/wchar.c b/src/backend/utils/mb/wchar.c
index f23732f..00b3e2a 100644
--- a/src/backend/utils/mb/wchar.c
+++ b/src/backend/utils/mb/wchar.c
@@ -1,3 +1,4 @@
+/* * conversion functions between pg_wchar and multibyte streams. * Tatsuo Ishii
@@ -1336,53 +1337,254 @@ pg_utf8_islegal(const unsigned char *source, int length)/*
*-------------------------------------------------------------------
+ * character incrementer
+ *
+ * These functions accept "charptr", a pointer to the first byte of a
+ * maybe-multibyte character. They try to `increment' the character and return
+ * true on success.  If a function returns false, the character must be left
+ * untouched.  These functions must be implemented in correspondence with the
+ * verifiers; in other words, whenever one returns true, the rewritten
+ * character must pass the check by the matching pg_*_verifier().  Returning
+ * the result of the corresponding pg_*_verifier() is a final safeguard
+ * against such an inconsistency if something goes wrong.
+ * -------------------------------------------------------------------
+ */
+
+#ifndef FRONTEND
+static bool pg_generic_charinc(unsigned char *charptr, int len)
+{
+     unsigned char *lastchar = (unsigned char *) (charptr + len - 1);
+     unsigned char savelastchar = *lastchar;
+     const char *const_charptr = (const char *)charptr;
+ 
+     while (*lastchar < (unsigned char) 255)
+     {
+         (*lastchar)++;
+         if (!pg_verifymbstr(const_charptr, len, true))
+             continue;
+         return true;
+     }
+ 
+     *lastchar = savelastchar;
+     return false;
+}
+ 
+static bool pg_utf8_increment(unsigned char *charptr, int length)
+{
+     unsigned char a;
+     unsigned char bak[4];
+     bool success;
+ 
+     memcpy(bak, charptr, length);
+     switch (length)
+     {
+         default:
+             /* reject lengths 5 and 6 for now */
+             return false;
+         case 4:
+             a = charptr[3];
+             if (a < 0xBF)
+             {
+                 charptr[3]++;
+                 break;
+             }
+             charptr[3] = 0x80;
+             /* FALL THRU */
+         case 3:
+             a = charptr[2];
+             if (a < 0xBF)
+             {
+                 charptr[2]++;
+                 break;
+             }
+             charptr[2] = 0x80;
+             /* FALL THRU */
+         case 2:
+             a = charptr[1];
+             if ((*charptr == 0xed && a < 0x9F) || a < 0xBF)
+             {
+                 charptr[1]++;
+                 break;
+             }
+             charptr[1] = 0x80;
+             /* FALL THRU */
+         case 1:
+             a = *charptr;
+             if (a == 0x7F || a == 0xDF || a == 0xEF || a == 0xF7) {
+                memcpy(charptr, bak, length);
+                 return false;
+             }
+             charptr[0]++;
+             break;
+     }
+     
+     /* Check the result with pg_utf8_islegal as the last resort. */
+     success = pg_utf8_islegal(charptr, length);
+     if (!success)
+         memcpy(charptr, bak, length);
+ 
+     return success;
+}
+ 
+static bool pg_eucjp_increment(unsigned char *charptr, int length) {
+     unsigned char bak[3];
+     bool success;
+     unsigned char c1, c2;
+     signed int i;
+ 
+     memcpy(bak, charptr, length);
+ 
+     c1 = *charptr;
+ 
+     switch (c1)
+     {
+         case SS2:    /* JIS X 0201 */
+             if (length != 2) return false;
+ 
+             c2 = charptr[1];
+ 
+             if (c2 > 0xde)
+                 charptr[0] = charptr[1] = 0xa1;
+             else if (c2 < 0xa1)
+                 charptr[1] = 0xa1;
+             else
+                 charptr[1]++;
+ 
+             break;
+ 
+         case SS3:    /* JIS X 0212 */
+             if (length != 3) return false;
+ 
+             for (i = 2 ; i > 1 ; i--)
+             {
+                 c2 = charptr[i];
+                 if (c2 < 0xa1)
+                 {
+                     charptr[i] = 0xa1;
+                     return true;
+                 }
+                 else if (c2 < 0xfe)
+                 {
+                     charptr[i]++;
+                     break;
+                 }
+                 charptr[i] = 0xa1;
+             }
+ 
+ 
+             if (i == 0)      /* Out of code region */
+             {
+                 memcpy(charptr, bak, length);
+                 return false;
+             }
+             
+             break;
+ 
+         default:
+             if (IS_HIGHBIT_SET(c1))     /* JIS X 0208? */
+             {
+                 if (length != 2) return false;
+           
+                 for (i = 1 ; i >= 0 ; i--)    /* i must be signed */
+                 {
+                     c2 = charptr[i];
+                     if (c2 < 0xa1)
+                     {
+                         charptr[i] = 0xa1;
+                         return true;
+                     }
+                     else if (c2 < 0xfe)
+                     {
+                         charptr[i]++;
+                         break;
+                     }
+                     charptr[i] = 0xa1;
+                 }
+           
+                 if (i < 0)    /*  Out of 2 byte code region */
+                 {
+                     memcpy(charptr, bak, length);
+                     return false;
+                 }
+             }
+             else
+             {    /* ASCII */
+                 if (c1 > 0x7e)
+                     return false;
+                 (*charptr)++;
+             }
+     }
+ 
+ 
+     /* Check the result with pg_eucjp_verifier as the last resort. */
+     success = (pg_eucjp_verifier(charptr, length) == length);
+     if (!success)
+         memcpy(charptr, bak, length);    
+   
+     return success;
+}
+#else
+/*
+ * Character increment functions are not available in the frontend. Abort on
+ * call to prevent misuse.
+ */
+static bool pg_generic_charinc(unsigned char *charptr, int len) {
+    fputs(_("Character incrementer cannot be used in frontend.\n"), stderr);
+    abort();
+}
+#define pg_utf8_increment pg_generic_charinc
+#define pg_eucjp_increment pg_generic_charinc
+#endif
+
+/*
+ *------------------------------------------------------------------- * encoding info table * XXX must be sorted by
thesame order as enum pg_enc (in mb/pg_wchar.h) *-------------------------------------------------------------------
*/pg_wchar_tblpg_wchar_table[] = {
 
-    {pg_ascii2wchar_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifier, 1},    /* PG_SQL_ASCII */
-    {pg_eucjp2wchar_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifier, 3},    /* PG_EUC_JP */
-    {pg_euccn2wchar_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifier, 2},    /* PG_EUC_CN */
-    {pg_euckr2wchar_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifier, 3},    /* PG_EUC_KR */
-    {pg_euctw2wchar_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifier, 4},    /* PG_EUC_TW */
-    {pg_eucjp2wchar_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifier, 3},    /* PG_EUC_JIS_2004 */
-    {pg_utf2wchar_with_len, pg_utf_mblen, pg_utf_dsplen, pg_utf8_verifier, 4},    /* PG_UTF8 */
-    {pg_mule2wchar_with_len, pg_mule_mblen, pg_mule_dsplen, pg_mule_verifier, 4},        /* PG_MULE_INTERNAL */
-    {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},        /* PG_LATIN1 */
-    {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},        /* PG_LATIN2 */
-    {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},        /* PG_LATIN3 */
-    {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},        /* PG_LATIN4 */
-    {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},        /* PG_LATIN5 */
-    {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},        /* PG_LATIN6 */
-    {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},        /* PG_LATIN7 */
-    {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},        /* PG_LATIN8 */
-    {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},        /* PG_LATIN9 */
-    {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},        /* PG_LATIN10 */
-    {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},        /* PG_WIN1256 */
-    {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},        /* PG_WIN1258 */
-    {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},        /* PG_WIN866 */
-    {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},        /* PG_WIN874 */
-    {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},        /* PG_KOI8R */
-    {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},        /* PG_WIN1251 */
-    {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},        /* PG_WIN1252 */
-    {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},        /* ISO-8859-5 */
-    {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},        /* ISO-8859-6 */
-    {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},        /* ISO-8859-7 */
-    {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},        /* ISO-8859-8 */
-    {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},        /* PG_WIN1250 */
-    {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},        /* PG_WIN1253 */
-    {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},        /* PG_WIN1254 */
-    {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},        /* PG_WIN1255 */
-    {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},        /* PG_WIN1257 */
-    {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},        /* PG_KOI8U */
-    {0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifier, 2},    /* PG_SJIS */
-    {0, pg_big5_mblen, pg_big5_dsplen, pg_big5_verifier, 2},    /* PG_BIG5 */
-    {0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifier, 2},        /* PG_GBK */
-    {0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifier, 2},        /* PG_UHC */
-    {0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifier, 4},    /* PG_GB18030 */
-    {0, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifier, 3}, /* PG_JOHAB */
-    {0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifier, 2}        /* PG_SHIFT_JIS_2004 */
+    {pg_ascii2wchar_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_generic_charinc, pg_ascii_verifier, 1},    /*
PG_SQL_ASCII*/
 
+    {pg_eucjp2wchar_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_increment, pg_eucjp_verifier, 3},    /*
PG_EUC_JP*/
 
+    {pg_euccn2wchar_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_generic_charinc, pg_euccn_verifier, 2},    /*
PG_EUC_CN*/
 
+    {pg_euckr2wchar_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_generic_charinc, pg_euckr_verifier, 3},    /*
PG_EUC_KR*/
 
+    {pg_euctw2wchar_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_generic_charinc, pg_euctw_verifier, 4},    /*
PG_EUC_TW*/
 
+    {pg_eucjp2wchar_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_increment, pg_eucjp_verifier, 3},    /*
PG_EUC_JIS_2004*/
 
+    {pg_utf2wchar_with_len, pg_utf_mblen, pg_utf_dsplen, pg_utf8_increment, pg_utf8_verifier, 4},    /* PG_UTF8 */
+    {pg_mule2wchar_with_len, pg_mule_mblen, pg_mule_dsplen, pg_generic_charinc, pg_mule_verifier, 4},        /*
PG_MULE_INTERNAL*/
 
+    {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1},
/*PG_LATIN1 */
 
+    {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1},
/*PG_LATIN2 */
 
+    {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1},
/*PG_LATIN3 */
 
+    {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1},
/*PG_LATIN4 */
 
+    {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1},
/*PG_LATIN5 */
 
+    {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1},
/*PG_LATIN6 */
 
+    {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1},
/*PG_LATIN7 */
 
+    {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1},
/*PG_LATIN8 */
 
+    {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1},
/*PG_LATIN9 */
 
+    {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1},
/*PG_LATIN10 */
 
+    {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1},
/*PG_WIN1256 */
 
+    {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1},
/*PG_WIN1258 */
 
+    {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1},
/*PG_WIN866 */
 
+    {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1},
/*PG_WIN874 */
 
+    {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1},
/*PG_KOI8R */
 
+    {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1},
/*PG_WIN1251 */
 
+    {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1},
/*PG_WIN1252 */
 
+    {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1},
/*ISO-8859-5 */
 
+    {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1},
/*ISO-8859-6 */
 
+    {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1},
/*ISO-8859-7 */ 
+    {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1},
/*ISO-8859-8 */
 
+    {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1},
/*PG_WIN1250 */
 
+    {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1},
/*PG_WIN1253 */
 
+    {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1},
/*PG_WIN1254 */
 
+    {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1},
/*PG_WIN1255 */
 
+    {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1},
/*PG_WIN1257 */
 
+    {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_generic_charinc, pg_latin1_verifier, 1},
/*PG_KOI8U */
 
+    {0, pg_sjis_mblen, pg_sjis_dsplen, pg_generic_charinc, pg_sjis_verifier, 2},    /* PG_SJIS */
+    {0, pg_big5_mblen, pg_big5_dsplen, pg_generic_charinc, pg_big5_verifier, 2},    /* PG_BIG5 */
+    {0, pg_gbk_mblen, pg_gbk_dsplen, pg_generic_charinc, pg_gbk_verifier, 2},        /* PG_GBK */
+    {0, pg_uhc_mblen, pg_uhc_dsplen, pg_generic_charinc, pg_uhc_verifier, 2},        /* PG_UHC */
+    {0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_generic_charinc, pg_gb18030_verifier, 4},    /* PG_GB18030 */
+    {0, pg_johab_mblen, pg_johab_dsplen, pg_generic_charinc, pg_johab_verifier, 3}, /* PG_JOHAB */
+    {0, pg_sjis_mblen, pg_sjis_dsplen, pg_generic_charinc, pg_sjis_verifier, 2}        /* PG_SHIFT_JIS_2004 */};/*
returnsthe byte length of a word for mule internal code */
 
@@ -1459,6 +1661,15 @@ pg_database_encoding_max_length(void)}/*
+ * give the character incrementer for the encoding for the current database
+ */
+character_incrementer
+pg_database_encoding_character_incrementer(void)
+{
+    return pg_wchar_table[GetDatabaseEncoding()].charinc;
+}
+
+/* * Verify mbstr to make sure that it is validly encoded in the current * database encoding.  Otherwise same as
pg_verify_mbstr().*/
 
diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h
index 826c7af..356703a 100644
--- a/src/include/mb/pg_wchar.h
+++ b/src/include/mb/pg_wchar.h
@@ -284,6 +284,8 @@ typedef int (*mblen_converter) (const unsigned char *mbstr);typedef int (*mbdisplaylen_converter)
(constunsigned char *mbstr);
 
+typedef bool (*character_incrementer) (unsigned char *mbstr, int len);
+typedef int (*mbverifier) (const unsigned char *mbstr, int len);typedef struct
@@ -292,6 +294,7 @@ typedef struct                                                         * string to a wchar */
mblen_convertermblen;        /* get byte length of a char */    mbdisplaylen_converter dsplen;        /* get display
widthof a char */
 
+    character_incrementer charinc;  /* Character code incrementer if not null */    mbverifier    mbverify;        /*
verifymultibyte sequence */    int            maxmblen;        /* max bytes for a char in this encoding */}
pg_wchar_tbl;
@@ -389,6 +392,7 @@ extern int pg_encoding_mbcliplen(int encoding, const char *mbstr,extern int
pg_mbcharcliplen(constchar *mbstr, int len, int imit);extern int    pg_encoding_max_length(int encoding);extern int
pg_database_encoding_max_length(void);
+extern character_incrementer pg_database_encoding_character_incrementer(void);extern int    PrepareClientEncoding(int
encoding);externint    SetClientEncoding(int encoding); 
// sanity test for utf specific character incrementer.
//
// -v also displays the status for source code points that are not valid.
// -m also displays the status for results where the new incrementer
//    matches the generic incrementer.
// All status lines are shown when both -v and -m are specified.
//
// `utftest | grep FAILED' shows remaining glitches using new
//    incrementer. (4 lines)
//
// CAUTION: this program produces a very large amount of output.
//
// `utftest' yields 17375 lines: the cases rescued by the new function
//           plus the remaining glitches.
// `utftest -m' yields 1112064 lines.
//
//
// Sample of status lines:
// src char  src utf8   dst utf8  dest char result status
// 000d7bf => ed9ebf   => ed9f80  (000d7c0) successed - Don't match to generic inc(000d7bf)
// 000d7ff => ed9fbf   => ed9fbf  (000d7ff) FAILED - Match to generic inc
// 000d800 => eda080  Source not valid utf8
//
// successed/FAILED in the result status shows the return value of the
// character increment function. The description that follows tells whether
// the result of the new incrementer was or wasn't identical to that of the
// generic incrementer.

#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef int bool;
static int true = 1;
static int false = 0;

static bool pg_utf8_increment(unsigned char *mbstr, int length);
static bool pg_generic_charinc(unsigned char *charptr, int len);
void uni2utf8(unsigned int unicode, unsigned char *utf8buf);
unsigned int utf8tounicode(unsigned char *utf8buf);
int scatf(char* buf, char* format, ...);

/*
 * Test driver for pg_utf8_increment().
 *
 * Every code point from 0 up to 0x10FFFF is encoded as UTF-8 twice; one copy
 * is handed to pg_utf8_increment() and the other to pg_generic_charinc().
 * A status line is assembled in outbuf for each code point and printed when
 * the generic incrementer failed, when the two results differ, or always
 * when -m is given.
 *
 *   -v  also print the line for code points whose encoding is rejected by
 *       pg_utf8_islegal()
 *   -m  also print the lines where both incrementers agree
 */
int main(int argc, char** argv)
{
    unsigned char buf[4], buf2[4];
    char outbuf[1024];
    unsigned int i;
    int argi;
    int dispinvalid = 0;
    int dispmatch = 0;

    /* These helpers are defined further down and have no prototype above. */
    bool pg_utf8_islegal(const unsigned char *source, int length);
    int pg_utf_mblen(const unsigned char *s);

    for (argi = 1; argi < argc; argi++)
    {
        if (strcmp(argv[argi], "-v") == 0)
            dispinvalid = 1;
        if (strcmp(argv[argi], "-m") == 0)
            dispmatch = 1;
    }

    /*
     * Stop at 0x110000: uni2utf8() treats anything at or above that value as
     * out of range and terminates the program with an error.
     */
    for (i = 0; i < 0x110000; i++)
    {
        bool prechk, successed, gensuccess, match;
        int len;
        int j;

        uni2utf8(i, buf);
        uni2utf8(i, buf2);
        *outbuf = 0;

        /* source code point and its UTF-8 bytes, padded to 4 byte columns */
        scatf(outbuf, "%07x => ", i);
        len = pg_utf_mblen(buf);
        j = 0;
        while (j < len)
            scatf(outbuf, "%02x", buf[j++]);
        while (j < 4)
        {
            scatf(outbuf, "  ");
            j++;
        }

        prechk = pg_utf8_islegal(buf, len);
        if (!prechk)
        {
            scatf(outbuf, "Source not valid utf8");
            if (dispinvalid)
                puts(outbuf);
            continue;
        }

        /* result of the encoding-specific incrementer */
        successed = pg_utf8_increment(buf, len);
        scatf(outbuf, " => ");
        j = 0;
        while (j < len)
            scatf(outbuf, "%02x", buf[j++]);
        while (j < 4)
        {
            scatf(outbuf, "  ");
            j++;
        }

        /* reference result from the generic incrementer */
        gensuccess = pg_generic_charinc(buf2, len);
        match = (memcmp(buf, buf2, len) == 0);

        if (!gensuccess || !match || dispmatch)
        {
            scatf(outbuf,
                  "(%07x) %s - %s",
                  utf8tounicode(buf),
                  (successed ? "successed" : "FAILED"),
                  (match ? "Match to generic inc" : "Don't match to generic inc"));
            if (!match)
            {
                scatf(outbuf, "(%07x)", utf8tounicode(buf2));
            }
            puts(outbuf);
        }
    }

    return 0;
}

/*
 * pg_utf8_islegal - check one UTF-8 character for well-formedness.
 *
 * source points at the first byte of the character and length is the number
 * of bytes it is supposed to occupy.  Returns true when the bytes form a
 * legal sequence of exactly that length, false otherwise.  Lengths outside
 * 1..4 are always rejected.
 */
bool
pg_utf8_islegal(const unsigned char *source, int length)
{
    unsigned char a;

    switch (length)
    {
        default:
            /* reject lengths 5 and 6 for now */
            return false;
        case 4:
            a = source[3];
            if (a < 0x80 || a > 0xBF)
                return false;
            /* FALL THRU */
        case 3:
            a = source[2];
            if (a < 0x80 || a > 0xBF)
                return false;
            /* FALL THRU */
        case 2:
            /* the allowed range of the second byte depends on the lead byte */
            a = source[1];
            switch (*source)
            {
                case 0xE0:
                    if (a < 0xA0 || a > 0xBF)
                        return false;
                    break;
                case 0xED:
                    if (a < 0x80 || a > 0x9F)
                        return false;
                    break;
                case 0xF0:
                    if (a < 0x90 || a > 0xBF)
                        return false;
                    break;
                case 0xF4:
                    if (a < 0x80 || a > 0x8F)
                        return false;
                    break;
                default:
                    if (a < 0x80 || a > 0xBF)
                        return false;
                    break;
            }
            /* FALL THRU */
        case 1:
            /* lead byte: 0x80..0xC1 and anything above 0xF4 are illegal */
            a = *source;
            if (a >= 0x80 && a < 0xC2)
                return false;
            if (a > 0xF4)
                return false;
            break;
    }
    return true;
}

/*
 * pg_utf_mblen - byte length of the UTF-8 character starting at s.
 *
 * The length is derived from the lead byte alone.  Bytes that do not match
 * any of the recognised lead-byte patterns are reported as length 1.  The
 * 5- and 6-byte forms are only recognised when NOT_USED is defined.
 */
int
pg_utf_mblen(const unsigned char *s)
{
    int            len;

    if ((*s & 0x80) == 0)
        len = 1;
    else if ((*s & 0xe0) == 0xc0)
        len = 2;
    else if ((*s & 0xf0) == 0xe0)
        len = 3;
    else if ((*s & 0xf8) == 0xf0)
        len = 4;
#ifdef NOT_USED
    else if ((*s & 0xfc) == 0xf8)
        len = 5;
    else if ((*s & 0xfe) == 0xfc)
        len = 6;
#endif
    else
        len = 1;
    return len;
}


/*
 * pg_utf8_increment - advance one UTF-8 character to the next one of the
 * same byte length.
 *
 * charptr points at the first byte of the character, length is its size in
 * bytes.  The bytes are treated as a counter: the last byte is incremented,
 * and on overflow it is reset to 0x80 and the carry moves one byte to the
 * left.  Returns true when the character was replaced by a legal successor;
 * returns false, with the character left untouched, when no successor of the
 * same length exists or the result does not pass pg_utf8_islegal().
 */
static bool pg_utf8_increment(unsigned char *charptr, int length)
{
    unsigned char a;
    unsigned char second_max;
    unsigned char bak[4];
    bool success;

    /*
     * Validate the length before anything is copied into bak[]; lengths 5
     * and 6 are rejected for now, and so is anything that is not positive.
     */
    if (length < 1 || length > 4)
        return false;

    /* Keep a copy so the character can be restored on failure. */
    memcpy(bak, charptr, length);

    switch (length)
    {
        default:
            /* not reachable, the length was validated above */
            return false;
        case 4:
            a = charptr[3];
            if (a < 0xBF)
            {
                charptr[3]++;
                break;
            }
            charptr[3] = 0x80;
            /* FALL THRU */
        case 3:
            a = charptr[2];
            if (a < 0xBF)
            {
                charptr[2]++;
                break;
            }
            charptr[2] = 0x80;
            /* FALL THRU */
        case 2:
            /*
             * The upper bound of the second byte depends on the lead byte
             * (the same bounds pg_utf8_islegal() applies): 0xED stops at
             * 0x9F and 0xF4 stops at 0x8F.  Past the bound the carry must
             * go into the lead byte instead.
             */
            a = charptr[1];
            switch (*charptr)
            {
                case 0xED:
                    second_max = 0x9F;
                    break;
                case 0xF4:
                    second_max = 0x8F;
                    break;
                default:
                    second_max = 0xBF;
                    break;
            }
            if (a < second_max)
            {
                charptr[1]++;
                break;
            }
            charptr[1] = 0x80;
            /* FALL THRU */
        case 1:
            /* Largest lead byte for each length: nothing left to carry into. */
            a = *charptr;
            if (a == 0x7F || a == 0xDF || a == 0xEF || a == 0xF4)
            {
                memcpy(charptr, bak, length);
                return false;
            }
            charptr[0]++;
            break;
    }

    /* Check the result with pg_utf8_islegal as the last resort. */
    success = pg_utf8_islegal(charptr, length);
    if (!success)
        memcpy(charptr, bak, length);

    return success;
}

/*
 * uni2utf8 - encode a code point as UTF-8 into utf8buf.
 *
 * Writes 1 to 4 bytes depending on the magnitude of unicode; utf8buf must
 * have room for 4 bytes.  No check for surrogates is made, so values in
 * 0xD800..0xDFFF are encoded as ordinary 3-byte sequences.  Values of
 * 0x110000 and above are reported on stderr and terminate the program.
 */
void uni2utf8(unsigned int unicode, unsigned char *utf8buf)
{
    int i, len;

    /* pick the sequence length and the marker bits of the lead byte */
    if (unicode < 0x80)
    {
        len = 1;
        *utf8buf = 0;
    }
    else if (unicode < 0x800)
    {
        len = 2;
        *utf8buf = 0xc0;
    }
    else if (unicode < 0x10000)
    {
        len = 3;
        *utf8buf = 0xe0;
    }
    else if (unicode < 0x110000)
    {
        len = 4;
        *utf8buf = 0xf0;
    }
    else
    {
        fprintf(stderr, "Unicode out of range: %x\n", unicode);
        exit(EXIT_FAILURE);
    }

    /* fill the continuation bytes from the end, 6 payload bits each */
    for (i = len - 1; i > 0; i--)
    {
        utf8buf[i] = (unsigned char) (0x80 | (unicode & 0x3f));
        unicode >>= 6;
    }

    /* whatever is left belongs in the lead byte */
    *utf8buf |= (unsigned char) unicode;
}

/*
 * utf8tounicode - decode the UTF-8 character at utf8buf to its code point.
 *
 * The sequence length is taken from the lead byte; the continuation bytes
 * are not validated.  Returns 0xfffffff when the lead byte is a
 * continuation byte (0x80..0xBF) or is 0xF8 or above.
 */
unsigned int utf8tounicode(unsigned char *utf8buf)
{
    unsigned int a = *utf8buf;

    if (a < 0x80)
        return a;

    if (a < 0xc0)
        return 0xfffffff;

    if (a < 0xe0)
        return
            ((utf8buf[0] - 0xc0) << 6) +
            (utf8buf[1] - 0x80);

    if (a < 0xf0)
        return
            ((utf8buf[0] - 0xe0) << 12) +
            ((utf8buf[1] - 0x80) << 6) +
            utf8buf[2] - 0x80;

    if (a < 0xf8)
        return
            ((utf8buf[0] - 0xf0) << 18) +
            ((utf8buf[1] - 0x80) << 12) +
            ((utf8buf[2] - 0x80) << 6) +
            utf8buf[3] - 0x80;

    return 0xfffffff;
}


/*
 * pg_generic_charinc - reference incrementer used for comparison.
 *
 * Only the last byte of the len-byte character at charptr is changed: it is
 * incremented step by step until the character passes pg_utf8_islegal()
 * (this test copy uses that check in place of pg_verifymbstr()).  Returns
 * true as soon as a legal value is found.  If the byte reaches 255 without
 * that happening, the original last byte is restored and false is returned.
 */
static bool pg_generic_charinc(unsigned char *charptr, int len)
{
    unsigned char *lastchar = charptr + len - 1;
    unsigned char savelastchar = *lastchar;

    while (*lastchar < (unsigned char) 255)
    {
        (*lastchar)++;
        /* keep going while the candidate is not a legal character */
        if (!pg_utf8_islegal(charptr, len))
            continue;
        return true;
    }

    /* ran out of byte values: undo the change */
    *lastchar = savelastchar;
    return false;
}

/*
 * scatf - printf-style append to a NUL-terminated string.
 *
 * Formats the arguments according to format and writes the result at the
 * current end of buf.  Returns whatever vsprintf() returns for the appended
 * part.  No bounds checking is done; the caller has to provide a buffer
 * that is large enough.
 */
int scatf(char* buf, char* format, ...)
{
    char *tail = buf + strlen(buf);
    va_list ap;
    int written;

    va_start(ap, format);
    written = vsprintf(tail, format, ap);
    va_end(ap);

    return written;
}
// sanity test for euc-japan specific character incrementer.
//
// -v also displays the status for source character codes that are not valid.
// -m also displays the status for results where the new incrementer
//    matches the generic incrementer.
// All status lines are shown when both -v and -m are specified.
//
// `euctest | grep FAILED' shows remaining glitches using new
//    incrementer. (2 lines)
//
// CAUTION:
// `euctest' yields 190 lines.
// `euctest -m' yields 17863 lines.
// `euctest -m -v' yields 16843008 lines.
// 
// Sample of output lines:
// src => dest result - status
// 7e => 7f successed - Match to generic inc
// 7f => 7f FAILED - Match to generic inc
// 8edf => a1a1 successed - Don't match to generic inc(8edf)
//
// successed/FAILED in the result status shows the return value of the
// character increment function. The description that follows tells whether
// the result of the new incrementer was or wasn't identical to that of the
// generic incrementer.

#include <stdio.h>
#include <stdarg.h>

#define SS2 0x8e                /* single shift 2 (JIS0201) */
#define SS3 0x8f                /* single shift 3 (JIS0212) */
#define HIGHBIT                    (0x80)
#define IS_HIGHBIT_SET(ch)        ((unsigned char)(ch) & HIGHBIT)

typedef int bool;
static int false = 0;
static int true = 1;

static bool pg_generic_charinc(unsigned char *charptr, int len);
static bool pg_eucjp_increment(unsigned char *charptr, int length);
static int  pg_eucjp_verifier(const unsigned char *s, int len);
void do_check(int len, unsigned char *buf, int dispinvalid, int dispmatch);
int scatf(char* buf, char* format, ...);

/*
 * Test driver for pg_eucjp_increment().
 *
 * Feeds every possible 1-, 2- and 3-byte sequence to do_check(), which
 * decides what to print for each of them.
 *
 *   -v  passed on to do_check() as dispinvalid
 *   -m  passed on to do_check() as dispmatch
 */
int main(int argc, char **argv)
{
    unsigned int i, j, k;
    unsigned char buf[3];
    int argi;
    int dispinvalid = 0;
    int dispmatch = 0;

    for (argi = 1; argi < argc; argi++)
    {
        if (strcmp(argv[argi], "-v") == 0)
            dispinvalid = 1;
        if (strcmp(argv[argi], "-m") == 0)
            dispmatch = 1;
    }

    /*
     * do_check() may modify buf, so every byte is written again before
     * each call.
     */

    // single byte characters
    for (i = 0; i < 256; i++)
    {
        *buf = i;
        do_check(1, buf, dispinvalid, dispmatch);
    }

    // 2 byte characters
    for (i = 0; i < 256; i++)
    {
        for (j = 0; j < 256; j++)
        {
            *buf = i;
            buf[1] = j;
            do_check(2, buf, dispinvalid, dispmatch);
        }
    }

    // 3 byte characters
    for (i = 0; i < 256; i++)
    {
        for (j = 0; j < 256; j++)
        {
            for (k = 0; k < 256; k++)
            {
                *buf = i;
                buf[1] = j;
                buf[2] = k;
                do_check(3, buf, dispinvalid, dispmatch);
            }
        }
    }

    return 0;
}

/*
 * Run both incrementers on one byte sequence and print a report line.
 *
 * len          number of bytes in buf (1..3)
 * buf          sequence to test; overwritten by pg_eucjp_increment()
 * dispinvalid  non-zero: also print sequences the verifier rejects
 * dispmatch    non-zero: also print lines where the generic incrementer
 *              succeeded and produced the same bytes
 *
 * Report format: "<src hex> => <dest hex> <successed|FAILED> - <comparison>",
 * followed by the generic incrementer's bytes in parentheses when the two
 * results differ.
 */
void do_check(int len, unsigned char *buf, int dispinvalid, int dispmatch)
{
    unsigned char buf2[3];
    char outbuf[1024];
    int i, src_is_valid, successed, gensuccessed, match;

    /* buf2 holds a copy of the sequence; refuse lengths it cannot hold */
    if (len < 1 || len > (int) sizeof(buf2))
        return;

    *outbuf = 0;
    src_is_valid = (pg_eucjp_verifier(buf, len) == len);
    if (!src_is_valid)
    {
        if (dispinvalid)
        {
            for (i = 0; i < len; i++)
                scatf(outbuf, "%02x", buf[i]);
            strcat(outbuf, "- Src char is invalid.");
            puts(outbuf);
        }
        return;
    }

    /* Keep an untouched copy for the generic incrementer to work on */
    memcpy(buf2, buf, len);

    for (i = 0; i < len; i++)
        scatf(outbuf, "%02x", ((int) buf[i] & 0xff));
    strcat(outbuf, " => ");

    successed = pg_eucjp_increment(buf, len);
    gensuccessed = pg_generic_charinc(buf2, len);
    match = (memcmp(buf, buf2, len) == 0);

    if (!gensuccessed || !match || dispmatch)
    {
        for (i = 0; i < len; i++)
            scatf(outbuf, "%02x", ((int) buf[i] & 0xff));

        scatf(outbuf, " %s - %s",
              (successed ? "successed" : "FAILED"),
              (match ? "Match to generic inc" : "Don't match to generic inc"));

        if (!match)
        {
            /* Show what the generic incrementer produced instead */
            strcat(outbuf, "(");
            for (i = 0; i < len; i++)
                scatf(outbuf, "%02x", ((int) buf2[i] & 0xff));
            strcat(outbuf, ")");
        }
        puts(outbuf);
    }
}

static bool pg_eucjp_increment(unsigned char *charptr, int length) {    unsigned char bak[3];    bool success;
unsignedchar c1, c2;    signed int i;    memcpy(bak, charptr, length);    c1 = *charptr;    switch (c1)    {
caseSS2:    /* JIS X 0201 */            if (length != 2) return false;            c2 = charptr[1];            if (c2 >
0xde)               charptr[0] = charptr[1] = 0xa1;            else if (c2 < 0xa1)                charptr[1] = 0xa1;
       else                charptr[1]++;            break;        case SS3:    /* JIS X 0212 */            if (length
!=3) return false;            for (i = 2 ; i > 1 ; i--)            {                c2 = charptr[i];                if
(c2< 0xa1)                {                    charptr[i] = 0xa1;                    return true;                }
         else if (c2 < 0xfe)                {                    charptr[i]++;                    break;
}               charptr[i] = 0xa1;            }            if (i == 0)      /* Out of code region */            {
        memcpy(charptr, bak, length);                return false;            }                        break;
default:           if (IS_HIGHBIT_SET(c1))     /* JIS X 0208? */            {                if (length != 2) return
false;                         for (i = 1 ; i >= 0 ; i--)    /* i must be signed */                {
c2= charptr[i];                    if (c2 < 0xa1)                    {                        charptr[i] = 0xa1;
               return true;                    }                    else if (c2 < 0xfe)                    {
           charptr[i]++;                        break;                    }                    charptr[i] = 0xa1;
        }                          if (i < 0)    /*  Out of 2 byte code region */                {
memcpy(charptr,bak, length);                    return false;                }            }            else
{   /* ASCII */                if (c1 > 0x7e)                    return false;                (*charptr)++;
}   }    /* Check the result with pg_eucjp_verifier as the last resort. */    success = (pg_eucjp_verifier(charptr,
length)== length);    if (!success)        memcpy(charptr, bak, length);          return success;
 
}

/* Non-zero when c lies within 0xa1..0xfe inclusive. */
#define IS_EUC_RANGE_VALID(c)    ((c) >= 0xa1 && (c) <= 0xfe)

/*
 * Validate the single EUC-JP character that starts at s.
 *
 * s    first byte of the character
 * len  number of bytes available at s
 *
 * Returns the byte length of the character (1, 2 or 3), or -1 when the
 * bytes are not acceptable or fewer than the required number are available.
 */
static int
pg_eucjp_verifier(const unsigned char *s, int len)
{
    const unsigned char lead = s[0];

    if (lead == SS2)                /* JIS X 0201 */
    {
        if (len < 2)
            return -1;
        if (s[1] < 0xa1 || s[1] > 0xdf)
            return -1;
        return 2;
    }

    if (lead == SS3)                /* JIS X 0212 */
    {
        if (len < 3)
            return -1;
        if (!IS_EUC_RANGE_VALID(s[1]))
            return -1;
        if (!IS_EUC_RANGE_VALID(s[2]))
            return -1;
        return 3;
    }

    if (IS_HIGHBIT_SET(lead))       /* JIS X 0208? */
    {
        if (len < 2)
            return -1;
        if (!IS_EUC_RANGE_VALID(lead))
            return -1;
        if (!IS_EUC_RANGE_VALID(s[1]))
            return -1;
        return 2;
    }

    /* must be ASCII */
    return 1;
}

/*
 * Generic incrementer: bump only the last byte until the character is valid.
 *
 * charptr  first byte of the character; its last byte is modified on success
 * len      byte length of the character
 *
 * Returns true as soon as an incremented last byte makes
 * pg_eucjp_verifier() accept the whole character.  Returns false, with the
 * last byte restored, when the byte reaches 255 without that happening.
 */
static bool pg_generic_charinc(unsigned char *charptr, int len)
{
    unsigned char *lastchar = charptr + len - 1;
    unsigned char savelastchar = *lastchar;

    while (*lastchar < (unsigned char) 255)
    {
        (*lastchar)++;

        /* The validity check here was marked "modified" by the author. */
        if (pg_eucjp_verifier(charptr, len) != len)
            continue;       /* still invalid: try the next byte value */

        return true;
    }

    *lastchar = savelastchar;
    return false;
}

/*
 * Append printf-style formatted text to the NUL-terminated string in buf.
 *
 * buf     destination string; the caller must provide enough room, since
 *         no bounds checking is done
 * format  printf-style format string
 *
 * Returns the value returned by vsprintf() for the appended text.
 */
int scatf(char *buf, char *format, ...)
{
    va_list ap;
    char *tail = buf + strlen(buf);     /* current end of the string */
    int written;

    va_start(ap, format);
    written = vsprintf(tail, format, ap);
    va_end(ap);

    return written;
}

pgsql-hackers by date:

Previous
From: Fujii Masao
Date:
Subject: Re: unite recovery.conf and postgresql.conf
Next
From: Kyotaro HORIGUCHI
Date:
Subject: [REVIEW] Generate column names for subquery expressions