Thread: Request for review: tsearch2 patch
Hi, Here are patches against tsearch2 with CVS head. Currently tsearch2 does not work with multibyte encoding which uses C locale. These patches are intended to solve the problem by using PostgreSQL in-house multibyte function instead of mbstowcs which does not work with C locale. Also iswalpha etc. will not be called in case of C locale since they are not working with it. Tested with the EUC_JP encoding (should be working with any multibye encodings). Existing single byte encodings should not be broken by the patches, I did not test though. -- Tatsuo Ishii SRA OSS, Inc. Japan Index: ts_locale.c =================================================================== RCS file: /cvsroot/pgsql/contrib/tsearch2/ts_locale.c,v retrieving revision 1.7 diff -c -r1.7 ts_locale.c *** ts_locale.c 20 Nov 2006 14:03:30 -0000 1.7 --- ts_locale.c 1 Jan 2007 12:22:50 -0000 *************** *** 63,68 **** --- 63,101 ---- return mbstowcs(to, from, len); } + + #else /* WIN32 */ + + size_t + char2wchar(wchar_t *to, const char *from, size_t len) + { + wchar_t *result; + size_t n; + + if (to == NULL) + return 0; + + if (lc_ctype_is_c) + { + /* allocate neccesary memory for "to" including NULL terminate */ + result = (wchar_t *)palloc((len+1)*sizeof(wchar_t)); + + /* do the conversion */ + n = (size_t)pg_mb2wchar_with_len(from, (pg_wchar *)result, len); + if (n > 0) + { + /* store the result */ + if (n > len) + n = len; + memcpy(to, result, n*sizeof(wchar_t)); + pfree(result); + *(to + n) = '\0'; + } + return n; + } + return mbstowcs(to, from, len); + } + #endif /* WIN32 */ int *************** *** 70,75 **** --- 103,113 ---- { wchar_t character; + if (lc_ctype_is_c) + { + return isalpha(TOUCHAR(ptr)); + } + char2wchar(&character, ptr, 1); return iswalpha((wint_t) character); *************** *** 80,85 **** --- 118,128 ---- { wchar_t character; + if (lc_ctype_is_c) + { + return isprint(TOUCHAR(ptr)); + } + char2wchar(&character, ptr, 1); return iswprint((wint_t) character); *************** *** 
126,132 **** if ( wlen < 0 ) ereport(ERROR, (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), ! errmsg("transalation failed from server encoding to wchar_t"))); Assert(wlen<=len); wstr[wlen] = 0; --- 169,175 ---- if ( wlen < 0 ) ereport(ERROR, (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), ! errmsg("translation failed from server encoding to wchar_t"))); Assert(wlen<=len); wstr[wlen] = 0; *************** *** 152,158 **** if ( wlen < 0 ) ereport(ERROR, (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), ! errmsg("transalation failed from wchar_t to server encoding %d", errno))); Assert(wlen<=len); out[wlen]='\0'; } --- 195,201 ---- if ( wlen < 0 ) ereport(ERROR, (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), ! errmsg("translation failed from wchar_t to server encoding %d", errno))); Assert(wlen<=len); out[wlen]='\0'; } Index: ts_locale.h =================================================================== RCS file: /cvsroot/pgsql/contrib/tsearch2/ts_locale.h,v retrieving revision 1.7 diff -c -r1.7 ts_locale.h *** ts_locale.h 4 Oct 2006 00:29:47 -0000 1.7 --- ts_locale.h 1 Jan 2007 12:22:50 -0000 *************** *** 38,45 **** #else /* WIN32 */ /* correct mbstowcs */ - #define char2wchar mbstowcs #define wchar2char wcstombs #endif /* WIN32 */ #define t_isdigit(x) ( pg_mblen(x)==1&& isdigit( TOUCHAR(x) ) ) --- 38,46 ---- #else /* WIN32 */ /* correct mbstowcs */ #define wchar2char wcstombs + size_t char2wchar(wchar_t *to, const char *from, size_t len); + #endif /* WIN32 */ #define t_isdigit(x) ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) ) *************** *** 54,59 **** --- 55,61 ---- * t_iseq() should be called only for ASCII symbols */ #define t_iseq(x,c) ( (pg_mblen(x)==1) ? 
( TOUCHAR(x)== ((unsigned char)(c)) ) : false ) + /*#define t_iseq(x,c) ( TOUCHAR(x) == ((unsigned char)(c)))*/ #define COPYCHAR(d,s) do { \ int lll= pg_mblen( s ); \ Index: wordparser/parser.c =================================================================== RCS file: /cvsroot/pgsql/contrib/tsearch2/wordparser/parser.c,v retrieving revision 1.11 diff -c -r1.11 parser.c *** wordparser/parser.c 4 Oct 2006 00:29:47 -0000 1.11 --- wordparser/parser.c 1 Jan 2007 12:22:51 -0000 *************** *** 44,52 **** * Some operating systems fail with multi-byte encodings and a C locale. * Also, for a C locale thereis no need to process as multibyte. From * backend/utils/adt/oracle_compat.c Teodor */ ! if (prs->charmaxlen > 1 && !lc_ctype_is_c()) { prs->usewide = true; prs->wstr = (wchar_t *) palloc(sizeof(wchar_t)* prs->lenstr); --- 44,54 ---- * Some operating systems fail with multi-byte encodings and a C locale. * Also, for a C locale thereis no need to process as multibyte. From * backend/utils/adt/oracle_compat.c Teodor + * + * This is wrong assumption. even if locale is C, multibyte is necceary. */ ! if (prs->charmaxlen > 1) { prs->usewide = true; prs->wstr = (wchar_t *) palloc(sizeof(wchar_t)* prs->lenstr); *************** *** 92,98 **** static int \ p_is##type(TParser *prs) { \ Assert( prs->state ); \ ! return ( ( prs->usewide ) ? isw##type( (wint_t)*( prs->wstr + prs->state->poschar ) ) : \ is##type( (unsignedchar)*( prs->str + prs->state->posbyte ) ) ); \ } \ \ --- 94,102 ---- static int \ p_is##type(TParser *prs) { \ Assert( prs->state ); \ ! return ( ( prs->usewide ) ? \ ! (lc_ctype_is_c? is##type( 0xff & *( prs->wstr + prs->state->poschar)): \ ! isw##type( (wint_t)*( prs->wstr + prs->state->poschar))): \ is##type( (unsigned char)*( prs->str+ prs->state->posbyte ) ) ); \ } \ \ *************** *** 134,141 **** } #endif /* TS_USE_WIDE */ ! p_iswhat(alnum) ! p_iswhat(alpha) p_iswhat(digit) p_iswhat(lower) p_iswhat(print) --- 138,197 ---- } #endif /* TS_USE_WIDE */ ! 
static int p_isalnum(TParser *prs) { ! Assert( prs->state ); ! ! if (prs->usewide) ! { ! unsigned int c; ! ! c = *(prs->wstr + prs->state->poschar); ! ! if (lc_ctype_is_c) ! { ! if (c > 0x7f) ! return 1; ! return isalnum(0xff & c); ! } ! else ! return iswalnum( (wint_t)*( prs->wstr + prs->state->poschar)); ! } ! else ! return isalnum( (unsigned char)*( prs->str + prs->state->posbyte )); ! } ! ! static int p_isnotalnum(TParser *prs) ! { ! return !p_isalnum(prs); ! } ! ! static int p_isalpha(TParser *prs) { ! Assert( prs->state ); ! ! if (prs->usewide) ! { ! unsigned int c; ! ! c = *(prs->wstr + prs->state->poschar); ! ! if (lc_ctype_is_c) ! { ! if (c > 0x7f) ! return 1; ! return isalpha(0xff & c); ! } ! else ! return iswalpha( (wint_t)*( prs->wstr + prs->state->poschar)); ! } ! else ! return isalpha( (unsigned char)*( prs->str + prs->state->posbyte )); ! } ! ! static int p_isnotalpha(TParser *prs) ! { ! return !p_isalpha(prs); ! } ! p_iswhat(digit) p_iswhat(lower) p_iswhat(print)
I have tested with local-enabled environment and found a bug. Included is the new version of patches. Teodor, Oleg, what do you think about these patches? If ok, shall I commit to CVS head? -- Tatsuo Ishii SRA OSS, Inc. Japan > Hi, > > Here are patches against tsearch2 with CVS head. Currently tsearch2 > does not work with multibyte encoding which uses C locale. These > patches are intended to solve the problem by using PostgreSQL in-house > multibyte function instead of mbstowcs which does not work with C > locale. Also iswalpha etc. will not be called in case of C locale > since they are not working with it. Tested with the EUC_JP encoding > (should be working with any multibye encodings). Existing single byte > encodings should not be broken by the patches, I did not test though. > -- > Tatsuo Ishii > SRA OSS, Inc. Japan Index: ts_locale.c =================================================================== RCS file: /cvsroot/pgsql/contrib/tsearch2/ts_locale.c,v retrieving revision 1.7 diff -c -r1.7 ts_locale.c *** ts_locale.c 20 Nov 2006 14:03:30 -0000 1.7 --- ts_locale.c 4 Jan 2007 12:16:00 -0000 *************** *** 63,68 **** --- 63,101 ---- return mbstowcs(to, from, len); } + + #else /* WIN32 */ + + size_t + char2wchar(wchar_t *to, const char *from, size_t len) + { + wchar_t *result; + size_t n; + + if (to == NULL) + return 0; + + if (lc_ctype_is_c()) + { + /* allocate neccesary memory for "to" including NULL terminate */ + result = (wchar_t *)palloc((len+1)*sizeof(wchar_t)); + + /* do the conversion */ + n = (size_t)pg_mb2wchar_with_len(from, (pg_wchar *)result, len); + if (n > 0) + { + /* store the result */ + if (n > len) + n = len; + memcpy(to, result, n*sizeof(wchar_t)); + pfree(result); + *(to + n) = '\0'; + } + return n; + } + return mbstowcs(to, from, len); + } + #endif /* WIN32 */ int *************** *** 70,75 **** --- 103,113 ---- { wchar_t character; + if (lc_ctype_is_c()) + { + return isalpha(TOUCHAR(ptr)); + } + char2wchar(&character, ptr, 1); 
return iswalpha((wint_t) character); *************** *** 80,85 **** --- 118,128 ---- { wchar_t character; + if (lc_ctype_is_c()) + { + return isprint(TOUCHAR(ptr)); + } + char2wchar(&character, ptr, 1); return iswprint((wint_t) character); *************** *** 126,132 **** if ( wlen < 0 ) ereport(ERROR, (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), ! errmsg("transalation failed from server encoding to wchar_t"))); Assert(wlen<=len); wstr[wlen] = 0; --- 169,175 ---- if ( wlen < 0 ) ereport(ERROR, (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), ! errmsg("translation failed from server encoding to wchar_t"))); Assert(wlen<=len); wstr[wlen] = 0; *************** *** 152,158 **** if ( wlen < 0 ) ereport(ERROR, (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), ! errmsg("transalation failed from wchar_t to server encoding %d", errno))); Assert(wlen<=len); out[wlen]='\0'; } --- 195,201 ---- if ( wlen < 0 ) ereport(ERROR, (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), ! errmsg("translation failed from wchar_t to server encoding %d", errno))); Assert(wlen<=len); out[wlen]='\0'; } Index: ts_locale.h =================================================================== RCS file: /cvsroot/pgsql/contrib/tsearch2/ts_locale.h,v retrieving revision 1.7 diff -c -r1.7 ts_locale.h *** ts_locale.h 4 Oct 2006 00:29:47 -0000 1.7 --- ts_locale.h 4 Jan 2007 12:16:00 -0000 *************** *** 38,45 **** #else /* WIN32 */ /* correct mbstowcs */ - #define char2wchar mbstowcs #define wchar2char wcstombs #endif /* WIN32 */ #define t_isdigit(x) ( pg_mblen(x)==1&& isdigit( TOUCHAR(x) ) ) --- 38,46 ---- #else /* WIN32 */ /* correct mbstowcs */ #define wchar2char wcstombs + size_t char2wchar(wchar_t *to, const char *from, size_t len); + #endif /* WIN32 */ #define t_isdigit(x) ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) ) *************** *** 54,59 **** --- 55,61 ---- * t_iseq() should be called only for ASCII symbols */ #define t_iseq(x,c) ( (pg_mblen(x)==1) ? 
( TOUCHAR(x)== ((unsigned char)(c)) ) : false ) + /*#define t_iseq(x,c) ( TOUCHAR(x) == ((unsigned char)(c)))*/ #define COPYCHAR(d,s) do { \ int lll= pg_mblen( s ); \ Index: wordparser/parser.c =================================================================== RCS file: /cvsroot/pgsql/contrib/tsearch2/wordparser/parser.c,v retrieving revision 1.11 diff -c -r1.11 parser.c *** wordparser/parser.c 4 Oct 2006 00:29:47 -0000 1.11 --- wordparser/parser.c 4 Jan 2007 12:16:01 -0000 *************** *** 44,52 **** * Some operating systems fail with multi-byte encodings and a C locale. * Also, for a C locale thereis no need to process as multibyte. From * backend/utils/adt/oracle_compat.c Teodor */ ! if (prs->charmaxlen > 1 && !lc_ctype_is_c()) { prs->usewide = true; prs->wstr = (wchar_t *) palloc(sizeof(wchar_t)* prs->lenstr); --- 44,54 ---- * Some operating systems fail with multi-byte encodings and a C locale. * Also, for a C locale thereis no need to process as multibyte. From * backend/utils/adt/oracle_compat.c Teodor + * + * This is wrong assumption. even if locale is C, multibyte is necceary. */ ! if (prs->charmaxlen > 1) { prs->usewide = true; prs->wstr = (wchar_t *) palloc(sizeof(wchar_t)* prs->lenstr); *************** *** 92,98 **** static int \ p_is##type(TParser *prs) { \ Assert( prs->state ); \ ! return ( ( prs->usewide ) ? isw##type( (wint_t)*( prs->wstr + prs->state->poschar ) ) : \ is##type( (unsignedchar)*( prs->str + prs->state->posbyte ) ) ); \ } \ \ --- 94,102 ---- static int \ p_is##type(TParser *prs) { \ Assert( prs->state ); \ ! return ( ( prs->usewide ) ? \ ! (lc_ctype_is_c()? is##type( 0xff & *( prs->wstr + prs->state->poschar)): \ ! isw##type( (wint_t)*( prs->wstr + prs->state->poschar))): \ is##type( (unsigned char)*( prs->str+ prs->state->posbyte ) ) ); \ } \ \ *************** *** 134,141 **** } #endif /* TS_USE_WIDE */ ! p_iswhat(alnum) ! p_iswhat(alpha) p_iswhat(digit) p_iswhat(lower) p_iswhat(print) --- 138,197 ---- } #endif /* TS_USE_WIDE */ ! 
static int p_isalnum(TParser *prs) { ! Assert( prs->state ); ! ! if (prs->usewide) ! { ! unsigned int c; ! ! c = *(prs->wstr + prs->state->poschar); ! ! if (lc_ctype_is_c()) ! { ! if (c > 0x7f) ! return 1; ! return isalnum(0xff & c); ! } ! else ! return iswalnum( (wint_t)*( prs->wstr + prs->state->poschar)); ! } ! else ! return isalnum( (unsigned char)*( prs->str + prs->state->posbyte )); ! } ! ! static int p_isnotalnum(TParser *prs) ! { ! return !p_isalnum(prs); ! } ! ! static int p_isalpha(TParser *prs) { ! Assert( prs->state ); ! ! if (prs->usewide) ! { ! unsigned int c; ! ! c = *(prs->wstr + prs->state->poschar); ! ! if (lc_ctype_is_c()) ! { ! if (c > 0x7f) ! return 1; ! return isalpha(0xff & c); ! } ! else ! return iswalpha( (wint_t)*( prs->wstr + prs->state->poschar)); ! } ! else ! return isalpha( (unsigned char)*( prs->str + prs->state->posbyte )); ! } ! ! static int p_isnotalpha(TParser *prs) ! { ! return !p_isalpha(prs); ! } ! p_iswhat(digit) p_iswhat(lower) p_iswhat(print)
Sorry for delay, I was on holidays :) Did you test patch on Windows platform? Tatsuo Ishii wrote: > I have tested with local-enabled environment and found a bug. Included > is the new version of patches. > > Teodor, Oleg, what do you think about these patches? > If ok, shall I commit to CVS head? > -- > Tatsuo Ishii > SRA OSS, Inc. Japan > >> Hi, >> >> Here are patches against tsearch2 with CVS head. Currently tsearch2 >> does not work with multibyte encoding which uses C locale. These >> patches are intended to solve the problem by using PostgreSQL in-house >> multibyte function instead of mbstowcs which does not work with C >> locale. Also iswalpha etc. will not be called in case of C locale >> since they are not working with it. Tested with the EUC_JP encoding >> (should be working with any multibye encodings). Existing single byte >> encodings should not be broken by the patches, I did not test though. >> -- >> Tatsuo Ishii >> SRA OSS, Inc. Japan >> >> ------------------------------------------------------------------------ >> >> Index: ts_locale.c >> =================================================================== >> RCS file: /cvsroot/pgsql/contrib/tsearch2/ts_locale.c,v >> retrieving revision 1.7 >> diff -c -r1.7 ts_locale.c >> *** ts_locale.c 20 Nov 2006 14:03:30 -0000 1.7 >> --- ts_locale.c 4 Jan 2007 12:16:00 -0000 >> *************** >> *** 63,68 **** >> --- 63,101 ---- >> >> return mbstowcs(to, from, len); >> } >> + >> + #else /* WIN32 */ >> + >> + size_t >> + char2wchar(wchar_t *to, const char *from, size_t len) >> + { >> + wchar_t *result; >> + size_t n; >> + >> + if (to == NULL) >> + return 0; >> + >> + if (lc_ctype_is_c()) >> + { >> + /* allocate neccesary memory for "to" including NULL terminate */ >> + result = (wchar_t *)palloc((len+1)*sizeof(wchar_t)); >> + >> + /* do the conversion */ >> + n = (size_t)pg_mb2wchar_with_len(from, (pg_wchar *)result, len); >> + if (n > 0) >> + { >> + /* store the result */ >> + if (n > len) >> + n = len; >> + 
memcpy(to, result, n*sizeof(wchar_t)); >> + pfree(result); >> + *(to + n) = '\0'; >> + } >> + return n; >> + } >> + return mbstowcs(to, from, len); >> + } >> + >> #endif /* WIN32 */ >> >> int >> *************** >> *** 70,75 **** >> --- 103,113 ---- >> { >> wchar_t character; >> >> + if (lc_ctype_is_c()) >> + { >> + return isalpha(TOUCHAR(ptr)); >> + } >> + >> char2wchar(&character, ptr, 1); >> >> return iswalpha((wint_t) character); >> *************** >> *** 80,85 **** >> --- 118,128 ---- >> { >> wchar_t character; >> >> + if (lc_ctype_is_c()) >> + { >> + return isprint(TOUCHAR(ptr)); >> + } >> + >> char2wchar(&character, ptr, 1); >> >> return iswprint((wint_t) character); >> *************** >> *** 126,132 **** >> if ( wlen < 0 ) >> ereport(ERROR, >> (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), >> ! errmsg("transalation failed from server encoding to wchar_t"))); >> >> Assert(wlen<=len); >> wstr[wlen] = 0; >> --- 169,175 ---- >> if ( wlen < 0 ) >> ereport(ERROR, >> (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), >> ! errmsg("translation failed from server encoding to wchar_t"))); >> >> Assert(wlen<=len); >> wstr[wlen] = 0; >> *************** >> *** 152,158 **** >> if ( wlen < 0 ) >> ereport(ERROR, >> (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), >> ! errmsg("transalation failed from wchar_t to server encoding %d", errno))); >> Assert(wlen<=len); >> out[wlen]='\0'; >> } >> --- 195,201 ---- >> if ( wlen < 0 ) >> ereport(ERROR, >> (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), >> ! 
errmsg("translation failed from wchar_t to server encoding %d", errno))); >> Assert(wlen<=len); >> out[wlen]='\0'; >> } >> Index: ts_locale.h >> =================================================================== >> RCS file: /cvsroot/pgsql/contrib/tsearch2/ts_locale.h,v >> retrieving revision 1.7 >> diff -c -r1.7 ts_locale.h >> *** ts_locale.h 4 Oct 2006 00:29:47 -0000 1.7 >> --- ts_locale.h 4 Jan 2007 12:16:00 -0000 >> *************** >> *** 38,45 **** >> #else /* WIN32 */ >> >> /* correct mbstowcs */ >> - #define char2wchar mbstowcs >> #define wchar2char wcstombs >> #endif /* WIN32 */ >> >> #define t_isdigit(x) ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) ) >> --- 38,46 ---- >> #else /* WIN32 */ >> >> /* correct mbstowcs */ >> #define wchar2char wcstombs >> + size_t char2wchar(wchar_t *to, const char *from, size_t len); >> + >> #endif /* WIN32 */ >> >> #define t_isdigit(x) ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) ) >> *************** >> *** 54,59 **** >> --- 55,61 ---- >> * t_iseq() should be called only for ASCII symbols >> */ >> #define t_iseq(x,c) ( (pg_mblen(x)==1) ? ( TOUCHAR(x) == ((unsigned char)(c)) ) : false ) >> + /*#define t_iseq(x,c) ( TOUCHAR(x) == ((unsigned char)(c)))*/ >> >> #define COPYCHAR(d,s) do { \ >> int lll = pg_mblen( s ); \ >> Index: wordparser/parser.c >> =================================================================== >> RCS file: /cvsroot/pgsql/contrib/tsearch2/wordparser/parser.c,v >> retrieving revision 1.11 >> diff -c -r1.11 parser.c >> *** wordparser/parser.c 4 Oct 2006 00:29:47 -0000 1.11 >> --- wordparser/parser.c 4 Jan 2007 12:16:01 -0000 >> *************** >> *** 44,52 **** >> * Some operating systems fail with multi-byte encodings and a C locale. >> * Also, for a C locale there is no need to process as multibyte. From >> * backend/utils/adt/oracle_compat.c Teodor >> */ >> >> ! 
if (prs->charmaxlen > 1 && !lc_ctype_is_c()) >> { >> prs->usewide = true; >> prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * prs->lenstr); >> --- 44,54 ---- >> * Some operating systems fail with multi-byte encodings and a C locale. >> * Also, for a C locale there is no need to process as multibyte. From >> * backend/utils/adt/oracle_compat.c Teodor >> + * >> + * This is wrong assumption. even if locale is C, multibyte is necceary. >> */ >> >> ! if (prs->charmaxlen > 1) >> { >> prs->usewide = true; >> prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * prs->lenstr); >> *************** >> *** 92,98 **** >> static int \ >> p_is##type(TParser *prs) { \ >> Assert( prs->state ); \ >> ! return ( ( prs->usewide ) ? isw##type( (wint_t)*( prs->wstr + prs->state->poschar ) ) : \ >> is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ) ); \ >> } \ >> \ >> --- 94,102 ---- >> static int \ >> p_is##type(TParser *prs) { \ >> Assert( prs->state ); \ >> ! return ( ( prs->usewide ) ? \ >> ! (lc_ctype_is_c()? is##type( 0xff & *( prs->wstr + prs->state->poschar)): \ >> ! isw##type( (wint_t)*( prs->wstr + prs->state->poschar))): \ >> is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ) ); \ >> } \ >> \ >> *************** >> *** 134,141 **** >> } >> #endif /* TS_USE_WIDE */ >> >> ! p_iswhat(alnum) >> ! p_iswhat(alpha) >> p_iswhat(digit) >> p_iswhat(lower) >> p_iswhat(print) >> --- 138,197 ---- >> } >> #endif /* TS_USE_WIDE */ >> >> ! static int p_isalnum(TParser *prs) { >> ! Assert( prs->state ); >> ! >> ! if (prs->usewide) >> ! { >> ! unsigned int c; >> ! >> ! c = *(prs->wstr + prs->state->poschar); >> ! >> ! if (lc_ctype_is_c()) >> ! { >> ! if (c > 0x7f) >> ! return 1; >> ! return isalnum(0xff & c); >> ! } >> ! else >> ! return iswalnum( (wint_t)*( prs->wstr + prs->state->poschar)); >> ! } >> ! else >> ! return isalnum( (unsigned char)*( prs->str + prs->state->posbyte )); >> ! } >> ! >> ! static int p_isnotalnum(TParser *prs) >> ! { >> ! return !p_isalnum(prs); >> ! 
} >> ! >> ! static int p_isalpha(TParser *prs) { >> ! Assert( prs->state ); >> ! >> ! if (prs->usewide) >> ! { >> ! unsigned int c; >> ! >> ! c = *(prs->wstr + prs->state->poschar); >> ! >> ! if (lc_ctype_is_c()) >> ! { >> ! if (c > 0x7f) >> ! return 1; >> ! return isalpha(0xff & c); >> ! } >> ! else >> ! return iswalpha( (wint_t)*( prs->wstr + prs->state->poschar)); >> ! } >> ! else >> ! return isalpha( (unsigned char)*( prs->str + prs->state->posbyte )); >> ! } >> ! >> ! static int p_isnotalpha(TParser *prs) >> ! { >> ! return !p_isalpha(prs); >> ! } >> ! >> p_iswhat(digit) >> p_iswhat(lower) >> p_iswhat(print) >> >> ------------------------------------------------------------------------ >> >> >> ---------------------------(end of broadcast)--------------------------- >> TIP 9: In versions below 8.0, the planner will ignore your desire to >> choose an index scan if your joining column's datatypes do not >> match -- Teodor Sigaev E-mail: teodor@sigaev.ru WWW: http://www.sigaev.ru/
> Sorry for delay, I was on holidays :) > > Did you test patch on Windows platform? No. I myself does not use Windows platform. Do you have any concern on Windows regarding my patches? -- Tatsuo Ishii SRA OSS, Inc. Japan > Tatsuo Ishii wrote: > > I have tested with local-enabled environment and found a bug. Included > > is the new version of patches. > > > > Teodor, Oleg, what do you think about these patches? > > If ok, shall I commit to CVS head? > > -- > > Tatsuo Ishii > > SRA OSS, Inc. Japan > > > >> Hi, > >> > >> Here are patches against tsearch2 with CVS head. Currently tsearch2 > >> does not work with multibyte encoding which uses C locale. These > >> patches are intended to solve the problem by using PostgreSQL in-house > >> multibyte function instead of mbstowcs which does not work with C > >> locale. Also iswalpha etc. will not be called in case of C locale > >> since they are not working with it. Tested with the EUC_JP encoding > >> (should be working with any multibye encodings). Existing single byte > >> encodings should not be broken by the patches, I did not test though. > >> -- > >> Tatsuo Ishii > >> SRA OSS, Inc. 
Japan > >> > >> ------------------------------------------------------------------------ > >> > >> Index: ts_locale.c > >> =================================================================== > >> RCS file: /cvsroot/pgsql/contrib/tsearch2/ts_locale.c,v > >> retrieving revision 1.7 > >> diff -c -r1.7 ts_locale.c > >> *** ts_locale.c 20 Nov 2006 14:03:30 -0000 1.7 > >> --- ts_locale.c 4 Jan 2007 12:16:00 -0000 > >> *************** > >> *** 63,68 **** > >> --- 63,101 ---- > >> > >> return mbstowcs(to, from, len); > >> } > >> + > >> + #else /* WIN32 */ > >> + > >> + size_t > >> + char2wchar(wchar_t *to, const char *from, size_t len) > >> + { > >> + wchar_t *result; > >> + size_t n; > >> + > >> + if (to == NULL) > >> + return 0; > >> + > >> + if (lc_ctype_is_c()) > >> + { > >> + /* allocate neccesary memory for "to" including NULL terminate */ > >> + result = (wchar_t *)palloc((len+1)*sizeof(wchar_t)); > >> + > >> + /* do the conversion */ > >> + n = (size_t)pg_mb2wchar_with_len(from, (pg_wchar *)result, len); > >> + if (n > 0) > >> + { > >> + /* store the result */ > >> + if (n > len) > >> + n = len; > >> + memcpy(to, result, n*sizeof(wchar_t)); > >> + pfree(result); > >> + *(to + n) = '\0'; > >> + } > >> + return n; > >> + } > >> + return mbstowcs(to, from, len); > >> + } > >> + > >> #endif /* WIN32 */ > >> > >> int > >> *************** > >> *** 70,75 **** > >> --- 103,113 ---- > >> { > >> wchar_t character; > >> > >> + if (lc_ctype_is_c()) > >> + { > >> + return isalpha(TOUCHAR(ptr)); > >> + } > >> + > >> char2wchar(&character, ptr, 1); > >> > >> return iswalpha((wint_t) character); > >> *************** > >> *** 80,85 **** > >> --- 118,128 ---- > >> { > >> wchar_t character; > >> > >> + if (lc_ctype_is_c()) > >> + { > >> + return isprint(TOUCHAR(ptr)); > >> + } > >> + > >> char2wchar(&character, ptr, 1); > >> > >> return iswprint((wint_t) character); > >> *************** > >> *** 126,132 **** > >> if ( wlen < 0 ) > >> ereport(ERROR, > >> 
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), > >> ! errmsg("transalation failed from server encoding to wchar_t"))); > >> > >> Assert(wlen<=len); > >> wstr[wlen] = 0; > >> --- 169,175 ---- > >> if ( wlen < 0 ) > >> ereport(ERROR, > >> (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), > >> ! errmsg("translation failed from server encoding to wchar_t"))); > >> > >> Assert(wlen<=len); > >> wstr[wlen] = 0; > >> *************** > >> *** 152,158 **** > >> if ( wlen < 0 ) > >> ereport(ERROR, > >> (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), > >> ! errmsg("transalation failed from wchar_t to server encoding %d", errno))); > >> Assert(wlen<=len); > >> out[wlen]='\0'; > >> } > >> --- 195,201 ---- > >> if ( wlen < 0 ) > >> ereport(ERROR, > >> (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), > >> ! errmsg("translation failed from wchar_t to server encoding %d", errno))); > >> Assert(wlen<=len); > >> out[wlen]='\0'; > >> } > >> Index: ts_locale.h > >> =================================================================== > >> RCS file: /cvsroot/pgsql/contrib/tsearch2/ts_locale.h,v > >> retrieving revision 1.7 > >> diff -c -r1.7 ts_locale.h > >> *** ts_locale.h 4 Oct 2006 00:29:47 -0000 1.7 > >> --- ts_locale.h 4 Jan 2007 12:16:00 -0000 > >> *************** > >> *** 38,45 **** > >> #else /* WIN32 */ > >> > >> /* correct mbstowcs */ > >> - #define char2wchar mbstowcs > >> #define wchar2char wcstombs > >> #endif /* WIN32 */ > >> > >> #define t_isdigit(x) ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) ) > >> --- 38,46 ---- > >> #else /* WIN32 */ > >> > >> /* correct mbstowcs */ > >> #define wchar2char wcstombs > >> + size_t char2wchar(wchar_t *to, const char *from, size_t len); > >> + > >> #endif /* WIN32 */ > >> > >> #define t_isdigit(x) ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) ) > >> *************** > >> *** 54,59 **** > >> --- 55,61 ---- > >> * t_iseq() should be called only for ASCII symbols > >> */ > >> #define t_iseq(x,c) ( (pg_mblen(x)==1) ? 
( TOUCHAR(x) == ((unsigned char)(c)) ) : false ) > >> + /*#define t_iseq(x,c) ( TOUCHAR(x) == ((unsigned char)(c)))*/ > >> > >> #define COPYCHAR(d,s) do { \ > >> int lll = pg_mblen( s ); \ > >> Index: wordparser/parser.c > >> =================================================================== > >> RCS file: /cvsroot/pgsql/contrib/tsearch2/wordparser/parser.c,v > >> retrieving revision 1.11 > >> diff -c -r1.11 parser.c > >> *** wordparser/parser.c 4 Oct 2006 00:29:47 -0000 1.11 > >> --- wordparser/parser.c 4 Jan 2007 12:16:01 -0000 > >> *************** > >> *** 44,52 **** > >> * Some operating systems fail with multi-byte encodings and a C locale. > >> * Also, for a C locale there is no need to process as multibyte. From > >> * backend/utils/adt/oracle_compat.c Teodor > >> */ > >> > >> ! if (prs->charmaxlen > 1 && !lc_ctype_is_c()) > >> { > >> prs->usewide = true; > >> prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * prs->lenstr); > >> --- 44,54 ---- > >> * Some operating systems fail with multi-byte encodings and a C locale. > >> * Also, for a C locale there is no need to process as multibyte. From > >> * backend/utils/adt/oracle_compat.c Teodor > >> + * > >> + * This is wrong assumption. even if locale is C, multibyte is necceary. > >> */ > >> > >> ! if (prs->charmaxlen > 1) > >> { > >> prs->usewide = true; > >> prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * prs->lenstr); > >> *************** > >> *** 92,98 **** > >> static int \ > >> p_is##type(TParser *prs) { \ > >> Assert( prs->state ); \ > >> ! return ( ( prs->usewide ) ? isw##type( (wint_t)*( prs->wstr + prs->state->poschar ) ) : \ > >> is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ) ); \ > >> } \ > >> \ > >> --- 94,102 ---- > >> static int \ > >> p_is##type(TParser *prs) { \ > >> Assert( prs->state ); \ > >> ! return ( ( prs->usewide ) ? \ > >> ! (lc_ctype_is_c()? is##type( 0xff & *( prs->wstr + prs->state->poschar)): \ > >> ! 
isw##type( (wint_t)*( prs->wstr + prs->state->poschar))): \ > >> is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ) ); \ > >> } \ > >> \ > >> *************** > >> *** 134,141 **** > >> } > >> #endif /* TS_USE_WIDE */ > >> > >> ! p_iswhat(alnum) > >> ! p_iswhat(alpha) > >> p_iswhat(digit) > >> p_iswhat(lower) > >> p_iswhat(print) > >> --- 138,197 ---- > >> } > >> #endif /* TS_USE_WIDE */ > >> > >> ! static int p_isalnum(TParser *prs) { > >> ! Assert( prs->state ); > >> ! > >> ! if (prs->usewide) > >> ! { > >> ! unsigned int c; > >> ! > >> ! c = *(prs->wstr + prs->state->poschar); > >> ! > >> ! if (lc_ctype_is_c()) > >> ! { > >> ! if (c > 0x7f) > >> ! return 1; > >> ! return isalnum(0xff & c); > >> ! } > >> ! else > >> ! return iswalnum( (wint_t)*( prs->wstr + prs->state->poschar)); > >> ! } > >> ! else > >> ! return isalnum( (unsigned char)*( prs->str + prs->state->posbyte )); > >> ! } > >> ! > >> ! static int p_isnotalnum(TParser *prs) > >> ! { > >> ! return !p_isalnum(prs); > >> ! } > >> ! > >> ! static int p_isalpha(TParser *prs) { > >> ! Assert( prs->state ); > >> ! > >> ! if (prs->usewide) > >> ! { > >> ! unsigned int c; > >> ! > >> ! c = *(prs->wstr + prs->state->poschar); > >> ! > >> ! if (lc_ctype_is_c()) > >> ! { > >> ! if (c > 0x7f) > >> ! return 1; > >> ! return isalpha(0xff & c); > >> ! } > >> ! else > >> ! return iswalpha( (wint_t)*( prs->wstr + prs->state->poschar)); > >> ! } > >> ! else > >> ! return isalpha( (unsigned char)*( prs->str + prs->state->posbyte )); > >> ! } > >> ! > >> ! static int p_isnotalpha(TParser *prs) > >> ! { > >> ! return !p_isalpha(prs); > >> ! } > >> ! 
> >> p_iswhat(digit) > >> p_iswhat(lower) > >> p_iswhat(print) > >> > >> ------------------------------------------------------------------------ > >> > >> > >> ---------------------------(end of broadcast)--------------------------- > >> TIP 9: In versions below 8.0, the planner will ignore your desire to > >> choose an index scan if your joining column's datatypes do not > >> match > > -- > Teodor Sigaev E-mail: teodor@sigaev.ru > WWW: http://www.sigaev.ru/ >
> I have tested with local-enabled environment and found a bug. Included > is the new version of patches. Your patch causes crash on tsearch2's installcheck with 'initdb -E UTF8 --locale C', simple way to reproduce: # select to_tsquery('default', '''New York'''); server closed the connection unexpectedly This probably means the server terminated abnormally before or while processing the request. The connection to the server was lost. Attempting reset: Failed. >> ! static int p_isalnum(TParser *prs) { ... >> ! if (lc_ctype_is_c()) >> ! { >> ! if (c > 0x7f) >> ! return 1; I have some doubts that any character greater than 0x7f is an alpha symbol. Is it a simple assumption or a workaround? -- Teodor Sigaev E-mail: teodor@sigaev.ru WWW: http://www.sigaev.ru/
From: Teodor Sigaev <teodor@sigaev.ru> Subject: Re: [HACKERS] Request for review: tsearch2 patch Date: Wed, 10 Jan 2007 18:50:44 +0300 Message-ID: <45A50B54.6090608@sigaev.ru> > > I have tested with local-enabled environment and found a bug. Included > > is the new version of patches. > Your patch causes crash on tsearch2's installcheck with 'initdb -E UTF8 --locale > C', simple way to reproduce: > # select to_tsquery('default', '''New York'''); > server closed the connection unexpectedly > This probably means the server terminated abnormally > before or while processing the request. > The connection to the server was lost. Attempting reset: Failed. It seems it's a bug with original tsearch2. Here is the patch. ------------------------------------------------------------------ *** wordparser/parser.c~ 2007-01-07 09:54:39.000000000 +0900 --- wordparser/parser.c 2007-01-11 10:33:41.000000000 +0900 *************** *** 51,57 **** if (prs->charmaxlen > 1) { prs->usewide = true; ! prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * prs->lenstr); prs->lenwstr = char2wchar(prs->wstr, prs->str,prs->lenstr); } else --- 51,57 ---- if (prs->charmaxlen > 1) { prs->usewide = true; ! prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr+1)); prs->lenwstr = char2wchar(prs->wstr,prs->str, prs->lenstr); } else ------------------------------------------------------------------ > >> ! static int p_isalnum(TParser *prs) { > ... > >> ! if (lc_ctype_is_c()) > >> ! { > >> ! if (c > 0x7f) > >> ! return 1; > > I have some some doubts that any character greater than 0x7f is an alpha symbol. > Is it simple assumption or workaround? Yeah, it's a workaround. Since there's no concept other than alpha/numeric/latin in tsearch2, Asian characters have to fall into one of them. -- Tatsuo Ishii SRA OSS, Inc. Japan
> > I have tested with local-enabled environment and found a bug. Included > > is the new version of patches. > Your patch causes crash on tsearch2's installcheck with 'initdb -E UTF8 --locale > C', simple way to reproduce: > # select to_tsquery('default', '''New York'''); > server closed the connection unexpectedly > This probably means the server terminated abnormally > before or while processing the request. > The connection to the server was lost. Attempting reset: Failed. It seems it's a bug with original tsearch2. Here are the patches. ------------------------------------------------------------------ *** wordparser/parser.c~ 2007-01-07 09:54:39.000000000 +0900 --- wordparser/parser.c 2007-01-11 10:33:41.000000000 +0900 *************** *** 51,57 **** if (prs->charmaxlen > 1) { prs->usewide = true; ! prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * prs->lenstr); prs->lenwstr = char2wchar(prs->wstr, prs->str,prs->lenstr); } else --- 51,57 ---- if (prs->charmaxlen > 1) { prs->usewide = true; ! prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr+1)); prs->lenwstr = char2wchar(prs->wstr,prs->str, prs->lenstr); } else ------------------------------------------------------------------ > >> ! static int p_isalnum(TParser *prs) { > ... > >> ! if (lc_ctype_is_c()) > >> ! { > >> ! if (c > 0x7f) > >> ! return 1; > > I have some doubts that any character greater than 0x7f is an alpha symbol. > Is it a simple assumption or a workaround? Yeah, it's a workaround. Since there's no concept other than alpha/numeric/latin in tsearch2, Asian characters have to fall into one of them. -- Tatsuo Ishii SRA OSS, Inc. Japan
> Yeah, it's a workaround. Since there's no concept other than > alpha/numeric/latin in tsearch2, Asian characters have to be fall in > one of them. Ok, I see. Pls, test attached patch - if it is good then I'll commit it at Monday to HEAD and 8.2 branches. PS. Magnus, may I ask you to test under Windows? Thank you. -- Teodor Sigaev E-mail: teodor@sigaev.ru WWW: http://www.sigaev.ru/ diff -c -r -N ../tsearch2.orig/ts_locale.c ./ts_locale.c *** ../tsearch2.orig/ts_locale.c Fri Jan 12 10:53:11 2007 --- ./ts_locale.c Fri Jan 12 18:10:27 2007 *************** *** 12,24 **** size_t wchar2char(char *to, const wchar_t *from, size_t len) { if (GetDatabaseEncoding() == PG_UTF8) { int r; - if (len == 0) - return 0; - r = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, len, NULL, NULL); --- 12,24 ---- size_t wchar2char(char *to, const wchar_t *from, size_t len) { + if (len == 0) + return 0; + if (GetDatabaseEncoding() == PG_UTF8) { int r; r = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, len, NULL, NULL); *************** *** 34,50 **** return wcstombs(to, from, len); } size_t char2wchar(wchar_t *to, const char *from, size_t len) { if (GetDatabaseEncoding() == PG_UTF8) { int r; - if (len == 0) - return 0; - r = MultiByteToWideChar(CP_UTF8, 0, from, len, to, len); if (!r) --- 34,52 ---- return wcstombs(to, from, len); } + #endif /* WIN32 */ size_t char2wchar(wchar_t *to, const char *from, size_t len) { + if (len == 0) + return 0; + + #ifdef WIN32 if (GetDatabaseEncoding() == PG_UTF8) { int r; r = MultiByteToWideChar(CP_UTF8, 0, from, len, to, len); if (!r) *************** *** 60,88 **** return r; } return mbstowcs(to, from, len); } - #endif /* WIN32 */ int _t_isalpha(const char *ptr) { ! wchar_t character; ! char2wchar(&character, ptr, 1); ! return iswalpha((wint_t) character); } int _t_isprint(const char *ptr) { ! wchar_t character; ! char2wchar(&character, ptr, 1); ! 
return iswprint((wint_t) character); } #endif /* TS_USE_WIDE */ --- 62,105 ---- return r; } + else + #endif /* WIN32 */ + if ( lc_ctype_is_c() ) + { + /* + * pg_mb2wchar_with_len always adds trailing '\0', so + * 'to' should be allocated with sufficient space + */ + return pg_mb2wchar_with_len(from, (pg_wchar *)to, len); + } return mbstowcs(to, from, len); } int _t_isalpha(const char *ptr) { ! wchar_t character[2]; ! ! if (lc_ctype_is_c()) ! return isalpha(TOUCHAR(ptr)); ! char2wchar(character, ptr, 1); ! return iswalpha((wint_t) *character); } int _t_isprint(const char *ptr) { ! wchar_t character[2]; ! ! if (lc_ctype_is_c()) ! return isprint(TOUCHAR(ptr)); ! char2wchar(character, ptr, 1); ! return iswprint((wint_t) *character); } #endif /* TS_USE_WIDE */ *************** *** 126,132 **** if ( wlen < 0 ) ereport(ERROR, (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), ! errmsg("transalation failed from server encoding to wchar_t"))); Assert(wlen<=len); wstr[wlen] = 0; --- 143,149 ---- if ( wlen < 0 ) ereport(ERROR, (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), ! errmsg("translation failed from server encoding to wchar_t"))); Assert(wlen<=len); wstr[wlen] = 0; *************** *** 152,158 **** if ( wlen < 0 ) ereport(ERROR, (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), ! errmsg("transalation failed from wchar_t to server encoding %d", errno))); Assert(wlen<=len); out[wlen]='\0'; } --- 169,175 ---- if ( wlen < 0 ) ereport(ERROR, (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), ! errmsg("translation failed from wchar_t to server encoding %d", errno))); Assert(wlen<=len); out[wlen]='\0'; } diff -c -r -N ../tsearch2.orig/ts_locale.h ./ts_locale.h *** ../tsearch2.orig/ts_locale.h Fri Jan 12 10:53:11 2007 --- ./ts_locale.h Fri Jan 12 18:10:19 2007 *************** *** 30,45 **** #define TOUCHAR(x) (*((unsigned char*)(x))) #ifdef TS_USE_WIDE #ifdef WIN32 size_t wchar2char(char *to, const wchar_t *from, size_t len); ! 
size_t char2wchar(wchar_t *to, const char *from, size_t len); #else /* WIN32 */ ! /* correct mbstowcs */ ! #define char2wchar mbstowcs #define wchar2char wcstombs #endif /* WIN32 */ #define t_isdigit(x) ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) ) --- 30,46 ---- #define TOUCHAR(x) (*((unsigned char*)(x))) #ifdef TS_USE_WIDE + size_t char2wchar(wchar_t *to, const char *from, size_t len); #ifdef WIN32 size_t wchar2char(char *to, const wchar_t *from, size_t len); ! #else /* WIN32 */ ! /* correct wcstombs */ #define wchar2char wcstombs + #endif /* WIN32 */ #define t_isdigit(x) ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) ) *************** *** 55,64 **** */ #define t_iseq(x,c) ( (pg_mblen(x)==1) ? ( TOUCHAR(x) == ((unsigned char)(c)) ) : false ) ! #define COPYCHAR(d,s) do { \ ! int lll = pg_mblen( s ); \ ! \ ! while( lll-- ) \ TOUCHAR((d)+lll) = TOUCHAR((s)+lll); \ } while(0) --- 56,65 ---- */ #define t_iseq(x,c) ( (pg_mblen(x)==1) ? ( TOUCHAR(x) == ((unsigned char)(c)) ) : false ) ! #define COPYCHAR(d,s) do { \ ! int lll = pg_mblen( s ); \ ! \ ! 
while( lll-- ) \ TOUCHAR((d)+lll) = TOUCHAR((s)+lll); \ } while(0) diff -c -r -N ../tsearch2.orig/tsearch2.patch ./tsearch2.patch *** ../tsearch2.orig/tsearch2.patch Thu Jan 1 03:00:00 1970 --- ./tsearch2.patch Fri Jan 12 18:12:30 2007 *************** *** 0 **** --- 1,243 ---- + diff -c -r -N ../tsearch2.orig/ts_locale.c ./ts_locale.c + *** ../tsearch2.orig/ts_locale.c Fri Jan 12 10:53:11 2007 + --- ./ts_locale.c Fri Jan 12 18:10:27 2007 + *************** + *** 12,24 **** + size_t + wchar2char(char *to, const wchar_t *from, size_t len) + { + if (GetDatabaseEncoding() == PG_UTF8) + { + int r; + + - if (len == 0) + - return 0; + - + r = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, len, + NULL, NULL); + + --- 12,24 ---- + size_t + wchar2char(char *to, const wchar_t *from, size_t len) + { + + if (len == 0) + + return 0; + + + if (GetDatabaseEncoding() == PG_UTF8) + { + int r; + + r = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, len, + NULL, NULL); + + *************** + *** 34,50 **** + + return wcstombs(to, from, len); + } + + size_t + char2wchar(wchar_t *to, const char *from, size_t len) + { + if (GetDatabaseEncoding() == PG_UTF8) + { + int r; + + - if (len == 0) + - return 0; + - + r = MultiByteToWideChar(CP_UTF8, 0, from, len, to, len); + + if (!r) + --- 34,52 ---- + + return wcstombs(to, from, len); + } + + #endif /* WIN32 */ + + size_t + char2wchar(wchar_t *to, const char *from, size_t len) + { + + if (len == 0) + + return 0; + + + + #ifdef WIN32 + if (GetDatabaseEncoding() == PG_UTF8) + { + int r; + + r = MultiByteToWideChar(CP_UTF8, 0, from, len, to, len); + + if (!r) + *************** + *** 60,88 **** + + return r; + } + + return mbstowcs(to, from, len); + } + - #endif /* WIN32 */ + + int + _t_isalpha(const char *ptr) + { + ! wchar_t character; + + ! char2wchar(&character, ptr, 1); + + ! return iswalpha((wint_t) character); + } + + int + _t_isprint(const char *ptr) + { + ! wchar_t character; + + ! char2wchar(&character, ptr, 1); + + ! 
return iswprint((wint_t) character); + } + #endif /* TS_USE_WIDE */ + + --- 62,105 ---- + + return r; + } + + else + + #endif /* WIN32 */ + + if ( lc_ctype_is_c() ) + + { + + /* + + * pg_mb2wchar_with_len always adds trailing '\0', so + + * 'to' should be allocated with sufficient space + + */ + + return pg_mb2wchar_with_len(from, (pg_wchar *)to, len); + + } + + return mbstowcs(to, from, len); + } + + int + _t_isalpha(const char *ptr) + { + ! wchar_t character[2]; + ! + ! if (lc_ctype_is_c()) + ! return isalpha(TOUCHAR(ptr)); + + ! char2wchar(character, ptr, 1); + + ! return iswalpha((wint_t) *character); + } + + int + _t_isprint(const char *ptr) + { + ! wchar_t character[2]; + ! + ! if (lc_ctype_is_c()) + ! return isprint(TOUCHAR(ptr)); + + ! char2wchar(character, ptr, 1); + + ! return iswprint((wint_t) *character); + } + #endif /* TS_USE_WIDE */ + + *************** + *** 126,132 **** + if ( wlen < 0 ) + ereport(ERROR, + (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), + ! errmsg("transalation failed from server encoding to wchar_t"))); + + Assert(wlen<=len); + wstr[wlen] = 0; + --- 143,149 ---- + if ( wlen < 0 ) + ereport(ERROR, + (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), + ! errmsg("translation failed from server encoding to wchar_t"))); + + Assert(wlen<=len); + wstr[wlen] = 0; + *************** + *** 152,158 **** + if ( wlen < 0 ) + ereport(ERROR, + (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), + ! errmsg("transalation failed from wchar_t to server encoding %d", errno))); + Assert(wlen<=len); + out[wlen]='\0'; + } + --- 169,175 ---- + if ( wlen < 0 ) + ereport(ERROR, + (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), + ! 
errmsg("translation failed from wchar_t to server encoding %d", errno))); + Assert(wlen<=len); + out[wlen]='\0'; + } + diff -c -r -N ../tsearch2.orig/ts_locale.h ./ts_locale.h + *** ../tsearch2.orig/ts_locale.h Fri Jan 12 10:53:11 2007 + --- ./ts_locale.h Fri Jan 12 18:10:19 2007 + *************** + *** 30,45 **** + #define TOUCHAR(x) (*((unsigned char*)(x))) + + #ifdef TS_USE_WIDE + + #ifdef WIN32 + + size_t wchar2char(char *to, const wchar_t *from, size_t len); + ! size_t char2wchar(wchar_t *to, const char *from, size_t len); + #else /* WIN32 */ + + ! /* correct mbstowcs */ + ! #define char2wchar mbstowcs + #define wchar2char wcstombs + #endif /* WIN32 */ + + #define t_isdigit(x) ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) ) + --- 30,46 ---- + #define TOUCHAR(x) (*((unsigned char*)(x))) + + #ifdef TS_USE_WIDE + + size_t char2wchar(wchar_t *to, const char *from, size_t len); + + #ifdef WIN32 + + size_t wchar2char(char *to, const wchar_t *from, size_t len); + ! + #else /* WIN32 */ + + ! /* correct wcstombs */ + #define wchar2char wcstombs + + + #endif /* WIN32 */ + + #define t_isdigit(x) ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) ) + *************** + *** 55,64 **** + */ + #define t_iseq(x,c) ( (pg_mblen(x)==1) ? ( TOUCHAR(x) == ((unsigned char)(c)) ) : false ) + + ! #define COPYCHAR(d,s) do { \ + ! int lll = pg_mblen( s ); \ + ! \ + ! while( lll-- ) \ + TOUCHAR((d)+lll) = TOUCHAR((s)+lll); \ + } while(0) + + --- 56,65 ---- + */ + #define t_iseq(x,c) ( (pg_mblen(x)==1) ? ( TOUCHAR(x) == ((unsigned char)(c)) ) : false ) + + ! #define COPYCHAR(d,s) do { \ + ! int lll = pg_mblen( s ); \ + ! \ + ! while( lll-- ) \ + TOUCHAR((d)+lll) = TOUCHAR((s)+lll); \ + } while(0) + diff -c -r -N ../tsearch2.orig/wordparser/parser.c ./wordparser/parser.c *** ../tsearch2.orig/wordparser/parser.c Fri Jan 12 10:53:11 2007 --- ./wordparser/parser.c Fri Jan 12 18:10:38 2007 *************** *** 40,55 **** #ifdef TS_USE_WIDE /* ! 
* Use wide char code only when max encoding length > 1 and ctype != C. ! * Some operating systems fail with multi-byte encodings and a C locale. ! * Also, for a C locale there is no need to process as multibyte. From ! * backend/utils/adt/oracle_compat.c Teodor */ ! if (prs->charmaxlen > 1 && !lc_ctype_is_c()) { prs->usewide = true; ! prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * prs->lenstr); prs->lenwstr = char2wchar(prs->wstr, prs->str, prs->lenstr); } else --- 40,52 ---- #ifdef TS_USE_WIDE /* ! * Use wide char code only when max encoding length > 1. */ ! if (prs->charmaxlen > 1) { prs->usewide = true; ! prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr+1)); prs->lenwstr = char2wchar(prs->wstr, prs->str, prs->lenstr); } else *************** *** 83,107 **** /* * defining support function, equvalent is* macroses, but ! * working with any possible encodings and locales */ #ifdef TS_USE_WIDE ! #define p_iswhat(type) \ ! static int \ ! p_is##type(TParser *prs) { \ ! Assert( prs->state ); \ ! return ( ( prs->usewide ) ? isw##type( (wint_t)*( prs->wstr + prs->state->poschar ) ) : \ ! is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ) ); \ ! } \ ! \ ! static int \ ! p_isnot##type(TParser *prs) { \ ! return !p_is##type(prs); \ } /* p_iseq should be used only for ascii symbols */ --- 80,178 ---- /* * defining support function, equvalent is* macroses, but ! * working with any possible encodings and locales. Note, ! * that with multibyte encoding and C-locale isw* function may fail ! * or give wrong result. Note 2: multibyte encoding and C-locale ! * often are used for Asian languages. */ #ifdef TS_USE_WIDE ! #define p_iswhat(type) \ ! static int \ ! p_is##type(TParser *prs) { \ ! Assert( prs->state ); \ ! if ( prs->usewide ) \ ! { \ ! if ( lc_ctype_is_c() ) \ ! return is##type( 0xff & *( prs->wstr + prs->state->poschar) ); \ ! \ ! return isw##type( *(wint_t*)( prs->wstr + prs->state->poschar ) ); \ ! } \ ! \ ! 
return is##type( *(unsigned char*)( prs->str + prs->state->posbyte ) ); \ ! } \ ! \ ! static int \ ! p_isnot##type(TParser *prs) { \ ! return !p_is##type(prs); \ } + static int + p_isalnum(TParser *prs) + { + Assert( prs->state ); + + if (prs->usewide) + { + if (lc_ctype_is_c()) + { + unsigned int c = *(unsigned int*)(prs->wstr + prs->state->poschar); + + /* + * any non-ascii symbol with multibyte encoding + * with C-locale is an alpha character + */ + if ( c > 0x7f ) + return 1; + + return isalnum(0xff & c); + } + + return iswalnum( (wint_t)*( prs->wstr + prs->state->poschar)); + } + + return isalnum( *(unsigned char*)( prs->str + prs->state->posbyte )); + } + static int + p_isnotalnum(TParser *prs) + { + return !p_isalnum(prs); + } + + static int + p_isalpha(TParser *prs) + { + Assert( prs->state ); + + if (prs->usewide) + { + if (lc_ctype_is_c()) + { + unsigned int c = *(prs->wstr + prs->state->poschar); + + /* + * any non-ascii symbol with multibyte encoding + * with C-locale is an alpha character + */ + if ( c > 0x7f ) + return 1; + + return isalpha(0xff & c); + } + + return iswalpha( (wint_t)*( prs->wstr + prs->state->poschar)); + } + + return isalpha( *(unsigned char*)( prs->str + prs->state->posbyte )); + } + + static int + p_isnotalpha(TParser *prs) + { + return !p_isalpha(prs); + } /* p_iseq should be used only for ascii symbols */ *************** *** 111,128 **** Assert(prs->state); return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0; } #else /* TS_USE_WIDE */ ! #define p_iswhat(type) \ ! static int \ ! p_is##type(TParser *prs) { \ ! Assert( prs->state ); \ ! return is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ); \ ! } \ ! \ ! static int \ ! p_isnot##type(TParser *prs) { \ ! return !p_is##type(prs); \ } --- 182,200 ---- Assert(prs->state); return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0; } + #else /* TS_USE_WIDE */ ! #define p_iswhat(type) \ ! static int \ ! 
p_is##type(TParser *prs) { \ ! Assert( prs->state ); \ ! return is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ); \ ! } \ ! \ ! static int \ ! p_isnot##type(TParser *prs) { \ ! return !p_is##type(prs); \ } *************** *** 132,141 **** Assert(prs->state); return (*(prs->str + prs->state->posbyte) == c) ? 1 : 0; } - #endif /* TS_USE_WIDE */ p_iswhat(alnum) p_iswhat(alpha) p_iswhat(digit) p_iswhat(lower) p_iswhat(print) --- 204,215 ---- Assert(prs->state); return (*(prs->str + prs->state->posbyte) == c) ? 1 : 0; } p_iswhat(alnum) p_iswhat(alpha) + + #endif /* TS_USE_WIDE */ + p_iswhat(digit) p_iswhat(lower) p_iswhat(print)
> > Yeah, it's a workaround. Since there's no concept other than > > alpha/numeric/latin in tsearch2, Asian characters have to fall into > > one of them. > > Ok, I see. > > Pls, test attached patch - if it is good then I'll commit it on Monday to HEAD > and 8.2 branches. I have tested on a Linux box running PostgreSQL 8.2.1 (C locale, EUC_JP encoding), and it worked great! BTW, is your patch supposed to work with PostgreSQL 8.1? -- Tatsuo Ishii SRA OSS, Inc. Japan > PS. Magnus, may I ask you to test under Windows? Thank you. > > -- > Teodor Sigaev E-mail: teodor@sigaev.ru > WWW: http://www.sigaev.ru/