Thread: Request for review: tsearch2 patch

Request for review: tsearch2 patch

From
Tatsuo Ishii
Date:
Hi,

Here are patches against tsearch2 with CVS head.  Currently tsearch2
does not work with multibyte encoding which uses C locale. These
patches are intended to solve the problem by using PostgreSQL in-house
multibyte function instead of mbstowcs which does not work with C
locale. Also iswalpha etc. will not be called in case of C locale
since they are not working with it. Tested with the EUC_JP encoding
(should be working with any multibyte encodings). Existing single byte
encodings should not be broken by the patches, I did not test though.
--
Tatsuo Ishii
SRA OSS, Inc. Japan
Index: ts_locale.c
===================================================================
RCS file: /cvsroot/pgsql/contrib/tsearch2/ts_locale.c,v
retrieving revision 1.7
diff -c -r1.7 ts_locale.c
*** ts_locale.c    20 Nov 2006 14:03:30 -0000    1.7
--- ts_locale.c    1 Jan 2007 12:22:50 -0000
***************
*** 63,68 ****
--- 63,101 ----      return mbstowcs(to, from, len); }
+ 
+ #else    /* WIN32 */
+ 
+ size_t
+ char2wchar(wchar_t *to, const char *from, size_t len)
+ {
+     wchar_t *result;
+     size_t n;
+ 
+     if (to == NULL)
+         return 0;
+ 
+     if (lc_ctype_is_c)
+     {
+         /* allocate neccesary memory for "to" including NULL terminate */
+         result = (wchar_t *)palloc((len+1)*sizeof(wchar_t));
+ 
+         /* do the conversion */
+         n = (size_t)pg_mb2wchar_with_len(from, (pg_wchar *)result, len);
+         if (n > 0)
+         {
+             /* store the result */
+             if (n > len)
+                 n = len;
+             memcpy(to, result, n*sizeof(wchar_t));
+             pfree(result);
+             *(to + n) = '\0';
+         }
+         return n;
+     }
+     return mbstowcs(to, from, len);
+ }
+  #endif   /* WIN32 */  int
***************
*** 70,75 ****
--- 103,113 ---- {     wchar_t        character; 
+     if (lc_ctype_is_c)
+     {
+         return isalpha(TOUCHAR(ptr));
+     }
+      char2wchar(&character, ptr, 1);      return iswalpha((wint_t) character);
***************
*** 80,85 ****
--- 118,128 ---- {     wchar_t        character; 
+     if (lc_ctype_is_c)
+     {
+         return isprint(TOUCHAR(ptr));
+     }
+      char2wchar(&character, ptr, 1);      return iswprint((wint_t) character);
***************
*** 126,132 ****         if ( wlen < 0 )             ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
!                      errmsg("transalation failed from server encoding to wchar_t")));          Assert(wlen<=len);
   wstr[wlen] = 0;
 
--- 169,175 ----         if ( wlen < 0 )             ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
!                      errmsg("translation failed from server encoding to wchar_t")));          Assert(wlen<=len);
  wstr[wlen] = 0;
 
***************
*** 152,158 ****         if ( wlen < 0 )             ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
!                      errmsg("transalation failed from wchar_t to server encoding %d", errno)));
Assert(wlen<=len);        out[wlen]='\0';     }
 
--- 195,201 ----         if ( wlen < 0 )             ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
!                      errmsg("translation failed from wchar_t to server encoding %d", errno)));
Assert(wlen<=len);        out[wlen]='\0';     }
 
Index: ts_locale.h
===================================================================
RCS file: /cvsroot/pgsql/contrib/tsearch2/ts_locale.h,v
retrieving revision 1.7
diff -c -r1.7 ts_locale.h
*** ts_locale.h    4 Oct 2006 00:29:47 -0000    1.7
--- ts_locale.h    1 Jan 2007 12:22:50 -0000
***************
*** 38,45 **** #else                            /* WIN32 */  /* correct mbstowcs */
- #define char2wchar mbstowcs #define wchar2char wcstombs #endif   /* WIN32 */  #define t_isdigit(x)    (
pg_mblen(x)==1&& isdigit( TOUCHAR(x) ) )
 
--- 38,46 ---- #else                            /* WIN32 */  /* correct mbstowcs */ #define wchar2char wcstombs
+ size_t        char2wchar(wchar_t *to, const char *from, size_t len);
+  #endif   /* WIN32 */  #define t_isdigit(x)    ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
***************
*** 54,59 ****
--- 55,61 ----  * t_iseq() should be called only for ASCII symbols  */ #define t_iseq(x,c) ( (pg_mblen(x)==1) ? (
TOUCHAR(x)== ((unsigned char)(c)) ) : false )
 
+ /*#define t_iseq(x,c) ( TOUCHAR(x) == ((unsigned char)(c)))*/  #define COPYCHAR(d,s)    do {                \     int
lll= pg_mblen( s );            \
 
Index: wordparser/parser.c
===================================================================
RCS file: /cvsroot/pgsql/contrib/tsearch2/wordparser/parser.c,v
retrieving revision 1.11
diff -c -r1.11 parser.c
*** wordparser/parser.c    4 Oct 2006 00:29:47 -0000    1.11
--- wordparser/parser.c    1 Jan 2007 12:22:51 -0000
***************
*** 44,52 ****      * Some operating systems fail with multi-byte encodings and a C locale.      * Also, for a C locale
thereis no need to process as multibyte. From      * backend/utils/adt/oracle_compat.c Teodor      */ 
 
!     if (prs->charmaxlen > 1 && !lc_ctype_is_c())     {         prs->usewide = true;         prs->wstr = (wchar_t *)
palloc(sizeof(wchar_t)* prs->lenstr);
 
--- 44,54 ----      * Some operating systems fail with multi-byte encodings and a C locale.      * Also, for a C locale
thereis no need to process as multibyte. From      * backend/utils/adt/oracle_compat.c Teodor
 
+      *
+      * This is wrong assumption. even if locale is C, multibyte is necceary.      */ 
!     if (prs->charmaxlen > 1)     {         prs->usewide = true;         prs->wstr = (wchar_t *)
palloc(sizeof(wchar_t)* prs->lenstr);
 
***************
*** 92,98 **** static int                                            \ p_is##type(TParser *prs) {
            \     Assert( prs->state );                                    \
 
!     return ( ( prs->usewide ) ? isw##type( (wint_t)*( prs->wstr + prs->state->poschar ) ) : \         is##type(
(unsignedchar)*( prs->str + prs->state->posbyte ) ) );        \ }    \
\
--- 94,102 ---- static int                                            \ p_is##type(TParser *prs) {
             \     Assert( prs->state );                                    \
 
!     return ( ( prs->usewide ) ? \
!              (lc_ctype_is_c? is##type( 0xff & *( prs->wstr + prs->state->poschar)): \
!               isw##type( (wint_t)*( prs->wstr + prs->state->poschar))): \         is##type( (unsigned char)*(
prs->str+ prs->state->posbyte ) ) );        \ }    \                                                 \
 
***************
*** 134,141 **** } #endif   /* TS_USE_WIDE */ 
! p_iswhat(alnum)
! p_iswhat(alpha) p_iswhat(digit) p_iswhat(lower) p_iswhat(print)
--- 138,197 ---- } #endif   /* TS_USE_WIDE */ 
! static int p_isalnum(TParser *prs) {
!     Assert( prs->state );
! 
!     if (prs->usewide)
!     {
!         unsigned int c;
! 
!         c = *(prs->wstr + prs->state->poschar);
! 
!         if (lc_ctype_is_c)
!         {
!             if (c > 0x7f)
!                 return 1;
!             return isalnum(0xff & c);
!         }
!         else
!             return iswalnum( (wint_t)*( prs->wstr + prs->state->poschar));
!     }
!     else
!         return isalnum( (unsigned char)*( prs->str + prs->state->posbyte ));
! }
! 
! static int    p_isnotalnum(TParser *prs)
! {
!     return !p_isalnum(prs);
! }
! 
! static int p_isalpha(TParser *prs) {
!     Assert( prs->state );
! 
!     if (prs->usewide)
!     {
!         unsigned int c;
! 
!         c = *(prs->wstr + prs->state->poschar);
! 
!         if (lc_ctype_is_c)
!         {
!             if (c > 0x7f)
!                 return 1;
!             return isalpha(0xff & c);
!         }
!         else
!             return iswalpha( (wint_t)*( prs->wstr + prs->state->poschar));
!     }
!     else
!         return isalpha( (unsigned char)*( prs->str + prs->state->posbyte ));
! }
! 
! static int    p_isnotalpha(TParser *prs)
! {
!     return !p_isalpha(prs);
! }
!  p_iswhat(digit) p_iswhat(lower) p_iswhat(print)

Re: Request for review: tsearch2 patch

From
Tatsuo Ishii
Date:
I have tested in a locale-enabled environment and found a bug. Included
is the new version of the patches.

Teodor, Oleg, what do you think about these patches?
If ok, shall I commit to CVS head?
--
Tatsuo Ishii
SRA OSS, Inc. Japan

> Hi,
> 
> Here are patches against tsearch2 with CVS head.  Currently tsearch2
> does not work with multibyte encoding which uses C locale. These
> patches are intended to solve the problem by using PostgreSQL in-house
> multibyte function instead of mbstowcs which does not work with C
> locale. Also iswalpha etc. will not be called in case of C locale
> since they are not working with it. Tested with the EUC_JP encoding
> (should be working with any multibyte encodings). Existing single byte
> encodings should not be broken by the patches, I did not test though.
> --
> Tatsuo Ishii
> SRA OSS, Inc. Japan
Index: ts_locale.c
===================================================================
RCS file: /cvsroot/pgsql/contrib/tsearch2/ts_locale.c,v
retrieving revision 1.7
diff -c -r1.7 ts_locale.c
*** ts_locale.c    20 Nov 2006 14:03:30 -0000    1.7
--- ts_locale.c    4 Jan 2007 12:16:00 -0000
***************
*** 63,68 ****
--- 63,101 ----      return mbstowcs(to, from, len); }
+ 
+ #else    /* WIN32 */
+ 
+ size_t
+ char2wchar(wchar_t *to, const char *from, size_t len)
+ {
+     wchar_t *result;
+     size_t n;
+ 
+     if (to == NULL)
+         return 0;
+ 
+     if (lc_ctype_is_c())
+     {
+         /* allocate neccesary memory for "to" including NULL terminate */
+         result = (wchar_t *)palloc((len+1)*sizeof(wchar_t));
+ 
+         /* do the conversion */
+         n = (size_t)pg_mb2wchar_with_len(from, (pg_wchar *)result, len);
+         if (n > 0)
+         {
+             /* store the result */
+             if (n > len)
+                 n = len;
+             memcpy(to, result, n*sizeof(wchar_t));
+             pfree(result);
+             *(to + n) = '\0';
+         }
+         return n;
+     }
+     return mbstowcs(to, from, len);
+ }
+  #endif   /* WIN32 */  int
***************
*** 70,75 ****
--- 103,113 ---- {     wchar_t        character; 
+     if (lc_ctype_is_c())
+     {
+         return isalpha(TOUCHAR(ptr));
+     }
+      char2wchar(&character, ptr, 1);      return iswalpha((wint_t) character);
***************
*** 80,85 ****
--- 118,128 ---- {     wchar_t        character; 
+     if (lc_ctype_is_c())
+     {
+         return isprint(TOUCHAR(ptr));
+     }
+      char2wchar(&character, ptr, 1);      return iswprint((wint_t) character);
***************
*** 126,132 ****         if ( wlen < 0 )             ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
!                      errmsg("transalation failed from server encoding to wchar_t")));          Assert(wlen<=len);
   wstr[wlen] = 0;
 
--- 169,175 ----         if ( wlen < 0 )             ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
!                      errmsg("translation failed from server encoding to wchar_t")));          Assert(wlen<=len);
  wstr[wlen] = 0;
 
***************
*** 152,158 ****         if ( wlen < 0 )             ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
!                      errmsg("transalation failed from wchar_t to server encoding %d", errno)));
Assert(wlen<=len);        out[wlen]='\0';     }
 
--- 195,201 ----         if ( wlen < 0 )             ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
!                      errmsg("translation failed from wchar_t to server encoding %d", errno)));
Assert(wlen<=len);        out[wlen]='\0';     }
 
Index: ts_locale.h
===================================================================
RCS file: /cvsroot/pgsql/contrib/tsearch2/ts_locale.h,v
retrieving revision 1.7
diff -c -r1.7 ts_locale.h
*** ts_locale.h    4 Oct 2006 00:29:47 -0000    1.7
--- ts_locale.h    4 Jan 2007 12:16:00 -0000
***************
*** 38,45 **** #else                            /* WIN32 */  /* correct mbstowcs */
- #define char2wchar mbstowcs #define wchar2char wcstombs #endif   /* WIN32 */  #define t_isdigit(x)    (
pg_mblen(x)==1&& isdigit( TOUCHAR(x) ) )
 
--- 38,46 ---- #else                            /* WIN32 */  /* correct mbstowcs */ #define wchar2char wcstombs
+ size_t        char2wchar(wchar_t *to, const char *from, size_t len);
+  #endif   /* WIN32 */  #define t_isdigit(x)    ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
***************
*** 54,59 ****
--- 55,61 ----  * t_iseq() should be called only for ASCII symbols  */ #define t_iseq(x,c) ( (pg_mblen(x)==1) ? (
TOUCHAR(x)== ((unsigned char)(c)) ) : false )
 
+ /*#define t_iseq(x,c) ( TOUCHAR(x) == ((unsigned char)(c)))*/  #define COPYCHAR(d,s)    do {                \     int
lll= pg_mblen( s );            \
 
Index: wordparser/parser.c
===================================================================
RCS file: /cvsroot/pgsql/contrib/tsearch2/wordparser/parser.c,v
retrieving revision 1.11
diff -c -r1.11 parser.c
*** wordparser/parser.c    4 Oct 2006 00:29:47 -0000    1.11
--- wordparser/parser.c    4 Jan 2007 12:16:01 -0000
***************
*** 44,52 ****      * Some operating systems fail with multi-byte encodings and a C locale.      * Also, for a C locale
thereis no need to process as multibyte. From      * backend/utils/adt/oracle_compat.c Teodor      */ 
 
!     if (prs->charmaxlen > 1 && !lc_ctype_is_c())     {         prs->usewide = true;         prs->wstr = (wchar_t *)
palloc(sizeof(wchar_t)* prs->lenstr);
 
--- 44,54 ----      * Some operating systems fail with multi-byte encodings and a C locale.      * Also, for a C locale
thereis no need to process as multibyte. From      * backend/utils/adt/oracle_compat.c Teodor
 
+      *
+      * This is wrong assumption. even if locale is C, multibyte is necceary.      */ 
!     if (prs->charmaxlen > 1)     {         prs->usewide = true;         prs->wstr = (wchar_t *)
palloc(sizeof(wchar_t)* prs->lenstr);
 
***************
*** 92,98 **** static int                                            \ p_is##type(TParser *prs) {
            \     Assert( prs->state );                                    \
 
!     return ( ( prs->usewide ) ? isw##type( (wint_t)*( prs->wstr + prs->state->poschar ) ) : \         is##type(
(unsignedchar)*( prs->str + prs->state->posbyte ) ) );        \ }    \
\
--- 94,102 ---- static int                                            \ p_is##type(TParser *prs) {
             \     Assert( prs->state );                                    \
 
!     return ( ( prs->usewide ) ? \
!              (lc_ctype_is_c()? is##type( 0xff & *( prs->wstr + prs->state->poschar)): \
!               isw##type( (wint_t)*( prs->wstr + prs->state->poschar))): \         is##type( (unsigned char)*(
prs->str+ prs->state->posbyte ) ) );        \ }    \                                                 \
 
***************
*** 134,141 **** } #endif   /* TS_USE_WIDE */ 
! p_iswhat(alnum)
! p_iswhat(alpha) p_iswhat(digit) p_iswhat(lower) p_iswhat(print)
--- 138,197 ---- } #endif   /* TS_USE_WIDE */ 
! static int p_isalnum(TParser *prs) {
!     Assert( prs->state );
! 
!     if (prs->usewide)
!     {
!         unsigned int c;
! 
!         c = *(prs->wstr + prs->state->poschar);
! 
!         if (lc_ctype_is_c())
!         {
!             if (c > 0x7f)
!                 return 1;
!             return isalnum(0xff & c);
!         }
!         else
!             return iswalnum( (wint_t)*( prs->wstr + prs->state->poschar));
!     }
!     else
!         return isalnum( (unsigned char)*( prs->str + prs->state->posbyte ));
! }
! 
! static int    p_isnotalnum(TParser *prs)
! {
!     return !p_isalnum(prs);
! }
! 
! static int p_isalpha(TParser *prs) {
!     Assert( prs->state );
! 
!     if (prs->usewide)
!     {
!         unsigned int c;
! 
!         c = *(prs->wstr + prs->state->poschar);
! 
!         if (lc_ctype_is_c())
!         {
!             if (c > 0x7f)
!                 return 1;
!             return isalpha(0xff & c);
!         }
!         else
!             return iswalpha( (wint_t)*( prs->wstr + prs->state->poschar));
!     }
!     else
!         return isalpha( (unsigned char)*( prs->str + prs->state->posbyte ));
! }
! 
! static int    p_isnotalpha(TParser *prs)
! {
!     return !p_isalpha(prs);
! }
!  p_iswhat(digit) p_iswhat(lower) p_iswhat(print)

Re: Request for review: tsearch2 patch

From
Teodor Sigaev
Date:
Sorry for the delay, I was on holiday :)

Did you test the patch on the Windows platform?

Tatsuo Ishii wrote:
> I have tested in a locale-enabled environment and found a bug. Included
> is the new version of patches. 
> 
> Teodor, Oleg, what do you think about these patches?
> If ok, shall I commit to CVS head?
> --
> Tatsuo Ishii
> SRA OSS, Inc. Japan
> 
>> Hi,
>>
>> Here are patches against tsearch2 with CVS head.  Currently tsearch2
>> does not work with multibyte encoding which uses C locale. These
>> patches are intended to solve the problem by using PostgreSQL in-house
>> multibyte function instead of mbstowcs which does not work with C
>> locale. Also iswalpha etc. will not be called in case of C locale
>> since they are not working with it. Tested with the EUC_JP encoding
>> (should be working with any multibyte encodings). Existing single byte
>> encodings should not be broken by the patches, I did not test though.
>> --
>> Tatsuo Ishii
>> SRA OSS, Inc. Japan
>>
>> ------------------------------------------------------------------------
>>
>> Index: ts_locale.c
>> ===================================================================
>> RCS file: /cvsroot/pgsql/contrib/tsearch2/ts_locale.c,v
>> retrieving revision 1.7
>> diff -c -r1.7 ts_locale.c
>> *** ts_locale.c    20 Nov 2006 14:03:30 -0000    1.7
>> --- ts_locale.c    4 Jan 2007 12:16:00 -0000
>> ***************
>> *** 63,68 ****
>> --- 63,101 ----
>>   
>>       return mbstowcs(to, from, len);
>>   }
>> + 
>> + #else    /* WIN32 */
>> + 
>> + size_t
>> + char2wchar(wchar_t *to, const char *from, size_t len)
>> + {
>> +     wchar_t *result;
>> +     size_t n;
>> + 
>> +     if (to == NULL)
>> +         return 0;
>> + 
>> +     if (lc_ctype_is_c())
>> +     {
>> +         /* allocate neccesary memory for "to" including NULL terminate */
>> +         result = (wchar_t *)palloc((len+1)*sizeof(wchar_t));
>> + 
>> +         /* do the conversion */
>> +         n = (size_t)pg_mb2wchar_with_len(from, (pg_wchar *)result, len);
>> +         if (n > 0)
>> +         {
>> +             /* store the result */
>> +             if (n > len)
>> +                 n = len;
>> +             memcpy(to, result, n*sizeof(wchar_t));
>> +             pfree(result);
>> +             *(to + n) = '\0';
>> +         }
>> +         return n;
>> +     }
>> +     return mbstowcs(to, from, len);
>> + }
>> + 
>>   #endif   /* WIN32 */
>>   
>>   int
>> ***************
>> *** 70,75 ****
>> --- 103,113 ----
>>   {
>>       wchar_t        character;
>>   
>> +     if (lc_ctype_is_c())
>> +     {
>> +         return isalpha(TOUCHAR(ptr));
>> +     }
>> + 
>>       char2wchar(&character, ptr, 1);
>>   
>>       return iswalpha((wint_t) character);
>> ***************
>> *** 80,85 ****
>> --- 118,128 ----
>>   {
>>       wchar_t        character;
>>   
>> +     if (lc_ctype_is_c())
>> +     {
>> +         return isprint(TOUCHAR(ptr));
>> +     }
>> + 
>>       char2wchar(&character, ptr, 1);
>>   
>>       return iswprint((wint_t) character);
>> ***************
>> *** 126,132 ****
>>           if ( wlen < 0 )
>>               ereport(ERROR,
>>                       (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
>> !                      errmsg("transalation failed from server encoding to wchar_t")));
>>   
>>           Assert(wlen<=len);
>>           wstr[wlen] = 0;
>> --- 169,175 ----
>>           if ( wlen < 0 )
>>               ereport(ERROR,
>>                       (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
>> !                      errmsg("translation failed from server encoding to wchar_t")));
>>   
>>           Assert(wlen<=len);
>>           wstr[wlen] = 0;
>> ***************
>> *** 152,158 ****
>>           if ( wlen < 0 )
>>               ereport(ERROR,
>>                       (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
>> !                      errmsg("transalation failed from wchar_t to server encoding %d", errno)));
>>           Assert(wlen<=len);
>>           out[wlen]='\0';
>>       }
>> --- 195,201 ----
>>           if ( wlen < 0 )
>>               ereport(ERROR,
>>                       (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
>> !                      errmsg("translation failed from wchar_t to server encoding %d", errno)));
>>           Assert(wlen<=len);
>>           out[wlen]='\0';
>>       }
>> Index: ts_locale.h
>> ===================================================================
>> RCS file: /cvsroot/pgsql/contrib/tsearch2/ts_locale.h,v
>> retrieving revision 1.7
>> diff -c -r1.7 ts_locale.h
>> *** ts_locale.h    4 Oct 2006 00:29:47 -0000    1.7
>> --- ts_locale.h    4 Jan 2007 12:16:00 -0000
>> ***************
>> *** 38,45 ****
>>   #else                            /* WIN32 */
>>   
>>   /* correct mbstowcs */
>> - #define char2wchar mbstowcs
>>   #define wchar2char wcstombs
>>   #endif   /* WIN32 */
>>   
>>   #define t_isdigit(x)    ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
>> --- 38,46 ----
>>   #else                            /* WIN32 */
>>   
>>   /* correct mbstowcs */
>>   #define wchar2char wcstombs
>> + size_t        char2wchar(wchar_t *to, const char *from, size_t len);
>> + 
>>   #endif   /* WIN32 */
>>   
>>   #define t_isdigit(x)    ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
>> ***************
>> *** 54,59 ****
>> --- 55,61 ----
>>    * t_iseq() should be called only for ASCII symbols
>>    */
>>   #define t_iseq(x,c) ( (pg_mblen(x)==1) ? ( TOUCHAR(x) == ((unsigned char)(c)) ) : false )
>> + /*#define t_iseq(x,c) ( TOUCHAR(x) == ((unsigned char)(c)))*/
>>   
>>   #define COPYCHAR(d,s)    do {                \
>>       int lll = pg_mblen( s );            \
>> Index: wordparser/parser.c
>> ===================================================================
>> RCS file: /cvsroot/pgsql/contrib/tsearch2/wordparser/parser.c,v
>> retrieving revision 1.11
>> diff -c -r1.11 parser.c
>> *** wordparser/parser.c    4 Oct 2006 00:29:47 -0000    1.11
>> --- wordparser/parser.c    4 Jan 2007 12:16:01 -0000
>> ***************
>> *** 44,52 ****
>>        * Some operating systems fail with multi-byte encodings and a C locale.
>>        * Also, for a C locale there is no need to process as multibyte. From
>>        * backend/utils/adt/oracle_compat.c Teodor
>>        */
>>   
>> !     if (prs->charmaxlen > 1 && !lc_ctype_is_c())
>>       {
>>           prs->usewide = true;
>>           prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * prs->lenstr);
>> --- 44,54 ----
>>        * Some operating systems fail with multi-byte encodings and a C locale.
>>        * Also, for a C locale there is no need to process as multibyte. From
>>        * backend/utils/adt/oracle_compat.c Teodor
>> +      *
>> +      * This is wrong assumption. even if locale is C, multibyte is necceary.
>>        */
>>   
>> !     if (prs->charmaxlen > 1)
>>       {
>>           prs->usewide = true;
>>           prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * prs->lenstr);
>> ***************
>> *** 92,98 ****
>>   static int                                            \
>>   p_is##type(TParser *prs) {                                    \
>>       Assert( prs->state );                                    \
>> !     return ( ( prs->usewide ) ? isw##type( (wint_t)*( prs->wstr + prs->state->poschar ) ) : \
>>           is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ) );        \
>>   }    \
>>                                                   \
>> --- 94,102 ----
>>   static int                                            \
>>   p_is##type(TParser *prs) {                                    \
>>       Assert( prs->state );                                    \
>> !     return ( ( prs->usewide ) ? \
>> !              (lc_ctype_is_c()? is##type( 0xff & *( prs->wstr + prs->state->poschar)): \
>> !               isw##type( (wint_t)*( prs->wstr + prs->state->poschar))): \
>>           is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ) );        \
>>   }    \
>>                                                   \
>> ***************
>> *** 134,141 ****
>>   }
>>   #endif   /* TS_USE_WIDE */
>>   
>> ! p_iswhat(alnum)
>> ! p_iswhat(alpha)
>>   p_iswhat(digit)
>>   p_iswhat(lower)
>>   p_iswhat(print)
>> --- 138,197 ----
>>   }
>>   #endif   /* TS_USE_WIDE */
>>   
>> ! static int p_isalnum(TParser *prs) {
>> !     Assert( prs->state );
>> ! 
>> !     if (prs->usewide)
>> !     {
>> !         unsigned int c;
>> ! 
>> !         c = *(prs->wstr + prs->state->poschar);
>> ! 
>> !         if (lc_ctype_is_c())
>> !         {
>> !             if (c > 0x7f)
>> !                 return 1;
>> !             return isalnum(0xff & c);
>> !         }
>> !         else
>> !             return iswalnum( (wint_t)*( prs->wstr + prs->state->poschar));
>> !     }
>> !     else
>> !         return isalnum( (unsigned char)*( prs->str + prs->state->posbyte ));
>> ! }
>> ! 
>> ! static int    p_isnotalnum(TParser *prs)
>> ! {
>> !     return !p_isalnum(prs);
>> ! }
>> !
>> ! static int p_isalpha(TParser *prs) {
>> !     Assert( prs->state );
>> ! 
>> !     if (prs->usewide)
>> !     {
>> !         unsigned int c;
>> ! 
>> !         c = *(prs->wstr + prs->state->poschar);
>> ! 
>> !         if (lc_ctype_is_c())
>> !         {
>> !             if (c > 0x7f)
>> !                 return 1;
>> !             return isalpha(0xff & c);
>> !         }
>> !         else
>> !             return iswalpha( (wint_t)*( prs->wstr + prs->state->poschar));
>> !     }
>> !     else
>> !         return isalpha( (unsigned char)*( prs->str + prs->state->posbyte ));
>> ! }
>> ! 
>> ! static int    p_isnotalpha(TParser *prs)
>> ! {
>> !     return !p_isalpha(prs);
>> ! }
>> ! 
>>   p_iswhat(digit)
>>   p_iswhat(lower)
>>   p_iswhat(print)
>>
>> ------------------------------------------------------------------------
>>
>>
>> ---------------------------(end of broadcast)---------------------------
>> TIP 9: In versions below 8.0, the planner will ignore your desire to
>>        choose an index scan if your joining column's datatypes do not
>>        match

-- 
Teodor Sigaev                                   E-mail: teodor@sigaev.ru
  WWW: http://www.sigaev.ru/
 


Re: Request for review: tsearch2 patch

From
Tatsuo Ishii
Date:
> Sorry for the delay, I was on holiday :)
> 
> Did you test the patch on the Windows platform?

No. I myself do not use the Windows platform.

Do you have any concerns about Windows regarding my patches?
--
Tatsuo Ishii
SRA OSS, Inc. Japan

> Tatsuo Ishii wrote:
> > I have tested in a locale-enabled environment and found a bug. Included
> > is the new version of patches. 
> > 
> > Teodor, Oleg, what do you think about these patches?
> > If ok, shall I commit to CVS head?
> > --
> > Tatsuo Ishii
> > SRA OSS, Inc. Japan
> > 
> >> Hi,
> >>
> >> Here are patches against tsearch2 with CVS head.  Currently tsearch2
> >> does not work with multibyte encoding which uses C locale. These
> >> patches are intended to solve the problem by using PostgreSQL in-house
> >> multibyte function instead of mbstowcs which does not work with C
> >> locale. Also iswalpha etc. will not be called in case of C locale
> >> since they are not working with it. Tested with the EUC_JP encoding
> >> (should be working with any multibyte encodings). Existing single byte
> >> encodings should not be broken by the patches, I did not test though.
> >> --
> >> Tatsuo Ishii
> >> SRA OSS, Inc. Japan
> >>
> >> ------------------------------------------------------------------------
> >>
> >> Index: ts_locale.c
> >> ===================================================================
> >> RCS file: /cvsroot/pgsql/contrib/tsearch2/ts_locale.c,v
> >> retrieving revision 1.7
> >> diff -c -r1.7 ts_locale.c
> >> *** ts_locale.c    20 Nov 2006 14:03:30 -0000    1.7
> >> --- ts_locale.c    4 Jan 2007 12:16:00 -0000
> >> ***************
> >> *** 63,68 ****
> >> --- 63,101 ----
> >>   
> >>       return mbstowcs(to, from, len);
> >>   }
> >> + 
> >> + #else    /* WIN32 */
> >> + 
> >> + size_t
> >> + char2wchar(wchar_t *to, const char *from, size_t len)
> >> + {
> >> +     wchar_t *result;
> >> +     size_t n;
> >> + 
> >> +     if (to == NULL)
> >> +         return 0;
> >> + 
> >> +     if (lc_ctype_is_c())
> >> +     {
> >> +         /* allocate neccesary memory for "to" including NULL terminate */
> >> +         result = (wchar_t *)palloc((len+1)*sizeof(wchar_t));
> >> + 
> >> +         /* do the conversion */
> >> +         n = (size_t)pg_mb2wchar_with_len(from, (pg_wchar *)result, len);
> >> +         if (n > 0)
> >> +         {
> >> +             /* store the result */
> >> +             if (n > len)
> >> +                 n = len;
> >> +             memcpy(to, result, n*sizeof(wchar_t));
> >> +             pfree(result);
> >> +             *(to + n) = '\0';
> >> +         }
> >> +         return n;
> >> +     }
> >> +     return mbstowcs(to, from, len);
> >> + }
> >> + 
> >>   #endif   /* WIN32 */
> >>   
> >>   int
> >> ***************
> >> *** 70,75 ****
> >> --- 103,113 ----
> >>   {
> >>       wchar_t        character;
> >>   
> >> +     if (lc_ctype_is_c())
> >> +     {
> >> +         return isalpha(TOUCHAR(ptr));
> >> +     }
> >> + 
> >>       char2wchar(&character, ptr, 1);
> >>   
> >>       return iswalpha((wint_t) character);
> >> ***************
> >> *** 80,85 ****
> >> --- 118,128 ----
> >>   {
> >>       wchar_t        character;
> >>   
> >> +     if (lc_ctype_is_c())
> >> +     {
> >> +         return isprint(TOUCHAR(ptr));
> >> +     }
> >> + 
> >>       char2wchar(&character, ptr, 1);
> >>   
> >>       return iswprint((wint_t) character);
> >> ***************
> >> *** 126,132 ****
> >>           if ( wlen < 0 )
> >>               ereport(ERROR,
> >>                       (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
> >> !                      errmsg("transalation failed from server encoding to wchar_t")));
> >>   
> >>           Assert(wlen<=len);
> >>           wstr[wlen] = 0;
> >> --- 169,175 ----
> >>           if ( wlen < 0 )
> >>               ereport(ERROR,
> >>                       (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
> >> !                      errmsg("translation failed from server encoding to wchar_t")));
> >>   
> >>           Assert(wlen<=len);
> >>           wstr[wlen] = 0;
> >> ***************
> >> *** 152,158 ****
> >>           if ( wlen < 0 )
> >>               ereport(ERROR,
> >>                       (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
> >> !                      errmsg("transalation failed from wchar_t to server encoding %d", errno)));
> >>           Assert(wlen<=len);
> >>           out[wlen]='\0';
> >>       }
> >> --- 195,201 ----
> >>           if ( wlen < 0 )
> >>               ereport(ERROR,
> >>                       (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
> >> !                      errmsg("translation failed from wchar_t to server encoding %d", errno)));
> >>           Assert(wlen<=len);
> >>           out[wlen]='\0';
> >>       }
> >> Index: ts_locale.h
> >> ===================================================================
> >> RCS file: /cvsroot/pgsql/contrib/tsearch2/ts_locale.h,v
> >> retrieving revision 1.7
> >> diff -c -r1.7 ts_locale.h
> >> *** ts_locale.h    4 Oct 2006 00:29:47 -0000    1.7
> >> --- ts_locale.h    4 Jan 2007 12:16:00 -0000
> >> ***************
> >> *** 38,45 ****
> >>   #else                            /* WIN32 */
> >>   
> >>   /* correct mbstowcs */
> >> - #define char2wchar mbstowcs
> >>   #define wchar2char wcstombs
> >>   #endif   /* WIN32 */
> >>   
> >>   #define t_isdigit(x)    ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
> >> --- 38,46 ----
> >>   #else                            /* WIN32 */
> >>   
> >>   /* correct mbstowcs */
> >>   #define wchar2char wcstombs
> >> + size_t        char2wchar(wchar_t *to, const char *from, size_t len);
> >> + 
> >>   #endif   /* WIN32 */
> >>   
> >>   #define t_isdigit(x)    ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
> >> ***************
> >> *** 54,59 ****
> >> --- 55,61 ----
> >>    * t_iseq() should be called only for ASCII symbols
> >>    */
> >>   #define t_iseq(x,c) ( (pg_mblen(x)==1) ? ( TOUCHAR(x) == ((unsigned char)(c)) ) : false )
> >> + /*#define t_iseq(x,c) ( TOUCHAR(x) == ((unsigned char)(c)))*/
> >>   
> >>   #define COPYCHAR(d,s)    do {                \
> >>       int lll = pg_mblen( s );            \
> >> Index: wordparser/parser.c
> >> ===================================================================
> >> RCS file: /cvsroot/pgsql/contrib/tsearch2/wordparser/parser.c,v
> >> retrieving revision 1.11
> >> diff -c -r1.11 parser.c
> >> *** wordparser/parser.c    4 Oct 2006 00:29:47 -0000    1.11
> >> --- wordparser/parser.c    4 Jan 2007 12:16:01 -0000
> >> ***************
> >> *** 44,52 ****
> >>        * Some operating systems fail with multi-byte encodings and a C locale.
> >>        * Also, for a C locale there is no need to process as multibyte. From
> >>        * backend/utils/adt/oracle_compat.c Teodor
> >>        */
> >>   
> >> !     if (prs->charmaxlen > 1 && !lc_ctype_is_c())
> >>       {
> >>           prs->usewide = true;
> >>           prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * prs->lenstr);
> >> --- 44,54 ----
> >>        * Some operating systems fail with multi-byte encodings and a C locale.
> >>        * Also, for a C locale there is no need to process as multibyte. From
> >>        * backend/utils/adt/oracle_compat.c Teodor
> >> +      *
> >> +      * This is wrong assumption. even if locale is C, multibyte is necceary.
> >>        */
> >>   
> >> !     if (prs->charmaxlen > 1)
> >>       {
> >>           prs->usewide = true;
> >>           prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * prs->lenstr);
> >> ***************
> >> *** 92,98 ****
> >>   static int                                            \
> >>   p_is##type(TParser *prs) {                                    \
> >>       Assert( prs->state );                                    \
> >> !     return ( ( prs->usewide ) ? isw##type( (wint_t)*( prs->wstr + prs->state->poschar ) ) : \
> >>           is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ) );        \
> >>   }    \
> >>                                                   \
> >> --- 94,102 ----
> >>   static int                                            \
> >>   p_is##type(TParser *prs) {                                    \
> >>       Assert( prs->state );                                    \
> >> !     return ( ( prs->usewide ) ? \
> >> !              (lc_ctype_is_c()? is##type( 0xff & *( prs->wstr + prs->state->poschar)): \
> >> !               isw##type( (wint_t)*( prs->wstr + prs->state->poschar))): \
> >>           is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ) );        \
> >>   }    \
> >>                                                   \
> >> ***************
> >> *** 134,141 ****
> >>   }
> >>   #endif   /* TS_USE_WIDE */
> >>   
> >> ! p_iswhat(alnum)
> >> ! p_iswhat(alpha)
> >>   p_iswhat(digit)
> >>   p_iswhat(lower)
> >>   p_iswhat(print)
> >> --- 138,197 ----
> >>   }
> >>   #endif   /* TS_USE_WIDE */
> >>
> >> ! static int p_isalnum(TParser *prs) {
> >> !     Assert( prs->state );
> >> ! 
> >> !     if (prs->usewide)
> >> !     {
> >> !         unsigned int c;
> >> ! 
> >> !         c = *(prs->wstr + prs->state->poschar);
> >> ! 
> >> !         if (lc_ctype_is_c())
> >> !         {
> >> !             if (c > 0x7f)
> >> !                 return 1;
> >> !             return isalnum(0xff & c);
> >> !         }
> >> !         else
> >> !             return iswalnum( (wint_t)*( prs->wstr + prs->state->poschar));
> >> !     }
> >> !     else
> >> !         return isalnum( (unsigned char)*( prs->str + prs->state->posbyte ));
> >> ! }
> >> ! 
> >> ! static int    p_isnotalnum(TParser *prs)
> >> ! {
> >> !     return !p_isalnum(prs);
> >> ! }
> >> ! 
> >> ! static int p_isalpha(TParser *prs) {
> >> !     Assert( prs->state );
> >> ! 
> >> !     if (prs->usewide)
> >> !     {
> >> !         unsigned int c;
> >> ! 
> >> !         c = *(prs->wstr + prs->state->poschar);
> >> ! 
> >> !         if (lc_ctype_is_c())
> >> !         {
> >> !             if (c > 0x7f)
> >> !                 return 1;
> >> !             return isalpha(0xff & c);
> >> !         }
> >> !         else
> >> !             return iswalpha( (wint_t)*( prs->wstr + prs->state->poschar));
> >> !     }
> >> !     else
> >> !         return isalpha( (unsigned char)*( prs->str + prs->state->posbyte ));
> >> ! }
> >> ! 
> >> ! static int    p_isnotalpha(TParser *prs)
> >> ! {
> >> !     return !p_isalpha(prs);
> >> ! }
> >> ! 
> >>   p_iswhat(digit)
> >>   p_iswhat(lower)
> >>   p_iswhat(print)
> >>
> >> ------------------------------------------------------------------------
> >>
> >>
> >> ---------------------------(end of broadcast)---------------------------
> >> TIP 9: In versions below 8.0, the planner will ignore your desire to
> >>        choose an index scan if your joining column's datatypes do not
> >>        match
> 
> -- 
> Teodor Sigaev                                   E-mail: teodor@sigaev.ru
>                                                     WWW: http://www.sigaev.ru/
> 


Re: Request for review: tsearch2 patch

From
Teodor Sigaev
Date:
> I have tested with local-enabled environment and found a bug. Included
> is the new version of patches. 
Your patch causes crash on tsearch2's installcheck with 'initdb -E UTF8 --locale 
C', simple way to reproduce:
# select to_tsquery('default', '''New York''');
server closed the connection unexpectedly
         This probably means the server terminated abnormally
         before or while processing the request.
 
The connection to the server was lost. Attempting reset: Failed.


>> ! static int p_isalnum(TParser *prs) {
...
>> !         if (lc_ctype_is_c())
>> !         {
>> !             if (c > 0x7f)
>> !                 return 1;

I have some doubts that any character greater than 0x7f is an alpha symbol.
Is it simple assumption or workaround?

-- 
Teodor Sigaev                                   E-mail: teodor@sigaev.ru
  WWW: http://www.sigaev.ru/
 


Re: Request for review: tsearch2 patch

From
Tatsuo Ishii
Date:
From: Teodor Sigaev <teodor@sigaev.ru>
Subject: Re: [HACKERS] Request for review: tsearch2 patch
Date: Wed, 10 Jan 2007 18:50:44 +0300
Message-ID: <45A50B54.6090608@sigaev.ru>

> > I have tested with local-enabled environment and found a bug. Included
> > is the new version of patches. 
> Your patch causes crash on tsearch2's installcheck with 'initdb -E UTF8 --locale 
> C', simple way to reproduce:
> # select to_tsquery('default', '''New York''');
> server closed the connection unexpectedly
>          This probably means the server terminated abnormally
>          before or while processing the request.
> The connection to the server was lost. Attempting reset: Failed.

It seems it's a bug with original tsearch2. Here is the patches.

------------------------------------------------------------------
*** wordparser/parser.c~    2007-01-07 09:54:39.000000000 +0900
--- wordparser/parser.c    2007-01-11 10:33:41.000000000 +0900
***************
*** 51,57 ****
      if (prs->charmaxlen > 1)
      {
          prs->usewide = true;
!         prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * prs->lenstr);
          prs->lenwstr = char2wchar(prs->wstr, prs->str, prs->lenstr);
      }
      else
--- 51,57 ----
      if (prs->charmaxlen > 1)
      {
          prs->usewide = true;
!         prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr+1));
          prs->lenwstr = char2wchar(prs->wstr, prs->str, prs->lenstr);
      }
      else
 
------------------------------------------------------------------

> >> ! static int p_isalnum(TParser *prs) {
> ...
> >> !         if (lc_ctype_is_c())
> >> !         {
> >> !             if (c > 0x7f)
> >> !                 return 1;
> 
> I have some doubts that any character greater than 0x7f is an alpha symbol.
> Is it simple assumption or workaround?

Yeah, it's a workaround. Since there's no concept other than
alpha/numeric/latin in tsearch2, Asian characters have to fall into
one of them.
--
Tatsuo Ishii
SRA OSS, Inc. Japan


Re: Request for review: tsearch2 patch

From
Tatsuo Ishii
Date:
> > I have tested with local-enabled environment and found a bug. Included
> > is the new version of patches. 
> Your patch causes crash on tsearch2's installcheck with 'initdb -E UTF8 --locale 
> C', simple way to reproduce:
> # select to_tsquery('default', '''New York''');
> server closed the connection unexpectedly
>          This probably means the server terminated abnormally
>          before or while processing the request.
> The connection to the server was lost. Attempting reset: Failed.

It seems it's a bug with original tsearch2. Here is the patches.

------------------------------------------------------------------
*** wordparser/parser.c~    2007-01-07 09:54:39.000000000 +0900
--- wordparser/parser.c    2007-01-11 10:33:41.000000000 +0900
***************
*** 51,57 ****
      if (prs->charmaxlen > 1)
      {
          prs->usewide = true;
!         prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * prs->lenstr);
          prs->lenwstr = char2wchar(prs->wstr, prs->str, prs->lenstr);
      }
      else
--- 51,57 ----
      if (prs->charmaxlen > 1)
      {
          prs->usewide = true;
!         prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr+1));
          prs->lenwstr = char2wchar(prs->wstr, prs->str, prs->lenstr);
      }
      else
 
------------------------------------------------------------------

> >> ! static int p_isalnum(TParser *prs) {
> ...
> >> !         if (lc_ctype_is_c())
> >> !         {
> >> !             if (c > 0x7f)
> >> !                 return 1;
> 
> I have some doubts that any character greater than 0x7f is an alpha symbol.
> Is it simple assumption or workaround?

Yeah, it's a workaround. Since there's no concept other than
alpha/numeric/latin in tsearch2, Asian characters have to fall into
one of them.
--
Tatsuo Ishii
SRA OSS, Inc. Japan


Re: Request for review: tsearch2 patch

From
Teodor Sigaev
Date:
> Yeah, it's a workaround. Since there's no concept other than
> alpha/numeric/latin in tsearch2, Asian characters have to be fall in
> one of them.

Ok, I see.

Please test the attached patch - if it is good then I'll commit it on Monday to HEAD
and 8.2 branches.

PS. Magnus, may I ask you to test under Windows? Thank you.

--
Teodor Sigaev                                   E-mail: teodor@sigaev.ru
                                                    WWW: http://www.sigaev.ru/
diff -c -r -N ../tsearch2.orig/ts_locale.c ./ts_locale.c
*** ../tsearch2.orig/ts_locale.c    Fri Jan 12 10:53:11 2007
--- ./ts_locale.c    Fri Jan 12 18:10:27 2007
***************
*** 12,24 ****
  size_t
  wchar2char(char *to, const wchar_t *from, size_t len)
  {
      if (GetDatabaseEncoding() == PG_UTF8)
      {
          int            r;

-         if (len == 0)
-             return 0;
-
          r = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, len,
                                  NULL, NULL);

--- 12,24 ----
  size_t
  wchar2char(char *to, const wchar_t *from, size_t len)
  {
+     if (len == 0)
+         return 0;
+
      if (GetDatabaseEncoding() == PG_UTF8)
      {
          int            r;

          r = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, len,
                                  NULL, NULL);

***************
*** 34,50 ****

      return wcstombs(to, from, len);
  }

  size_t
  char2wchar(wchar_t *to, const char *from, size_t len)
  {
      if (GetDatabaseEncoding() == PG_UTF8)
      {
          int            r;

-         if (len == 0)
-             return 0;
-
          r = MultiByteToWideChar(CP_UTF8, 0, from, len, to, len);

          if (!r)
--- 34,52 ----

      return wcstombs(to, from, len);
  }
+ #endif   /* WIN32 */

  size_t
  char2wchar(wchar_t *to, const char *from, size_t len)
  {
+     if (len == 0)
+         return 0;
+
+ #ifdef WIN32
      if (GetDatabaseEncoding() == PG_UTF8)
      {
          int            r;

          r = MultiByteToWideChar(CP_UTF8, 0, from, len, to, len);

          if (!r)
***************
*** 60,88 ****

          return r;
      }

      return mbstowcs(to, from, len);
  }
- #endif   /* WIN32 */

  int
  _t_isalpha(const char *ptr)
  {
!     wchar_t        character;

!     char2wchar(&character, ptr, 1);

!     return iswalpha((wint_t) character);
  }

  int
  _t_isprint(const char *ptr)
  {
!     wchar_t        character;

!     char2wchar(&character, ptr, 1);

!     return iswprint((wint_t) character);
  }
  #endif   /* TS_USE_WIDE */

--- 62,105 ----

          return r;
      }
+     else
+ #endif /* WIN32 */
+     if ( lc_ctype_is_c() )
+     {
+         /*
+          * pg_mb2wchar_with_len always adds trailing '\0', so
+          * 'to' should be allocated with sufficient space
+          */
+         return pg_mb2wchar_with_len(from, (pg_wchar *)to, len);
+     }

      return mbstowcs(to, from, len);
  }

  int
  _t_isalpha(const char *ptr)
  {
!     wchar_t        character[2];
!
!     if (lc_ctype_is_c())
!         return isalpha(TOUCHAR(ptr));

!     char2wchar(character, ptr, 1);

!     return iswalpha((wint_t) *character);
  }

  int
  _t_isprint(const char *ptr)
  {
!     wchar_t        character[2];
!
!     if (lc_ctype_is_c())
!         return isprint(TOUCHAR(ptr));

!     char2wchar(character, ptr, 1);

!     return iswprint((wint_t) *character);
  }
  #endif   /* TS_USE_WIDE */

***************
*** 126,132 ****
          if ( wlen < 0 )
              ereport(ERROR,
                      (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
!                      errmsg("transalation failed from server encoding to wchar_t")));

          Assert(wlen<=len);
          wstr[wlen] = 0;
--- 143,149 ----
          if ( wlen < 0 )
              ereport(ERROR,
                      (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
!                      errmsg("translation failed from server encoding to wchar_t")));

          Assert(wlen<=len);
          wstr[wlen] = 0;
***************
*** 152,158 ****
          if ( wlen < 0 )
              ereport(ERROR,
                      (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
!                      errmsg("transalation failed from wchar_t to server encoding %d", errno)));
          Assert(wlen<=len);
          out[wlen]='\0';
      }
--- 169,175 ----
          if ( wlen < 0 )
              ereport(ERROR,
                      (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
!                      errmsg("translation failed from wchar_t to server encoding %d", errno)));
          Assert(wlen<=len);
          out[wlen]='\0';
      }
diff -c -r -N ../tsearch2.orig/ts_locale.h ./ts_locale.h
*** ../tsearch2.orig/ts_locale.h    Fri Jan 12 10:53:11 2007
--- ./ts_locale.h    Fri Jan 12 18:10:19 2007
***************
*** 30,45 ****
  #define TOUCHAR(x)    (*((unsigned char*)(x)))

  #ifdef TS_USE_WIDE

  #ifdef WIN32

  size_t        wchar2char(char *to, const wchar_t *from, size_t len);
! size_t        char2wchar(wchar_t *to, const char *from, size_t len);
  #else                            /* WIN32 */

! /* correct mbstowcs */
! #define char2wchar mbstowcs
  #define wchar2char wcstombs
  #endif   /* WIN32 */

  #define t_isdigit(x)    ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
--- 30,46 ----
  #define TOUCHAR(x)    (*((unsigned char*)(x)))

  #ifdef TS_USE_WIDE
+ size_t        char2wchar(wchar_t *to, const char *from, size_t len);

  #ifdef WIN32

  size_t        wchar2char(char *to, const wchar_t *from, size_t len);
!
  #else                            /* WIN32 */

! /* correct wcstombs */
  #define wchar2char wcstombs
+
  #endif   /* WIN32 */

  #define t_isdigit(x)    ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
***************
*** 55,64 ****
   */
  #define t_iseq(x,c) ( (pg_mblen(x)==1) ? ( TOUCHAR(x) == ((unsigned char)(c)) ) : false )

! #define COPYCHAR(d,s)    do {                \
!     int lll = pg_mblen( s );            \
!                             \
!     while( lll-- )                    \
          TOUCHAR((d)+lll) = TOUCHAR((s)+lll);    \
  } while(0)

--- 56,65 ----
   */
  #define t_iseq(x,c) ( (pg_mblen(x)==1) ? ( TOUCHAR(x) == ((unsigned char)(c)) ) : false )

! #define COPYCHAR(d,s)    do {                    \
!     int lll = pg_mblen( s );                    \
!                                                 \
!     while( lll-- )                                \
          TOUCHAR((d)+lll) = TOUCHAR((s)+lll);    \
  } while(0)

diff -c -r -N ../tsearch2.orig/tsearch2.patch ./tsearch2.patch
*** ../tsearch2.orig/tsearch2.patch    Thu Jan  1 03:00:00 1970
--- ./tsearch2.patch    Fri Jan 12 18:12:30 2007
***************
*** 0 ****
--- 1,243 ----
+ diff -c -r -N ../tsearch2.orig/ts_locale.c ./ts_locale.c
+ *** ../tsearch2.orig/ts_locale.c    Fri Jan 12 10:53:11 2007
+ --- ./ts_locale.c    Fri Jan 12 18:10:27 2007
+ ***************
+ *** 12,24 ****
+   size_t
+   wchar2char(char *to, const wchar_t *from, size_t len)
+   {
+       if (GetDatabaseEncoding() == PG_UTF8)
+       {
+           int            r;
+
+ -         if (len == 0)
+ -             return 0;
+ -
+           r = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, len,
+                                   NULL, NULL);
+
+ --- 12,24 ----
+   size_t
+   wchar2char(char *to, const wchar_t *from, size_t len)
+   {
+ +     if (len == 0)
+ +         return 0;
+ +
+       if (GetDatabaseEncoding() == PG_UTF8)
+       {
+           int            r;
+
+           r = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, len,
+                                   NULL, NULL);
+
+ ***************
+ *** 34,50 ****
+
+       return wcstombs(to, from, len);
+   }
+
+   size_t
+   char2wchar(wchar_t *to, const char *from, size_t len)
+   {
+       if (GetDatabaseEncoding() == PG_UTF8)
+       {
+           int            r;
+
+ -         if (len == 0)
+ -             return 0;
+ -
+           r = MultiByteToWideChar(CP_UTF8, 0, from, len, to, len);
+
+           if (!r)
+ --- 34,52 ----
+
+       return wcstombs(to, from, len);
+   }
+ + #endif   /* WIN32 */
+
+   size_t
+   char2wchar(wchar_t *to, const char *from, size_t len)
+   {
+ +     if (len == 0)
+ +         return 0;
+ +
+ + #ifdef WIN32
+       if (GetDatabaseEncoding() == PG_UTF8)
+       {
+           int            r;
+
+           r = MultiByteToWideChar(CP_UTF8, 0, from, len, to, len);
+
+           if (!r)
+ ***************
+ *** 60,88 ****
+
+           return r;
+       }
+
+       return mbstowcs(to, from, len);
+   }
+ - #endif   /* WIN32 */
+
+   int
+   _t_isalpha(const char *ptr)
+   {
+ !     wchar_t        character;
+
+ !     char2wchar(&character, ptr, 1);
+
+ !     return iswalpha((wint_t) character);
+   }
+
+   int
+   _t_isprint(const char *ptr)
+   {
+ !     wchar_t        character;
+
+ !     char2wchar(&character, ptr, 1);
+
+ !     return iswprint((wint_t) character);
+   }
+   #endif   /* TS_USE_WIDE */
+
+ --- 62,105 ----
+
+           return r;
+       }
+ +     else
+ + #endif /* WIN32 */
+ +     if ( lc_ctype_is_c() )
+ +     {
+ +         /*
+ +          * pg_mb2wchar_with_len always adds trailing '\0', so
+ +          * 'to' should be allocated with sufficient space
+ +          */
+ +         return pg_mb2wchar_with_len(from, (pg_wchar *)to, len);
+ +     }
+
+       return mbstowcs(to, from, len);
+   }
+
+   int
+   _t_isalpha(const char *ptr)
+   {
+ !     wchar_t        character[2];
+ !
+ !     if (lc_ctype_is_c())
+ !         return isalpha(TOUCHAR(ptr));
+
+ !     char2wchar(character, ptr, 1);
+
+ !     return iswalpha((wint_t) *character);
+   }
+
+   int
+   _t_isprint(const char *ptr)
+   {
+ !     wchar_t        character[2];
+ !
+ !     if (lc_ctype_is_c())
+ !         return isprint(TOUCHAR(ptr));
+
+ !     char2wchar(character, ptr, 1);
+
+ !     return iswprint((wint_t) *character);
+   }
+   #endif   /* TS_USE_WIDE */
+
+ ***************
+ *** 126,132 ****
+           if ( wlen < 0 )
+               ereport(ERROR,
+                       (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+ !                      errmsg("transalation failed from server encoding to wchar_t")));
+
+           Assert(wlen<=len);
+           wstr[wlen] = 0;
+ --- 143,149 ----
+           if ( wlen < 0 )
+               ereport(ERROR,
+                       (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+ !                      errmsg("translation failed from server encoding to wchar_t")));
+
+           Assert(wlen<=len);
+           wstr[wlen] = 0;
+ ***************
+ *** 152,158 ****
+           if ( wlen < 0 )
+               ereport(ERROR,
+                       (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+ !                      errmsg("transalation failed from wchar_t to server encoding %d", errno)));
+           Assert(wlen<=len);
+           out[wlen]='\0';
+       }
+ --- 169,175 ----
+           if ( wlen < 0 )
+               ereport(ERROR,
+                       (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+ !                      errmsg("translation failed from wchar_t to server encoding %d", errno)));
+           Assert(wlen<=len);
+           out[wlen]='\0';
+       }
+ diff -c -r -N ../tsearch2.orig/ts_locale.h ./ts_locale.h
+ *** ../tsearch2.orig/ts_locale.h    Fri Jan 12 10:53:11 2007
+ --- ./ts_locale.h    Fri Jan 12 18:10:19 2007
+ ***************
+ *** 30,45 ****
+   #define TOUCHAR(x)    (*((unsigned char*)(x)))
+
+   #ifdef TS_USE_WIDE
+
+   #ifdef WIN32
+
+   size_t        wchar2char(char *to, const wchar_t *from, size_t len);
+ ! size_t        char2wchar(wchar_t *to, const char *from, size_t len);
+   #else                            /* WIN32 */
+
+ ! /* correct mbstowcs */
+ ! #define char2wchar mbstowcs
+   #define wchar2char wcstombs
+   #endif   /* WIN32 */
+
+   #define t_isdigit(x)    ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
+ --- 30,46 ----
+   #define TOUCHAR(x)    (*((unsigned char*)(x)))
+
+   #ifdef TS_USE_WIDE
+ + size_t        char2wchar(wchar_t *to, const char *from, size_t len);
+
+   #ifdef WIN32
+
+   size_t        wchar2char(char *to, const wchar_t *from, size_t len);
+ !
+   #else                            /* WIN32 */
+
+ ! /* correct wcstombs */
+   #define wchar2char wcstombs
+ +
+   #endif   /* WIN32 */
+
+   #define t_isdigit(x)    ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
+ ***************
+ *** 55,64 ****
+    */
+   #define t_iseq(x,c) ( (pg_mblen(x)==1) ? ( TOUCHAR(x) == ((unsigned char)(c)) ) : false )
+
+ ! #define COPYCHAR(d,s)    do {                \
+ !     int lll = pg_mblen( s );            \
+ !                             \
+ !     while( lll-- )                    \
+           TOUCHAR((d)+lll) = TOUCHAR((s)+lll);    \
+   } while(0)
+
+ --- 56,65 ----
+    */
+   #define t_iseq(x,c) ( (pg_mblen(x)==1) ? ( TOUCHAR(x) == ((unsigned char)(c)) ) : false )
+
+ ! #define COPYCHAR(d,s)    do {                    \
+ !     int lll = pg_mblen( s );                    \
+ !                                                 \
+ !     while( lll-- )                                \
+           TOUCHAR((d)+lll) = TOUCHAR((s)+lll);    \
+   } while(0)
+
diff -c -r -N ../tsearch2.orig/wordparser/parser.c ./wordparser/parser.c
*** ../tsearch2.orig/wordparser/parser.c    Fri Jan 12 10:53:11 2007
--- ./wordparser/parser.c    Fri Jan 12 18:10:38 2007
***************
*** 40,55 ****
  #ifdef TS_USE_WIDE

      /*
!      * Use wide char code only when max encoding length > 1 and ctype != C.
!      * Some operating systems fail with multi-byte encodings and a C locale.
!      * Also, for a C locale there is no need to process as multibyte. From
!      * backend/utils/adt/oracle_compat.c Teodor
       */

!     if (prs->charmaxlen > 1 && !lc_ctype_is_c())
      {
          prs->usewide = true;
!         prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * prs->lenstr);
          prs->lenwstr = char2wchar(prs->wstr, prs->str, prs->lenstr);
      }
      else
--- 40,52 ----
  #ifdef TS_USE_WIDE

      /*
!      * Use wide char code only when max encoding length > 1.
       */

!     if (prs->charmaxlen > 1)
      {
          prs->usewide = true;
!         prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr+1));
          prs->lenwstr = char2wchar(prs->wstr, prs->str, prs->lenstr);
      }
      else
***************
*** 83,107 ****

  /*
   * defining support function, equvalent is* macroses, but
!  * working with any possible encodings and locales
   */

  #ifdef TS_USE_WIDE

! #define p_iswhat(type)                                        \
! static int                                            \
! p_is##type(TParser *prs) {                                    \
!     Assert( prs->state );                                    \
!     return ( ( prs->usewide ) ? isw##type( (wint_t)*( prs->wstr + prs->state->poschar ) ) : \
!         is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ) );        \
! }    \
!                                                 \
! static int                                            \
! p_isnot##type(TParser *prs) {                                    \
!     return !p_is##type(prs);                                \
  }



  /* p_iseq should be used only for ascii symbols */

--- 80,178 ----

  /*
   * defining support function, equvalent is* macroses, but
!  * working with any possible encodings and locales. Note,
!  * that with multibyte encoding and C-locale isw* function may fail
!  * or give wrong result. Note 2: multibyte encoding and C-locale
!  * often are used for Asian languages.
   */

  #ifdef TS_USE_WIDE

! #define p_iswhat(type)                                                        \
! static int                                                                    \
! p_is##type(TParser *prs) {                                                    \
!     Assert( prs->state );                                                    \
!     if ( prs->usewide )                                                        \
!     {                                                                        \
!         if ( lc_ctype_is_c() )                                                \
!             return is##type( 0xff & *( prs->wstr + prs->state->poschar) );    \
!                                                                             \
!         return isw##type( *(wint_t*)( prs->wstr + prs->state->poschar ) );    \
!     }                                                                        \
!                                                                             \
!     return is##type( *(unsigned char*)( prs->str + prs->state->posbyte ) );    \
! }                                                                            \
!                                                                             \
! static int                                                                    \
! p_isnot##type(TParser *prs) {                                                \
!     return !p_is##type(prs);                                                \
  }

+ static int
+ p_isalnum(TParser *prs)
+ {
+     Assert( prs->state );
+
+     if (prs->usewide)
+     {
+         if (lc_ctype_is_c())
+         {
+             unsigned int c = *(unsigned int*)(prs->wstr + prs->state->poschar);
+
+             /*
+              * any non-ascii symbol with multibyte encoding
+              * with C-locale is an alpha character
+              */
+             if ( c > 0x7f )
+                 return 1;
+
+             return isalnum(0xff & c);
+         }
+
+         return iswalnum( (wint_t)*( prs->wstr + prs->state->poschar));
+     }
+
+     return isalnum( *(unsigned char*)( prs->str + prs->state->posbyte ));
+ }

+ static int
+ p_isnotalnum(TParser *prs)
+ {
+     return !p_isalnum(prs);
+ }
+
+ static int
+ p_isalpha(TParser *prs)
+ {
+     Assert( prs->state );
+
+     if (prs->usewide)
+     {
+         if (lc_ctype_is_c())
+         {
+             unsigned int c = *(prs->wstr + prs->state->poschar);
+
+             /*
+              * any non-ascii symbol with multibyte encoding
+              * with C-locale is an alpha character
+              */
+             if ( c > 0x7f )
+                 return 1;
+
+             return isalpha(0xff & c);
+         }
+
+         return iswalpha( (wint_t)*( prs->wstr + prs->state->poschar));
+     }
+
+     return isalpha( *(unsigned char*)( prs->str + prs->state->posbyte ));
+ }
+
+ static int
+ p_isnotalpha(TParser *prs)
+ {
+     return !p_isalpha(prs);
+ }

  /* p_iseq should be used only for ascii symbols */

***************
*** 111,128 ****
      Assert(prs->state);
      return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0;
  }
  #else                            /* TS_USE_WIDE */

! #define p_iswhat(type)                                        \
! static int                                            \
! p_is##type(TParser *prs) {                                    \
!     Assert( prs->state );                                    \
!     return is##type( (unsigned char)*( prs->str + prs->state->posbyte ) );            \
! }    \
!                                                 \
! static int                                            \
! p_isnot##type(TParser *prs) {                                    \
!     return !p_is##type(prs);                                \
  }


--- 182,200 ----
      Assert(prs->state);
      return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0;
  }
+
  #else                            /* TS_USE_WIDE */

! #define p_iswhat(type)                                                        \
! static int                                                                    \
! p_is##type(TParser *prs) {                                                    \
!     Assert( prs->state );                                                    \
!     return is##type( (unsigned char)*( prs->str + prs->state->posbyte ) );    \
! }                                                                            \
!                                                                             \
! static int                                                                    \
! p_isnot##type(TParser *prs) {                                                \
!     return !p_is##type(prs);                                                \
  }


***************
*** 132,141 ****
      Assert(prs->state);
      return (*(prs->str + prs->state->posbyte) == c) ? 1 : 0;
  }
- #endif   /* TS_USE_WIDE */

  p_iswhat(alnum)
  p_iswhat(alpha)
  p_iswhat(digit)
  p_iswhat(lower)
  p_iswhat(print)
--- 204,215 ----
      Assert(prs->state);
      return (*(prs->str + prs->state->posbyte) == c) ? 1 : 0;
  }

  p_iswhat(alnum)
  p_iswhat(alpha)
+
+ #endif   /* TS_USE_WIDE */
+
  p_iswhat(digit)
  p_iswhat(lower)
  p_iswhat(print)

Re: Request for review: tsearch2 patch

From
Tatsuo Ishii
Date:
> > Yeah, it's a workaround. Since there's no concept other than
> > alpha/numeric/latin in tsearch2, Asian characters have to be fall in
> > one of them.
> 
> Ok, I see.
> 
> Pls, test attached patch - if it is good then I'll commit it at Monday to HEAD 
> and 8.2 branches.

I have tested on a Linux box running PostgreSQL 8.2.1 (C locale,
EUC_JP encoding), and it worked great!

BTW, is your patch supposed to work with PostgreSQL 8.1?
--
Tatsuo Ishii
SRA OSS, Inc. Japan

> PS. Magnus, may I ask you to test under Windows? Thank you.
> 
> -- 
> Teodor Sigaev                                   E-mail: teodor@sigaev.ru
>                                                     WWW: http://www.sigaev.ru/