Locale bug - Mailing list pgsql-ports

From Andriy I Pilipenko
Subject Locale bug
Date
Msg-id Pine.BSF.4.21.0004191118560.87161-100000@bamby.marka.net.ua
Whole thread Raw
Responses Re: Locale bug
Re: Locale bug
Re: Locale bug
List pgsql-ports
============================================================================
                        POSTGRESQL BUG REPORT TEMPLATE
============================================================================


Your name        :    Andriy I Pilipenko
Your email address    :    bamby@marka.net.ua


System Configuration
---------------------
  Architecture (example: Intel Pentium)      :  Intel Pentium

  Operating System (example: Linux 2.0.26 ELF)     :  FreeBSD 3.x, 4.0

  PostgreSQL version (example: PostgreSQL-6.5.1):  PostgreSQL-6.5.3,
                                                   PostgreSQL-7.0.beta5

  Compiler used (example:  gcc 2.8.0)        :  gcc 2.7.2.2, gcc 2.9.5


Please enter a FULL description of your problem:
------------------------------------------------

There is a bug in PostgreSQL that appears at least on FreeBSD. If Postgres is
configured with locale support but without multibyte support, one cannot
perform case-insensitive searches using national language characters.
The problem comes from the declaration of pg_wchar as char in non-multibyte
mode. Character values above 127 are treated as negative values, and this
results in improper return values from functions such as isalpha(), isupper(),
etc. Declaring pg_wchar as unsigned char eliminates this problem.

This problem does not exist on Linux. On that system, functions like isalpha(),
isupper(), etc. successfully accept negative values as well as their
positive counterparts.


Please describe a way to repeat the problem.   Please try to provide a
concise reproducible example, if at all possible:
----------------------------------------------------------------------

Compile and install postgres with locale support enabled and multibyte
support disabled on FreeBSD. Create a table with a field of some character
type. Put into the table a couple of records containing some character with
a code above 127, in lower and upper case. Try a query like this:

  SELECT * FROM table WHERE field ~* '<the_character>'

where <the_character> is the mentioned character. You will receive only
one record: the one whose character is exactly the same as in the query.


If you know how this problem might be fixed, list the solution below:
---------------------------------------------------------------------

Here is the patch. I tried it on FreeBSD and Linux with success. This
patch applies to PostgreSQL 6.5.3 and 7.0.beta5.


Index: postgres/src/backend/regex/engine.c
diff -c postgres/src/backend/regex/engine.c:1.1.1.1 postgres/src/backend/regex/engine.c:1.2
*** postgres/src/backend/regex/engine.c:1.1.1.1    Tue Apr 18 21:45:09 2000
--- postgres/src/backend/regex/engine.c    Wed Apr 19 09:46:38 2000
***************
*** 123,130 ****
  #define NONCHAR(c)      ((c) > 16777216)    /* 16777216 == 2^24 == 3 bytes */
  #define NNONCHAR  (CODEMAX-16777216)
  #else
! #define NONCHAR(c)          ((c) > CHAR_MAX)
! #define NNONCHAR      (CODEMAX-CHAR_MAX)
  #endif

  #ifdef REDEBUG
--- 123,130 ----
  #define NONCHAR(c)      ((c) > 16777216)    /* 16777216 == 2^24 == 3 bytes */
  #define NNONCHAR  (CODEMAX-16777216)
  #else
! #define NONCHAR(c)          ((c) > UCHAR_MAX)
! #define NNONCHAR      (CODEMAX-UCHAR_MAX)
  #endif

  #ifdef REDEBUG
***************
*** 958,965 ****
   == #define        BOW        (BOL+4)
   == #define        EOW        (BOL+5)
   == #define        CODEMAX (BOL+5)            // highest code used
!  == #define        NONCHAR(c)        ((c) > CHAR_MAX)
!  == #define        NNONCHAR        (CODEMAX-CHAR_MAX)
   */
  static states
  step(g, start, stop, bef, ch, aft)
--- 958,965 ----
   == #define        BOW        (BOL+4)
   == #define        EOW        (BOL+5)
   == #define        CODEMAX (BOL+5)            // highest code used
!  == #define        NONCHAR(c)        ((c) > UCHAR_MAX)
!  == #define        NNONCHAR        (CODEMAX-UCHAR_MAX)
   */
  static states
  step(g, start, stop, bef, ch, aft)
Index: postgres/src/backend/regex/regcomp.c
diff -c postgres/src/backend/regex/regcomp.c:1.1.1.1 postgres/src/backend/regex/regcomp.c:1.2
*** postgres/src/backend/regex/regcomp.c:1.1.1.1    Tue Apr 18 21:45:09 2000
--- postgres/src/backend/regex/regcomp.c    Wed Apr 19 09:46:38 2000
***************
*** 97,107 ****
      static void p_b_eclass(struct parse * p, cset *cs);
      static pg_wchar p_b_symbol(struct parse * p);
      static char p_b_coll_elem(struct parse * p, int endc);
- #ifdef MULTIBYTE
      static unsigned char othercase(int ch);
- #else
-     static char othercase(int ch);
- #endif
      static void bothcases(struct parse * p, int ch);
      static void ordinary(struct parse * p, int ch);
      static void nonnewline(struct parse * p);
--- 97,103 ----
***************
*** 224,232 ****
              return REG_INVARG;
          len = preg->re_endp - wcp;
  #else
!         if (preg->re_endp < pattern)
              return REG_INVARG;
!         len = preg->re_endp - pattern;
  #endif
      }
      else
--- 220,228 ----
              return REG_INVARG;
          len = preg->re_endp - wcp;
  #else
!         if (preg->re_endp < (pg_wchar *) pattern)
              return REG_INVARG;
!         len = preg->re_endp - (pg_wchar *) pattern;
  #endif
      }
      else
***************
*** 1038,1071 ****
   - othercase - return the case counterpart of an alphabetic
   == static char othercase(int ch);
   */
- #ifdef MULTIBYTE
  static unsigned char            /* if no counterpart, return ch */
- #else
- static char                        /* if no counterpart, return ch */
- #endif
  othercase(ch)
  int            ch;
  {
      assert(pg_isalpha(ch));
      if (pg_isupper(ch))
- #ifdef MULTIBYTE
-         return (unsigned char) tolower(ch);
- #else
          return tolower(ch);
- #endif
      else if (pg_islower(ch))
- #ifdef MULTIBYTE
-         return (unsigned char) toupper(ch);
- #else
          return toupper(ch);
- #endif
      else
  /* peculiar, but could happen */
- #ifdef MULTIBYTE
-         return (unsigned char) ch;
- #else
          return ch;
- #endif
  }

  /*
--- 1034,1051 ----
Index: postgres/src/include/mb/pg_wchar.h
diff -c postgres/src/include/mb/pg_wchar.h:1.1.1.1 postgres/src/include/mb/pg_wchar.h:1.2
*** postgres/src/include/mb/pg_wchar.h:1.1.1.1    Tue Apr 18 21:45:31 2000
--- postgres/src/include/mb/pg_wchar.h    Wed Apr 19 09:46:42 2000
***************
*** 34,40 ****
  typedef unsigned int pg_wchar;

  #else
! #define pg_wchar char
  #endif

  /*
--- 34,40 ----
  typedef unsigned int pg_wchar;

  #else
! typedef unsigned char pg_wchar;
  #endif

  /*
Index: postgres/src/include/regex/regex2.h
diff -c postgres/src/include/regex/regex2.h:1.1.1.1 postgres/src/include/regex/regex2.h:1.2
*** postgres/src/include/regex/regex2.h:1.1.1.1    Tue Apr 18 21:45:35 2000
--- postgres/src/include/regex/regex2.h    Wed Apr 19 09:46:47 2000
***************
*** 201,207 ****
  #ifdef MULTIBYTE
  #define OUT          (16777216+1)    /* 16777216 == 2^24 == 3 bytes */
  #else
! #define OUT          (CHAR_MAX+1)    /* a non-character value */
  #endif

  #ifdef MULTIBYTE
--- 201,207 ----
  #ifdef MULTIBYTE
  #define OUT          (16777216+1)    /* 16777216 == 2^24 == 3 bytes */
  #else
! #define OUT          (UCHAR_MAX+1)    /* a non-character value */
  #endif

  #ifdef MULTIBYTE


pgsql-ports by date:

Previous
From: Lamar Owen
Date:
Subject: 7.0RC1-0.6 RPM's now available.
Next
From: "John Boris"
Date:
Subject: Problem compiling 7.0beta5 on SCO Openserver5.0.5