Rough draft for Unicode-aware UPPER()/LOWER()/INITCAP() - Mailing list pgsql-hackers

From Tom Lane
Subject Rough draft for Unicode-aware UPPER()/LOWER()/INITCAP()
Date
Msg-id 2739.1084416146@sss.pgh.pa.us
Whole thread Raw
Responses Re: Rough draft for Unicode-aware UPPER()/LOWER()/INITCAP()  (Jean-Michel POURE <jm@poure.com>)
Re: Rough draft for Unicode-aware  (Markus Bertheau <twanger@bluetwanger.de>)
Re: Rough draft for Unicode-aware UPPER()/LOWER()/INITCAP()  (Marko Karppinen <marko@karppinen.fi>)
List pgsql-hackers
I got tired of reading complaints about how upper/lower don't work with
Unicode, so I went and prototyped a solution.  The attached code uses
the C99-standard functions mbstowcs and wcstombs to convert to and from
a "wchar_t[]" representation that can be fed to the also-C99 functions
towupper, towlower, etc.

This code will only work if the database is running under an LC_CTYPE
setting that implies the same encoding specified by server_encoding.
However, I don't see that as a fatal objection, because in point of fact
the existing upper/lower code assumes the same thing.  When they don't
match, this code may deliver an "invalid multibyte character" error
rather than silently producing a wrong answer, but is that really a step
backward?

Note this patch is *not* meant for application to CVS yet.  It's not
autoconfiscated.  But if you have a platform that has mbstowcs and
friends, please try it and let me know about any portability gotchas
you see.

Also, as a character-set-impaired American, I'm probably not the best
qualified person to judge whether the patch actually does what's wanted.
It seemed to do the right sorts of conversions in my limited testing,
but does it do what *you* want it to do?

            regards, tom lane

PS: the patch works against either 7.4 or CVS tip.

*** src/backend/utils/adt/oracle_compat.c.orig    Sat Feb 28 12:53:23 2004
--- src/backend/utils/adt/oracle_compat.c    Wed May 12 21:19:33 2004
***************
*** 15,21 ****
   */
  #include "postgres.h"

! #include <ctype.h>

  #include "utils/builtins.h"
  #include "mb/pg_wchar.h"
--- 15,22 ----
   */
  #include "postgres.h"

! #include <wchar.h>
! #include <wctype.h>

  #include "utils/builtins.h"
  #include "mb/pg_wchar.h"
***************
*** 26,31 ****
--- 27,124 ----
         bool doltrim, bool dortrim);


+ /*
+  * Convert a TEXT value into a palloc'd, zero-terminated wchar_t string.
+  */
+ static wchar_t *
+ texttowcs(const text *txt)
+ {
+     int            nbytes = VARSIZE(txt) - VARHDRSZ;
+     char       *workstr;
+     wchar_t       *result;
+     size_t        ncodes;
+
+     /* Overflow paranoia: (nbytes + 1) wchar_t's must fit within INT_MAX */
+     if (nbytes < 0 ||
+         nbytes > (int) (INT_MAX / sizeof(wchar_t)) - 1)
+         ereport(ERROR,
+                 (errcode(ERRCODE_OUT_OF_MEMORY),
+                  errmsg("out of memory")));
+
+     /* mbstowcs needs a null-terminated copy of the input */
+     workstr = (char *) palloc(nbytes + 1);
+     memcpy(workstr, VARDATA(txt), nbytes);
+     workstr[nbytes] = '\0';
+
+     /* Output cannot have more codes than input bytes; +1 for terminator */
+     result = (wchar_t *) palloc((nbytes + 1) * sizeof(wchar_t));
+
+     /* Do the conversion; failure is reported as (size_t) -1 */
+     ncodes = mbstowcs(result, workstr, nbytes + 1);
+
+     if (ncodes == (size_t) -1)
+     {
+         /*
+          * Invalid multibyte character encountered.  We try to give a useful
+          * error message by letting pg_verifymbstr check the string.  But
+          * it's possible that the string is OK to us, and not OK to mbstowcs
+          * --- this suggests that the LC_CTYPE locale is different from the
+          * database encoding.  Give a generic error message if verifymbstr
+          * can't find anything wrong.
+          */
+         pg_verifymbstr(workstr, nbytes, false);
+         ereport(ERROR,
+                 (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+                  errmsg("invalid multibyte character for locale")));
+     }
+
+     Assert(ncodes <= (size_t) nbytes);
+
+     return result;        /* NOTE(review): workstr is not pfree'd here */
+ }
+
+
+ /*
+  * Convert a wchar string into a palloc'd TEXT value.  The wchar string
+  * must be zero-terminated, but we also require the caller to pass the string
+  * length (in wchar_t codes), since it will know it anyway in current uses.
+  */
+ static text *
+ wcstotext(const wchar_t *str, int ncodes)
+ {
+     text       *result;
+     size_t        nbytes;
+
+     /* Overflow paranoia: the palloc request below must fit in an int */
+     if (ncodes < 0 ||
+         ncodes > (int) ((INT_MAX - VARHDRSZ) / MB_CUR_MAX) - 1)
+         ereport(ERROR,
+                 (errcode(ERRCODE_OUT_OF_MEMORY),
+                  errmsg("out of memory")));
+
+     /* Worst case: every code (plus terminator) needs MB_CUR_MAX bytes */
+     result = (text *) palloc((ncodes + 1) * MB_CUR_MAX + VARHDRSZ);
+
+     /* Do the conversion; failure is reported as (size_t) -1 */
+     nbytes = wcstombs((char *) VARDATA(result), str,
+                       (ncodes + 1) * MB_CUR_MAX);
+
+     if (nbytes == (size_t) -1)
+     {
+         /* Invalid multibyte character encountered ... shouldn't happen */
+         ereport(ERROR,
+                 (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+                  errmsg("invalid multibyte character for locale")));
+     }
+
+     Assert(nbytes <= (size_t) (ncodes * MB_CUR_MAX));
+
+     VARATT_SIZEP(result) = nbytes + VARHDRSZ;    /* data bytes plus header */
+
+     return result;
+ }
+
+
  /********************************************************************
   *
   * lower
***************
*** 43,63 ****
  Datum
  lower(PG_FUNCTION_ARGS)
  {
!     text       *string = PG_GETARG_TEXT_P_COPY(0);
!     char       *ptr;
!     int            m;
!
!     /* Since we copied the string, we can scribble directly on the value */
!     ptr = VARDATA(string);
!     m = VARSIZE(string) - VARHDRSZ;

!     while (m-- > 0)
!     {
!         *ptr = tolower((unsigned char) *ptr);
!         ptr++;
!     }

!     PG_RETURN_TEXT_P(string);
  }


--- 136,156 ----
  Datum
  lower(PG_FUNCTION_ARGS)
  {
!     text       *string = PG_GETARG_TEXT_P(0);
!     text       *result;
!     wchar_t       *workspace;    /* wide-char copy of input, edited in place */
!     int            i;

!     workspace = texttowcs(string);
!
!     for (i = 0; workspace[i] != 0; i++)    /* stop at the zero terminator */
!         workspace[i] = towlower(workspace[i]);
!
!     result = wcstotext(workspace, i);    /* i is now the number of codes */
!
!     pfree(workspace);

!     PG_RETURN_TEXT_P(result);
  }


***************
*** 78,98 ****
  Datum
  upper(PG_FUNCTION_ARGS)
  {
!     text       *string = PG_GETARG_TEXT_P_COPY(0);
!     char       *ptr;
!     int            m;
!
!     /* Since we copied the string, we can scribble directly on the value */
!     ptr = VARDATA(string);
!     m = VARSIZE(string) - VARHDRSZ;

!     while (m-- > 0)
!     {
!         *ptr = toupper((unsigned char) *ptr);
!         ptr++;
!     }

!     PG_RETURN_TEXT_P(string);
  }


--- 171,191 ----
  Datum
  upper(PG_FUNCTION_ARGS)
  {
!     text       *string = PG_GETARG_TEXT_P(0);
!     text       *result;
!     wchar_t       *workspace;    /* wide-char copy of input, edited in place */
!     int            i;

!     workspace = texttowcs(string);
!
!     for (i = 0; workspace[i] != 0; i++)    /* stop at the zero terminator */
!         workspace[i] = towupper(workspace[i]);
!
!     result = wcstotext(workspace, i);    /* i is now the number of codes */
!
!     pfree(workspace);

!     PG_RETURN_TEXT_P(result);
  }


***************
*** 116,147 ****
  Datum
  initcap(PG_FUNCTION_ARGS)
  {
!     text       *string = PG_GETARG_TEXT_P_COPY(0);
!     char       *ptr;
!     int            m;
!
!     /* Since we copied the string, we can scribble directly on the value */
!     ptr = VARDATA(string);
!     m = VARSIZE(string) - VARHDRSZ;

!     if (m > 0)
!     {
!         *ptr = toupper((unsigned char) *ptr);
!         ptr++;
!         m--;
!     }

!     while (m-- > 0)
      {
!         /* Oracle capitalizes after all non-alphanumeric */
!         if (!isalnum((unsigned char) ptr[-1]))
!             *ptr = toupper((unsigned char) *ptr);
          else
!             *ptr = tolower((unsigned char) *ptr);
!         ptr++;
      }

!     PG_RETURN_TEXT_P(string);
  }


--- 209,236 ----
  Datum
  initcap(PG_FUNCTION_ARGS)
  {
!     text       *string = PG_GETARG_TEXT_P(0);
!     text       *result;
!     wchar_t       *workspace;    /* wide-char copy of input, edited in place */
!     int            wasalnum = 0;    /* nonzero if previous code was alphanumeric */
!     int            i;

!     workspace = texttowcs(string);

!     for (i = 0; workspace[i] != 0; i++)    /* stop at the zero terminator */
      {
!         if (wasalnum)    /* continuing a run of alphanumerics: lower-case it */
!             workspace[i] = towlower(workspace[i]);
          else    /* first code, or first after a non-alphanumeric: upper-case it */
!             workspace[i] = towupper(workspace[i]);
!         wasalnum = iswalnum(workspace[i]);    /* remember for the next code */
      }

!     result = wcstotext(workspace, i);    /* i is now the number of codes */
!
!     pfree(workspace);
!
!     PG_RETURN_TEXT_P(result);
  }



pgsql-hackers by date:

Previous
From: Christopher Kings-Lynne
Date:
Subject: Re: Subtle pg_dump problem...
Next
From: Larry Rosenman
Date:
Subject: Re: threads stuff/UnixWare