Home > mailing lists
Backported the very useful oracle_compat.c from postgres 8.0 beta to 7.4.5 - Mailing list pgsql-patches

From	Johannes Weberhofer
Subject	Backported the very useful oracle_compat.c from postgres 8.0 beta to 7.4.5
Date	August 26, 2004 12:31:40
Msg-id	412E023A.3010207@weberhofer.at Whole thread Raw
List	pgsql-patches
Tree view
Dear all!

I'm new to the list, so please excuse me if I am reposting this patch.

In the 8.0 beta I found the _very_ useful oracle_compat.c, which implements
correct UPPER(), LOWER() and INITCAP() for multibyte character sets.

This change is very important for everyone who uses multilingual
databases, because it makes case-insensitive search possible (e.g. matching
"Ärzte" and "ärzte"). We have tested the patch in our work, and we have
not seen any problems so far. I think Postgres without that patch is buggy.

Thank you to Tom (who wrote the code) for the great work.

The attached patch applies to the current stable version, 7.4.5.

Best wishes,

Johannes
--- postgresql-7.4.5/src/backend/utils/adt/oracle_compat.c    2003-08-08 23:42:06.000000000 +0200
+++ postgresql-8.0.0beta1/src/backend/utils/adt/oracle_compat.c    2004-06-07 00:17:01.000000000 +0200
@@ -9,23 +9,145 @@
  *
  *
  * IDENTIFICATION
- *    $Header: /cvsroot/pgsql-server/src/backend/utils/adt/oracle_compat.c,v 1.48 2003/08/08 21:42:06 momjian Exp $
+ *    $PostgreSQL: pgsql-server/src/backend/utils/adt/oracle_compat.c,v 1.53 2004/06/06 22:17:01 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
 #include "postgres.h"

 #include <ctype.h>
+#include <limits.h>
+/*
+ * towlower() and friends should be in <wctype.h>, but some pre-C99 systems
+ * declare them in <wchar.h>.
+ */
+#ifdef HAVE_WCHAR_H
+#include <wchar.h>
+#endif
+#ifdef HAVE_WCTYPE_H
+#include <wctype.h>
+#endif

 #include "utils/builtins.h"
 #include "mb/pg_wchar.h"


+/*
+ * If the system provides the needed functions for wide-character manipulation
+ * (which are all standardized by C99), then we implement upper/lower/initcap
+ * using wide-character functions.  Otherwise we use the traditional <ctype.h>
+ * functions, which of course will not work as desired in multibyte character
+ * sets.  Note that in either case we are effectively assuming that the
+ * database character encoding matches the encoding implied by LC_CTYPE.
+ *
+ * We assume if we have these two functions, we have their friends too, and
+ * can use the wide-character method.
+ */
+#if defined(HAVE_WCSTOMBS) && defined(HAVE_TOWLOWER)
+#define USE_WIDE_UPPER_LOWER
+#endif
+
 static text *dotrim(const char *string, int stringlen,
        const char *set, int setlen,
        bool doltrim, bool dortrim);


+#ifdef USE_WIDE_UPPER_LOWER
+
+/*
+ * Convert a TEXT value into a palloc'd wchar string.
+ */
+static wchar_t *
+texttowcs(const text *txt)
+{
+    int            nbytes = VARSIZE(txt) - VARHDRSZ;
+    char       *workstr;
+    wchar_t       *result;
+    size_t        ncodes;
+
+    /* Overflow paranoia */
+    if (nbytes < 0 ||
+        nbytes > (int) (INT_MAX / sizeof(wchar_t)) - 1)
+        ereport(ERROR,
+                (errcode(ERRCODE_OUT_OF_MEMORY),
+                 errmsg("out of memory")));
+
+    /* Need a null-terminated version of the input */
+    workstr = (char *) palloc(nbytes + 1);
+    memcpy(workstr, VARDATA(txt), nbytes);
+    workstr[nbytes] = '\0';
+
+    /* Output workspace cannot have more codes than input bytes */
+    result = (wchar_t *) palloc((nbytes + 1) * sizeof(wchar_t));
+
+    /* Do the conversion */
+    ncodes = mbstowcs(result, workstr, nbytes + 1);
+
+    if (ncodes == (size_t) -1)
+    {
+        /*
+         * Invalid multibyte character encountered.  We try to give a useful
+         * error message by letting pg_verifymbstr check the string.  But
+         * it's possible that the string is OK to us, and not OK to mbstowcs
+         * --- this suggests that the LC_CTYPE locale is different from the
+         * database encoding.  Give a generic error message if verifymbstr
+         * can't find anything wrong.
+         */
+        pg_verifymbstr(workstr, nbytes, false);
+        ereport(ERROR,
+                (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+                 errmsg("invalid multibyte character for locale")));
+    }
+
+    Assert(ncodes <= (size_t) nbytes);
+
+    return result;
+}
+
+
+/*
+ * Convert a wchar string into a palloc'd TEXT value.  The wchar string
+ * must be zero-terminated, but we also require the caller to pass the string
+ * length, since it will know it anyway in current uses.
+ */
+static text *
+wcstotext(const wchar_t *str, int ncodes)
+{
+    text       *result;
+    size_t        nbytes;
+
+    /* Overflow paranoia */
+    if (ncodes < 0 ||
+        ncodes > (int) ((INT_MAX - VARHDRSZ) / MB_CUR_MAX) - 1)
+        ereport(ERROR,
+                (errcode(ERRCODE_OUT_OF_MEMORY),
+                 errmsg("out of memory")));
+
+    /* Make workspace certainly large enough for result */
+    result = (text *) palloc((ncodes + 1) * MB_CUR_MAX + VARHDRSZ);
+
+    /* Do the conversion */
+    nbytes = wcstombs((char *) VARDATA(result), str,
+                      (ncodes + 1) * MB_CUR_MAX);
+
+    if (nbytes == (size_t) -1)
+    {
+        /* Invalid multibyte character encountered ... shouldn't happen */
+        ereport(ERROR,
+                (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+                 errmsg("invalid multibyte character for locale")));
+    }
+
+    Assert(nbytes <= (size_t) (ncodes * MB_CUR_MAX));
+
+    VARATT_SIZEP(result) = nbytes + VARHDRSZ;
+
+    return result;
+}
+
+#endif /* USE_WIDE_UPPER_LOWER */
+
+
 /********************************************************************
  *
  * lower
@@ -43,21 +165,45 @@
 Datum
 lower(PG_FUNCTION_ARGS)
 {
-    text       *string = PG_GETARG_TEXT_P_COPY(0);
-    char       *ptr;
-    int            m;
+#ifdef USE_WIDE_UPPER_LOWER
+    /* use wide char code only when max encoding length > one */
+    if (pg_database_encoding_max_length() > 1)
+    {
+        text       *string = PG_GETARG_TEXT_P(0);
+        text       *result;
+        wchar_t       *workspace;
+        int            i;

-    /* Since we copied the string, we can scribble directly on the value */
-    ptr = VARDATA(string);
-    m = VARSIZE(string) - VARHDRSZ;
+        workspace = texttowcs(string);

-    while (m-- > 0)
-    {
-        *ptr = tolower((unsigned char) *ptr);
-        ptr++;
+        for (i = 0; workspace[i] != 0; i++)
+            workspace[i] = towlower(workspace[i]);
+
+        result = wcstotext(workspace, i);
+
+        pfree(workspace);
+
+        PG_RETURN_TEXT_P(result);
     }
+    else
+#endif /* USE_WIDE_UPPER_LOWER */
+    {
+        text       *string = PG_GETARG_TEXT_P_COPY(0);
+        char       *ptr;
+        int            m;
+
+        /* Since we copied the string, we can scribble directly on the value */
+        ptr = VARDATA(string);
+        m = VARSIZE(string) - VARHDRSZ;
+
+        while (m-- > 0)
+        {
+            *ptr = tolower((unsigned char) *ptr);
+            ptr++;
+        }

-    PG_RETURN_TEXT_P(string);
+        PG_RETURN_TEXT_P(string);
+    }
 }


@@ -78,21 +224,45 @@
 Datum
 upper(PG_FUNCTION_ARGS)
 {
-    text       *string = PG_GETARG_TEXT_P_COPY(0);
-    char       *ptr;
-    int            m;
+#ifdef USE_WIDE_UPPER_LOWER
+    /* use wide char code only when max encoding length > one */
+    if (pg_database_encoding_max_length() > 1)
+    {
+        text       *string = PG_GETARG_TEXT_P(0);
+        text       *result;
+        wchar_t       *workspace;
+        int            i;

-    /* Since we copied the string, we can scribble directly on the value */
-    ptr = VARDATA(string);
-    m = VARSIZE(string) - VARHDRSZ;
+        workspace = texttowcs(string);

-    while (m-- > 0)
-    {
-        *ptr = toupper((unsigned char) *ptr);
-        ptr++;
+        for (i = 0; workspace[i] != 0; i++)
+            workspace[i] = towupper(workspace[i]);
+
+        result = wcstotext(workspace, i);
+
+        pfree(workspace);
+
+        PG_RETURN_TEXT_P(result);
     }
+    else
+#endif /* USE_WIDE_UPPER_LOWER */
+    {
+        text       *string = PG_GETARG_TEXT_P_COPY(0);
+        char       *ptr;
+        int            m;
+
+        /* Since we copied the string, we can scribble directly on the value */
+        ptr = VARDATA(string);
+        m = VARSIZE(string) - VARHDRSZ;
+
+        while (m-- > 0)
+        {
+            *ptr = toupper((unsigned char) *ptr);
+            ptr++;
+        }

-    PG_RETURN_TEXT_P(string);
+        PG_RETURN_TEXT_P(string);
+    }
 }


@@ -106,41 +276,67 @@
  *
  * Purpose:
  *
- *     Returns string, with first letter of each word in uppercase,
- *     all other letters in lowercase. A word is delimited by white
- *     space.
+ *     Returns string, with first letter of each word in uppercase, all
+ *     other letters in lowercase. A word is defined as a sequence of
+ *     alphanumeric characters, delimited by non-alphanumeric
+ *     characters.
  *
  ********************************************************************/

 Datum
 initcap(PG_FUNCTION_ARGS)
 {
-    text       *string = PG_GETARG_TEXT_P_COPY(0);
-    char       *ptr;
-    int            m;
+#ifdef USE_WIDE_UPPER_LOWER
+    /* use wide char code only when max encoding length > one */
+    if (pg_database_encoding_max_length() > 1)
+    {
+        text       *string = PG_GETARG_TEXT_P(0);
+        text       *result;
+        wchar_t       *workspace;
+        int            wasalnum = 0;
+        int            i;

-    /* Since we copied the string, we can scribble directly on the value */
-    ptr = VARDATA(string);
-    m = VARSIZE(string) - VARHDRSZ;
+        workspace = texttowcs(string);

-    if (m > 0)
-    {
-        *ptr = toupper((unsigned char) *ptr);
-        ptr++;
-        m--;
-    }
+        for (i = 0; workspace[i] != 0; i++)
+        {
+            if (wasalnum)
+                workspace[i] = towlower(workspace[i]);
+            else
+                workspace[i] = towupper(workspace[i]);
+            wasalnum = iswalnum(workspace[i]);
+        }

-    while (m-- > 0)
-    {
-        /* Oracle capitalizes after all non-alphanumeric */
-        if (!isalnum((unsigned char) ptr[-1]))
-            *ptr = toupper((unsigned char) *ptr);
-        else
-            *ptr = tolower((unsigned char) *ptr);
-        ptr++;
+        result = wcstotext(workspace, i);
+
+        pfree(workspace);
+
+        PG_RETURN_TEXT_P(result);
     }
+    else
+#endif /* USE_WIDE_UPPER_LOWER */
+    {
+        text       *string = PG_GETARG_TEXT_P_COPY(0);
+        int            wasalnum = 0;
+        char       *ptr;
+        int            m;
+
+        /* Since we copied the string, we can scribble directly on the value */
+        ptr = VARDATA(string);
+        m = VARSIZE(string) - VARHDRSZ;
+
+        while (m-- > 0)
+        {
+            if (wasalnum)
+                *ptr = tolower((unsigned char) *ptr);
+            else
+                *ptr = toupper((unsigned char) *ptr);
+            wasalnum = isalnum((unsigned char) *ptr);
+            ptr++;
+        }

-    PG_RETURN_TEXT_P(string);
+        PG_RETURN_TEXT_P(string);
+    }
 }


@@ -872,7 +1068,7 @@
  ********************************************************************/

 Datum
-chr            (PG_FUNCTION_ARGS)
+chr(PG_FUNCTION_ARGS)
 {
     int32        cvalue = PG_GETARG_INT32(0);
     text       *result;
--- postgresql-7.4.5/configure.in    2004-08-18 05:11:25.000000000 +0200
+++ postgresql-8.0.0beta1/configure.in    2004-08-09 01:27:11.000000000 +0200
@@ -866,7 +810,7 @@
 # SunOS doesn't handle negative byte comparisons properly with +/- return
 AC_FUNC_MEMCMP

-AC_CHECK_FUNCS([cbrt dlopen fcvt fdatasync getpeereid memmove poll pstat setproctitle setsid sigprocmask symlink sysconf utime utimes waitpid])
+AC_CHECK_FUNCS([cbrt dlopen fcvt fdatasync getpeereid memmove poll pstat setproctitle setsid sigprocmask symlink sysconf towlower utime utimes waitpid wcstombs])

 AC_CHECK_DECLS(fdatasync, [], [], [#include <unistd.h>])


---  postgresql-7.4.5/src/include/pg_config.h.in~    2004-03-20 16:39:40.000000000 +0100
+++  postgresql-7.4.5/src/include/pg_config.h.in    2004-08-26 13:18:28.000000000 +0200
@@ -509,6 +509,9 @@
    `HAVE_STRUCT_TM_TM_ZONE' instead. */
 #undef HAVE_TM_ZONE

+/* Define to 1 if you have the `towlower' function. */
+#undef HAVE_TOWLOWER
+
 /* Define to 1 if you have the external array `tzname'. */
 #undef HAVE_TZNAME

@@ -542,6 +545,9 @@
 /* Define to 1 if you have the `waitpid' function. */
 #undef HAVE_WAITPID

+/* Define to 1 if you have the `wcstombs' function. */
+#undef HAVE_WCSTOMBS
+
 /* Define to the appropriate snprintf format for 64-bit ints, if any. */
 #undef INT64_FORMAT
pgsql-patches by date:
From: Tom Lane
Date: 26 August 2004, 12:23:40
Subject: Re: [pgsql-hackers-win32] postmaster.pid
From: "Dave Page"
Date: 26 August 2004, 13:11:50
Subject: Re: [pgsql-hackers-win32] postmaster.pid
Backported the very useful oracle_compat.c from postgres 8.0 beta to 7.4.5 - Mailing list pgsql-patches

Previous

Next