I wrote:
> I therefore propose the following idea: if the database encoding is
> UTF8, allow the regc_locale.c functions to call the <wctype.h>
> functions, assuming that wchar_t and pg_wchar_t share the same
> representation. On platforms where wchar_t is only 16 bits, we can do
> this up to U+FFFF and be stupid about code points above that.
Or to be concrete, how about the attached? It seems to do what's
wanted, but I'm hardly the best-qualified person to test it.
regards, tom lane
Index: src/backend/regex/regc_locale.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/regex/regc_locale.c,v
retrieving revision 1.9
diff -c -r1.9 regc_locale.c
*** src/backend/regex/regc_locale.c 14 Feb 2008 17:33:37 -0000 1.9
--- src/backend/regex/regc_locale.c 1 Dec 2009 03:04:29 -0000
***************
*** 349,415 ****
}
};
/*
! * some ctype functions with non-ascii-char guard
*/
static int
pg_wc_isdigit(pg_wchar c)
{
! return (c >= 0 && c <= UCHAR_MAX && isdigit((unsigned char) c));
}
static int
pg_wc_isalpha(pg_wchar c)
{
! return (c >= 0 && c <= UCHAR_MAX && isalpha((unsigned char) c));
}
static int
pg_wc_isalnum(pg_wchar c)
{
! return (c >= 0 && c <= UCHAR_MAX && isalnum((unsigned char) c));
}
static int
pg_wc_isupper(pg_wchar c)
{
! return (c >= 0 && c <= UCHAR_MAX && isupper((unsigned char) c));
}
static int
pg_wc_islower(pg_wchar c)
{
! return (c >= 0 && c <= UCHAR_MAX && islower((unsigned char) c));
}
static int
pg_wc_isgraph(pg_wchar c)
{
! return (c >= 0 && c <= UCHAR_MAX && isgraph((unsigned char) c));
}
static int
pg_wc_isprint(pg_wchar c)
{
! return (c >= 0 && c <= UCHAR_MAX && isprint((unsigned char) c));
}
static int
pg_wc_ispunct(pg_wchar c)
{
! return (c >= 0 && c <= UCHAR_MAX && ispunct((unsigned char) c));
}
static int
pg_wc_isspace(pg_wchar c)
{
! return (c >= 0 && c <= UCHAR_MAX && isspace((unsigned char) c));
}
static pg_wchar
pg_wc_toupper(pg_wchar c)
{
! if (c >= 0 && c <= UCHAR_MAX)
return toupper((unsigned char) c);
return c;
}
--- 349,500 ----
}
};
+
/*
! * ctype functions adapted to work on pg_wchar (a/k/a chr)
! *
! * When working in UTF8 encoding, we use the <wctype.h> functions if
! * available. This assumes that every platform's wchar_t holds Unicode
! * code points directly. On some platforms wchar_t is only 16 bits wide;
! * there, codepoints > 0xFFFF match no class and are never case-folded.
! *
! * In all other encodings, we use the <ctype.h> functions for pg_wchar
! * values up to 255; higher values match no class and keep their case.
! * This is only 100% correct in single-byte encodings such as LATINn.
! * However, non-Unicode multibyte encodings are mostly Far Eastern character
! * sets for which these properties aren't relevant at higher code values.
! *
! * NB: the coding here assumes pg_wchar is an unsigned type.
*/
+
static int
pg_wc_isdigit(pg_wchar c)
{
! #ifdef USE_WIDE_UPPER_LOWER
! if (GetDatabaseEncoding() == PG_UTF8)
! {
! if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
! return iswdigit((wint_t) c);
! }
! #endif
! return (c <= (pg_wchar) UCHAR_MAX && isdigit((unsigned char) c));
}
static int
pg_wc_isalpha(pg_wchar c)
{
! #ifdef USE_WIDE_UPPER_LOWER
! if (GetDatabaseEncoding() == PG_UTF8)
! {
! if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
! return iswalpha((wint_t) c);
! }
! #endif
! return (c <= (pg_wchar) UCHAR_MAX && isalpha((unsigned char) c));
}
static int
pg_wc_isalnum(pg_wchar c)
{
! #ifdef USE_WIDE_UPPER_LOWER
! if (GetDatabaseEncoding() == PG_UTF8)
! {
! if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
! return iswalnum((wint_t) c);
! }
! #endif
! return (c <= (pg_wchar) UCHAR_MAX && isalnum((unsigned char) c));
}
static int
pg_wc_isupper(pg_wchar c)
{
! #ifdef USE_WIDE_UPPER_LOWER
! if (GetDatabaseEncoding() == PG_UTF8)
! {
! if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
! return iswupper((wint_t) c);
! }
! #endif
! return (c <= (pg_wchar) UCHAR_MAX && isupper((unsigned char) c));
}
static int
pg_wc_islower(pg_wchar c)
{
! #ifdef USE_WIDE_UPPER_LOWER
! if (GetDatabaseEncoding() == PG_UTF8)
! {
! if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
! return iswlower((wint_t) c);
! }
! #endif
! return (c <= (pg_wchar) UCHAR_MAX && islower((unsigned char) c));
}
static int
pg_wc_isgraph(pg_wchar c)
{
! #ifdef USE_WIDE_UPPER_LOWER
! if (GetDatabaseEncoding() == PG_UTF8)
! {
! if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
! return iswgraph((wint_t) c);
! }
! #endif
! return (c <= (pg_wchar) UCHAR_MAX && isgraph((unsigned char) c));
}
static int
pg_wc_isprint(pg_wchar c)
{
! #ifdef USE_WIDE_UPPER_LOWER
! if (GetDatabaseEncoding() == PG_UTF8)
! {
! if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
! return iswprint((wint_t) c);
! }
! #endif
! return (c <= (pg_wchar) UCHAR_MAX && isprint((unsigned char) c));
}
static int
pg_wc_ispunct(pg_wchar c)
{
! #ifdef USE_WIDE_UPPER_LOWER
! if (GetDatabaseEncoding() == PG_UTF8)
! {
! if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
! return iswpunct((wint_t) c);
! }
! #endif
! return (c <= (pg_wchar) UCHAR_MAX && ispunct((unsigned char) c));
}
static int
pg_wc_isspace(pg_wchar c)
{
! #ifdef USE_WIDE_UPPER_LOWER
! if (GetDatabaseEncoding() == PG_UTF8)
! {
! if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
! return iswspace((wint_t) c);
! }
! #endif
! return (c <= (pg_wchar) UCHAR_MAX && isspace((unsigned char) c));
}
static pg_wchar
pg_wc_toupper(pg_wchar c)
{
! #ifdef USE_WIDE_UPPER_LOWER
! if (GetDatabaseEncoding() == PG_UTF8)
! {
! if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
! return towupper((wint_t) c);
! }
! #endif
! if (c <= (pg_wchar) UCHAR_MAX)
return toupper((unsigned char) c);
return c;
}
***************
*** 417,423 ****
static pg_wchar
pg_wc_tolower(pg_wchar c)
{
! if (c >= 0 && c <= UCHAR_MAX)
return tolower((unsigned char) c);
return c;
}
--- 502,515 ----
static pg_wchar
pg_wc_tolower(pg_wchar c)
{
! #ifdef USE_WIDE_UPPER_LOWER
! if (GetDatabaseEncoding() == PG_UTF8)
! {
! if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
! return towlower((wint_t) c);
! }
! #endif
! if (c <= (pg_wchar) UCHAR_MAX)
return tolower((unsigned char) c);
return c;
}
Index: src/include/regex/regcustom.h
===================================================================
RCS file: /cvsroot/pgsql/src/include/regex/regcustom.h,v
retrieving revision 1.7
diff -c -r1.7 regcustom.h
*** src/include/regex/regcustom.h 14 Feb 2008 17:33:37 -0000 1.7
--- src/include/regex/regcustom.h 1 Dec 2009 03:04:29 -0000
***************
*** 34,39 ****
--- 34,50 ----
#include <ctype.h>
#include <limits.h>
+ /*
+ * towlower() and friends should be in <wctype.h>, but some pre-C99 systems
+ * declare them in <wchar.h>.
+ */
+ #ifdef HAVE_WCHAR_H
+ #include <wchar.h>
+ #endif
+ #ifdef HAVE_WCTYPE_H
+ #include <wctype.h>
+ #endif
+
#include "mb/pg_wchar.h"