Re: A thought about regex versus multibyte character sets - Mailing list pgsql-hackers
From | Tom Lane |
---|---|
Subject | Re: A thought about regex versus multibyte character sets |
Date | |
Msg-id | 17821.1259637190@sss.pgh.pa.us Whole thread Raw |
In response to | A thought about regex versus multibyte character sets (Tom Lane <tgl@sss.pgh.pa.us>) |
List | pgsql-hackers |
I wrote: > I therefore propose the following idea: if the database encoding is > UTF8, allow the regc_locale.c functions to call the <wctype.h> > functions, assuming that wchar_t and pg_wchar_t share the same > representation. On platforms where wchar_t is only 16 bits, we can do > this up to U+FFFF and be stupid about code points above that. Or to be concrete, how about the attached? It seems to do what's wanted, but I'm hardly the best-qualified person to test it. regards, tom lane Index: src/backend/regex/regc_locale.c =================================================================== RCS file: /cvsroot/pgsql/src/backend/regex/regc_locale.c,v retrieving revision 1.9 diff -c -r1.9 regc_locale.c *** src/backend/regex/regc_locale.c 14 Feb 2008 17:33:37 -0000 1.9 --- src/backend/regex/regc_locale.c 1 Dec 2009 03:04:29 -0000 *************** *** 349,415 **** } }; /* ! * some ctype functions with non-ascii-char guard */ static int pg_wc_isdigit(pg_wchar c) { ! return (c >= 0 && c <= UCHAR_MAX && isdigit((unsigned char) c)); } static int pg_wc_isalpha(pg_wchar c) { ! return (c >= 0 && c <= UCHAR_MAX && isalpha((unsigned char) c)); } static int pg_wc_isalnum(pg_wchar c) { ! return (c >= 0 && c <= UCHAR_MAX && isalnum((unsigned char) c)); } static int pg_wc_isupper(pg_wchar c) { ! return (c >= 0 && c <= UCHAR_MAX && isupper((unsigned char) c)); } static int pg_wc_islower(pg_wchar c) { ! return (c >= 0 && c <= UCHAR_MAX && islower((unsigned char) c)); } static int pg_wc_isgraph(pg_wchar c) { ! return (c >= 0 && c <= UCHAR_MAX && isgraph((unsigned char) c)); } static int pg_wc_isprint(pg_wchar c) { ! return (c >= 0 && c <= UCHAR_MAX && isprint((unsigned char) c)); } static int pg_wc_ispunct(pg_wchar c) { ! return (c >= 0 && c <= UCHAR_MAX && ispunct((unsigned char) c)); } static int pg_wc_isspace(pg_wchar c) { ! return (c >= 0 && c <= UCHAR_MAX && isspace((unsigned char) c)); } static pg_wchar pg_wc_toupper(pg_wchar c) { ! if (c >= 0 && c <= UCHAR_MAX) return toupper((unsigned char) c); return c; } --- 349,500 ---- } }; + /* ! * ctype functions adapted to work on pg_wchar (a/k/a chr) ! * ! * When working in UTF8 encoding, we use the <wctype.h> functions if ! * available. This assumes that every platform uses Unicode codepoints ! * directly as the wchar_t representation of Unicode. On some platforms ! * wchar_t is only 16 bits wide, so we have to punt for codepoints > 0xFFFF. ! * ! * In all other encodings, we use the <ctype.h> functions for pg_wchar ! * values up to 255, and punt for values above that. This is only 100% ! * correct in single-byte encodings such as LATINn. However, non-Unicode ! * multibyte encodings are mostly Far Eastern character sets for which the ! * properties being tested here aren't relevant for higher code values anyway. ! * ! * NB: the coding here assumes pg_wchar is an unsigned type. */ + static int pg_wc_isdigit(pg_wchar c) { ! #ifdef USE_WIDE_UPPER_LOWER ! if (GetDatabaseEncoding() == PG_UTF8) ! { ! if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) ! return iswdigit((wint_t) c); ! } ! #endif ! return (c <= (pg_wchar) UCHAR_MAX && isdigit((unsigned char) c)); } static int pg_wc_isalpha(pg_wchar c) { ! #ifdef USE_WIDE_UPPER_LOWER ! if (GetDatabaseEncoding() == PG_UTF8) ! { ! if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) ! return iswalpha((wint_t) c); ! } ! #endif ! return (c <= (pg_wchar) UCHAR_MAX && isalpha((unsigned char) c)); } static int pg_wc_isalnum(pg_wchar c) { ! #ifdef USE_WIDE_UPPER_LOWER ! if (GetDatabaseEncoding() == PG_UTF8) ! { ! if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) ! return iswalnum((wint_t) c); ! } ! #endif ! return (c <= (pg_wchar) UCHAR_MAX && isalnum((unsigned char) c)); } static int pg_wc_isupper(pg_wchar c) { ! #ifdef USE_WIDE_UPPER_LOWER ! if (GetDatabaseEncoding() == PG_UTF8) ! { ! if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) ! return iswupper((wint_t) c); ! } ! #endif ! return (c <= (pg_wchar) UCHAR_MAX && isupper((unsigned char) c)); } static int pg_wc_islower(pg_wchar c) { ! #ifdef USE_WIDE_UPPER_LOWER ! if (GetDatabaseEncoding() == PG_UTF8) ! { ! if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) ! return iswlower((wint_t) c); ! } ! #endif ! return (c <= (pg_wchar) UCHAR_MAX && islower((unsigned char) c)); } static int pg_wc_isgraph(pg_wchar c) { ! #ifdef USE_WIDE_UPPER_LOWER ! if (GetDatabaseEncoding() == PG_UTF8) ! { ! if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) ! return iswgraph((wint_t) c); ! } ! #endif ! return (c <= (pg_wchar) UCHAR_MAX && isgraph((unsigned char) c)); } static int pg_wc_isprint(pg_wchar c) { ! #ifdef USE_WIDE_UPPER_LOWER ! if (GetDatabaseEncoding() == PG_UTF8) ! { ! if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) ! return iswprint((wint_t) c); ! } ! #endif ! return (c <= (pg_wchar) UCHAR_MAX && isprint((unsigned char) c)); } static int pg_wc_ispunct(pg_wchar c) { ! #ifdef USE_WIDE_UPPER_LOWER ! if (GetDatabaseEncoding() == PG_UTF8) ! { ! if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) ! return iswpunct((wint_t) c); ! } ! #endif ! return (c <= (pg_wchar) UCHAR_MAX && ispunct((unsigned char) c)); } static int pg_wc_isspace(pg_wchar c) { ! #ifdef USE_WIDE_UPPER_LOWER ! if (GetDatabaseEncoding() == PG_UTF8) ! { ! if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) ! return iswspace((wint_t) c); ! } ! #endif ! return (c <= (pg_wchar) UCHAR_MAX && isspace((unsigned char) c)); } static pg_wchar pg_wc_toupper(pg_wchar c) { ! #ifdef USE_WIDE_UPPER_LOWER ! if (GetDatabaseEncoding() == PG_UTF8) ! { ! if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) ! return towupper((wint_t) c); ! } ! #endif ! if (c <= (pg_wchar) UCHAR_MAX) return toupper((unsigned char) c); return c; } *************** *** 417,423 **** static pg_wchar pg_wc_tolower(pg_wchar c) { ! if (c >= 0 && c <= UCHAR_MAX) return tolower((unsigned char) c); return c; } --- 502,515 ---- static pg_wchar pg_wc_tolower(pg_wchar c) { ! #ifdef USE_WIDE_UPPER_LOWER ! if (GetDatabaseEncoding() == PG_UTF8) ! { ! if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) ! return towlower((wint_t) c); ! } ! #endif ! if (c <= (pg_wchar) UCHAR_MAX) return tolower((unsigned char) c); return c; } Index: src/include/regex/regcustom.h =================================================================== RCS file: /cvsroot/pgsql/src/include/regex/regcustom.h,v retrieving revision 1.7 diff -c -r1.7 regcustom.h *** src/include/regex/regcustom.h 14 Feb 2008 17:33:37 -0000 1.7 --- src/include/regex/regcustom.h 1 Dec 2009 03:04:29 -0000 *************** *** 34,39 **** --- 34,50 ---- #include <ctype.h> #include <limits.h> + /* + * towlower() and friends should be in <wctype.h>, but some pre-C99 systems + * declare them in <wchar.h>. + */ + #ifdef HAVE_WCHAR_H + #include <wchar.h> + #endif + #ifdef HAVE_WCTYPE_H + #include <wctype.h> + #endif + #include "mb/pg_wchar.h"
pgsql-hackers by date: