ts_locale.c: why no t_isalnum() test? - Mailing list pgsql-hackers

From Tom Lane
Subject ts_locale.c: why no t_isalnum() test?
Date
Msg-id 2548310.1664999615@sss.pgh.pa.us
Whole thread Raw
Responses Re: ts_locale.c: why no t_isalnum() test?  (Corey Huinker <corey.huinker@gmail.com>)
List pgsql-hackers
I happened to wonder why various places are testing things like

#define ISWORDCHR(c)    (t_isalpha(c) || t_isdigit(c))

rather than using an isalnum-equivalent test.  The direct answer
is that ts_locale.c/.h provides no such test function, which
apparently is because there's not a lot of potential callers in
the core code.  However, both pg_trgm and ltree could benefit
from adding one.

There's no semantic hazard here: the documentation I consulted
is all pretty explicit that is[w]alnum is true exactly when
either is[w]alpha or is[w]digit are.  For example, POSIX saith

    The iswalpha() and iswalpha_l() functions shall test whether wc is a
    wide-character code representing a character of class alpha in the
    current locale, or in the locale represented by locale, respectively;
    see XBD Locale.

    The iswdigit() and iswdigit_l() functions shall test whether wc is a
    wide-character code representing a character of class digit in the
    current locale, or in the locale represented by locale, respectively;
    see XBD Locale.

    The iswalnum() and iswalnum_l() functions shall test whether wc is a
    wide-character code representing a character of class alpha or digit
    in the current locale, or in the locale represented by locale,
    respectively; see XBD Locale.

While I didn't try to actually measure it, these functions don't
look remarkably cheap.  Doing char2wchar() twice when we only need
to do it once seems silly, and the libc functions themselves are
probably none too cheap for multibyte characters either.

Hence, I propose the attached.  I got rid of some places that were
unnecessarily checking pg_mblen before applying t_iseq(), too.

            regards, tom lane

diff --git a/contrib/ltree/lquery_op.c b/contrib/ltree/lquery_op.c
index d580339283..a6466f575f 100644
--- a/contrib/ltree/lquery_op.c
+++ b/contrib/ltree/lquery_op.c
@@ -25,17 +25,16 @@ static char *
 getlexeme(char *start, char *end, int *len)
 {
     char       *ptr;
-    int            charlen;

-    while (start < end && (charlen = pg_mblen(start)) == 1 && t_iseq(start, '_'))
-        start += charlen;
+    while (start < end && t_iseq(start, '_'))
+        start += pg_mblen(start);

     ptr = start;
     if (ptr >= end)
         return NULL;

-    while (ptr < end && !((charlen = pg_mblen(ptr)) == 1 && t_iseq(ptr, '_')))
-        ptr += charlen;
+    while (ptr < end && !t_iseq(ptr, '_'))
+        ptr += pg_mblen(ptr);

     *len = ptr - start;
     return start;
diff --git a/contrib/ltree/ltree.h b/contrib/ltree/ltree.h
index 40aed0ca0c..2a80a02495 100644
--- a/contrib/ltree/ltree.h
+++ b/contrib/ltree/ltree.h
@@ -126,7 +126,7 @@ typedef struct

 #define LQUERY_HASNOT        0x01

-#define ISALNUM(x)    ( t_isalpha(x) || t_isdigit(x)    || ( pg_mblen(x) == 1 && t_iseq((x), '_') ) )
+#define ISALNUM(x)    ( t_isalnum(x) || t_iseq(x, '_') )

 /* full text query */

diff --git a/contrib/ltree/ltxtquery_io.c b/contrib/ltree/ltxtquery_io.c
index 3eca5cb8ff..8ab0ce8e52 100644
--- a/contrib/ltree/ltxtquery_io.c
+++ b/contrib/ltree/ltxtquery_io.c
@@ -64,13 +64,13 @@ gettoken_query(QPRS_STATE *state, int32 *val, int32 *lenval, char **strval, uint
         switch (state->state)
         {
             case WAITOPERAND:
-                if (charlen == 1 && t_iseq(state->buf, '!'))
+                if (t_iseq(state->buf, '!'))
                 {
                     (state->buf)++;
                     *val = (int32) '!';
                     return OPR;
                 }
-                else if (charlen == 1 && t_iseq(state->buf, '('))
+                else if (t_iseq(state->buf, '('))
                 {
                     state->count++;
                     (state->buf)++;
@@ -97,11 +97,11 @@ gettoken_query(QPRS_STATE *state, int32 *val, int32 *lenval, char **strval, uint
                                  errmsg("modifiers syntax error")));
                     *lenval += charlen;
                 }
-                else if (charlen == 1 && t_iseq(state->buf, '%'))
+                else if (t_iseq(state->buf, '%'))
                     *flag |= LVAR_SUBLEXEME;
-                else if (charlen == 1 && t_iseq(state->buf, '@'))
+                else if (t_iseq(state->buf, '@'))
                     *flag |= LVAR_INCASE;
-                else if (charlen == 1 && t_iseq(state->buf, '*'))
+                else if (t_iseq(state->buf, '*'))
                     *flag |= LVAR_ANYEND;
                 else
                 {
@@ -110,14 +110,14 @@ gettoken_query(QPRS_STATE *state, int32 *val, int32 *lenval, char **strval, uint
                 }
                 break;
             case WAITOPERATOR:
-                if (charlen == 1 && (t_iseq(state->buf, '&') || t_iseq(state->buf, '|')))
+                if (t_iseq(state->buf, '&') || t_iseq(state->buf, '|'))
                 {
                     state->state = WAITOPERAND;
                     *val = (int32) *(state->buf);
                     (state->buf)++;
                     return OPR;
                 }
-                else if (charlen == 1 && t_iseq(state->buf, ')'))
+                else if (t_iseq(state->buf, ')'))
                 {
                     (state->buf)++;
                     state->count--;
@@ -125,7 +125,7 @@ gettoken_query(QPRS_STATE *state, int32 *val, int32 *lenval, char **strval, uint
                 }
                 else if (*(state->buf) == '\0')
                     return (state->count) ? ERR : END;
-                else if (charlen == 1 && !t_iseq(state->buf, ' '))
+                else if (!t_iseq(state->buf, ' '))
                     return ERR;
                 break;
             default:
diff --git a/contrib/pg_trgm/trgm.h b/contrib/pg_trgm/trgm.h
index 405a1d9552..afb0adb222 100644
--- a/contrib/pg_trgm/trgm.h
+++ b/contrib/pg_trgm/trgm.h
@@ -52,7 +52,7 @@ typedef char trgm[3];
 } while(0)

 #ifdef KEEPONLYALNUM
-#define ISWORDCHR(c)    (t_isalpha(c) || t_isdigit(c))
+#define ISWORDCHR(c)    (t_isalnum(c))
 #define ISPRINTABLECHAR(a)    ( isascii( *(unsigned char*)(a) ) && (isalnum( *(unsigned char*)(a) ) || *(unsigned
char*)(a)=='') ) 
 #else
 #define ISWORDCHR(c)    (!t_isspace(c))
diff --git a/src/backend/tsearch/ts_locale.c b/src/backend/tsearch/ts_locale.c
index e0aa570bf5..fc21bf9ee8 100644
--- a/src/backend/tsearch/ts_locale.c
+++ b/src/backend/tsearch/ts_locale.c
@@ -81,6 +81,22 @@ t_isalpha(const char *ptr)
     return iswalpha((wint_t) character[0]);
 }

+int
+t_isalnum(const char *ptr)
+{
+    int            clen = pg_mblen(ptr);
+    wchar_t        character[WC_BUF_LEN];
+    Oid            collation = DEFAULT_COLLATION_OID;    /* TODO */
+    pg_locale_t mylocale = 0;    /* TODO */
+
+    if (clen == 1 || lc_ctype_is_c(collation))
+        return isalnum(TOUCHAR(ptr));
+
+    char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
+
+    return iswalnum((wint_t) character[0]);
+}
+
 int
 t_isprint(const char *ptr)
 {
diff --git a/src/backend/utils/adt/tsquery.c b/src/backend/utils/adt/tsquery.c
index f49e6bb157..a206926042 100644
--- a/src/backend/utils/adt/tsquery.c
+++ b/src/backend/utils/adt/tsquery.c
@@ -248,7 +248,7 @@ parse_or_operator(TSQueryParserState pstate)
         return false;

     /* it shouldn't be a part of any word */
-    if (t_iseq(ptr, '-') || t_iseq(ptr, '_') || t_isalpha(ptr) || t_isdigit(ptr))
+    if (t_iseq(ptr, '-') || t_iseq(ptr, '_') || t_isalnum(ptr))
         return false;

     for (;;)
diff --git a/src/include/tsearch/ts_locale.h b/src/include/tsearch/ts_locale.h
index d14cb4ed26..10887290dc 100644
--- a/src/include/tsearch/ts_locale.h
+++ b/src/include/tsearch/ts_locale.h
@@ -42,6 +42,7 @@ typedef struct
 extern int    t_isdigit(const char *ptr);
 extern int    t_isspace(const char *ptr);
 extern int    t_isalpha(const char *ptr);
+extern int    t_isalnum(const char *ptr);
 extern int    t_isprint(const char *ptr);

 extern char *lowerstr(const char *str);

pgsql-hackers by date:

Previous
From: Garen Torikian
Date:
Subject: Re: [PATCH] Expand character set for ltree labels
Next
From: Tom Lane
Date:
Subject: Re: [PATCH] Expand character set for ltree labels