Thread: ts_locale.c: why no t_isalnum() test?
I happened to wonder why various places are testing things like #define ISWORDCHR(c) (t_isalpha(c) || t_isdigit(c)) rather than using an isalnum-equivalent test. The direct answer is that ts_locale.c/.h provides no such test function, which apparently is because there's not a lot of potential callers in the core code. However, both pg_trgm and ltree could benefit from adding one. There's no semantic hazard here: the documentation I consulted is all pretty explicit that is[w]alnum is true exactly when either is[w]alpha or is[w]digit are. For example, POSIX saith The iswalpha() and iswalpha_l() functions shall test whether wc is a wide-character code representing a character of class alpha in the current locale, or in the locale represented by locale, respectively; see XBD Locale. The iswdigit() and iswdigit_l() functions shall test whether wc is a wide-character code representing a character of class digit in the current locale, or in the locale represented by locale, respectively; see XBD Locale. The iswalnum() and iswalnum_l() functions shall test whether wc is a wide-character code representing a character of class alpha or digit in the current locale, or in the locale represented by locale, respectively; see XBD Locale. While I didn't try to actually measure it, these functions don't look remarkably cheap. Doing char2wchar() twice when we only need to do it once seems silly, and the libc functions themselves are probably none too cheap for multibyte characters either. Hence, I propose the attached. I got rid of some places that were unnecessarily checking pg_mblen before applying t_iseq(), too. regards, tom lane diff --git a/contrib/ltree/lquery_op.c b/contrib/ltree/lquery_op.c index d580339283..a6466f575f 100644 --- a/contrib/ltree/lquery_op.c +++ b/contrib/ltree/lquery_op.c @@ -25,17 +25,16 @@ static char * getlexeme(char *start, char *end, int *len) { char *ptr; - int charlen; - while (start < end && (charlen = pg_mblen(start)) == 1 && t_iseq(start, '_')) - start += charlen; + while (start < end && t_iseq(start, '_')) + start += pg_mblen(start); ptr = start; if (ptr >= end) return NULL; - while (ptr < end && !((charlen = pg_mblen(ptr)) == 1 && t_iseq(ptr, '_'))) - ptr += charlen; + while (ptr < end && !t_iseq(ptr, '_')) + ptr += pg_mblen(ptr); *len = ptr - start; return start; diff --git a/contrib/ltree/ltree.h b/contrib/ltree/ltree.h index 40aed0ca0c..2a80a02495 100644 --- a/contrib/ltree/ltree.h +++ b/contrib/ltree/ltree.h @@ -126,7 +126,7 @@ typedef struct #define LQUERY_HASNOT 0x01 -#define ISALNUM(x) ( t_isalpha(x) || t_isdigit(x) || ( pg_mblen(x) == 1 && t_iseq((x), '_') ) ) +#define ISALNUM(x) ( t_isalnum(x) || t_iseq(x, '_') ) /* full text query */ diff --git a/contrib/ltree/ltxtquery_io.c b/contrib/ltree/ltxtquery_io.c index 3eca5cb8ff..8ab0ce8e52 100644 --- a/contrib/ltree/ltxtquery_io.c +++ b/contrib/ltree/ltxtquery_io.c @@ -64,13 +64,13 @@ gettoken_query(QPRS_STATE *state, int32 *val, int32 *lenval, char **strval, uint switch (state->state) { case WAITOPERAND: - if (charlen == 1 && t_iseq(state->buf, '!')) + if (t_iseq(state->buf, '!')) { (state->buf)++; *val = (int32) '!'; return OPR; } - else if (charlen == 1 && t_iseq(state->buf, '(')) + else if (t_iseq(state->buf, '(')) { state->count++; (state->buf)++; @@ -97,11 +97,11 @@ gettoken_query(QPRS_STATE *state, int32 *val, int32 *lenval, char **strval, uint errmsg("modifiers syntax error"))); *lenval += charlen; } - else if (charlen == 1 && t_iseq(state->buf, '%')) + else if (t_iseq(state->buf, '%')) *flag |= LVAR_SUBLEXEME; - else if (charlen == 1 && t_iseq(state->buf, '@')) + else if (t_iseq(state->buf, '@')) *flag |= LVAR_INCASE; - else if (charlen == 1 && t_iseq(state->buf, '*')) + else if (t_iseq(state->buf, '*')) *flag |= LVAR_ANYEND; else { @@ -110,14 +110,14 @@ gettoken_query(QPRS_STATE *state, int32 *val, int32 *lenval, char **strval, uint } break; case WAITOPERATOR: - if (charlen == 1 && (t_iseq(state->buf, '&') || t_iseq(state->buf, '|'))) + if (t_iseq(state->buf, '&') || t_iseq(state->buf, '|')) { state->state = WAITOPERAND; *val = (int32) *(state->buf); (state->buf)++; return OPR; } - else if (charlen == 1 && t_iseq(state->buf, ')')) + else if (t_iseq(state->buf, ')')) { (state->buf)++; state->count--; @@ -125,7 +125,7 @@ gettoken_query(QPRS_STATE *state, int32 *val, int32 *lenval, char **strval, uint } else if (*(state->buf) == '\0') return (state->count) ? ERR : END; - else if (charlen == 1 && !t_iseq(state->buf, ' ')) + else if (!t_iseq(state->buf, ' ')) return ERR; break; default: diff --git a/contrib/pg_trgm/trgm.h b/contrib/pg_trgm/trgm.h index 405a1d9552..afb0adb222 100644 --- a/contrib/pg_trgm/trgm.h +++ b/contrib/pg_trgm/trgm.h @@ -52,7 +52,7 @@ typedef char trgm[3]; } while(0) #ifdef KEEPONLYALNUM -#define ISWORDCHR(c) (t_isalpha(c) || t_isdigit(c)) +#define ISWORDCHR(c) (t_isalnum(c)) #define ISPRINTABLECHAR(a) ( isascii( *(unsigned char*)(a) ) && (isalnum( *(unsigned char*)(a) ) || *(unsigned char*)(a)=='') ) #else #define ISWORDCHR(c) (!t_isspace(c)) diff --git a/src/backend/tsearch/ts_locale.c b/src/backend/tsearch/ts_locale.c index e0aa570bf5..fc21bf9ee8 100644 --- a/src/backend/tsearch/ts_locale.c +++ b/src/backend/tsearch/ts_locale.c @@ -81,6 +81,22 @@ t_isalpha(const char *ptr) return iswalpha((wint_t) character[0]); } +int +t_isalnum(const char *ptr) +{ + int clen = pg_mblen(ptr); + wchar_t character[WC_BUF_LEN]; + Oid collation = DEFAULT_COLLATION_OID; /* TODO */ + pg_locale_t mylocale = 0; /* TODO */ + + if (clen == 1 || lc_ctype_is_c(collation)) + return isalnum(TOUCHAR(ptr)); + + char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale); + + return iswalnum((wint_t) character[0]); +} + int t_isprint(const char *ptr) { diff --git a/src/backend/utils/adt/tsquery.c b/src/backend/utils/adt/tsquery.c index f49e6bb157..a206926042 100644 --- a/src/backend/utils/adt/tsquery.c +++ b/src/backend/utils/adt/tsquery.c @@ -248,7 +248,7 @@ parse_or_operator(TSQueryParserState pstate) return false; /* it shouldn't be a part of any word */ - if (t_iseq(ptr, '-') || t_iseq(ptr, '_') || t_isalpha(ptr) || t_isdigit(ptr)) + if (t_iseq(ptr, '-') || t_iseq(ptr, '_') || t_isalnum(ptr)) return false; for (;;) diff --git a/src/include/tsearch/ts_locale.h b/src/include/tsearch/ts_locale.h index d14cb4ed26..10887290dc 100644 --- a/src/include/tsearch/ts_locale.h +++ b/src/include/tsearch/ts_locale.h @@ -42,6 +42,7 @@ typedef struct extern int t_isdigit(const char *ptr); extern int t_isspace(const char *ptr); extern int t_isalpha(const char *ptr); +extern int t_isalnum(const char *ptr); extern int t_isprint(const char *ptr); extern char *lowerstr(const char *str);
On Wed, Oct 5, 2022 at 3:53 PM Tom Lane <tgl@sss.pgh.pa.us> wrote:
I happened to wonder why various places are testing things like
#define ISWORDCHR(c) (t_isalpha(c) || t_isdigit(c))
rather than using an isalnum-equivalent test. The direct answer
is that ts_locale.c/.h provides no such test function, which
apparently is because there's not a lot of potential callers in
the core code. However, both pg_trgm and ltree could benefit
from adding one.
There's no semantic hazard here: the documentation I consulted
is all pretty explicit that is[w]alnum is true exactly when
either is[w]alpha or is[w]digit are. For example, POSIX saith
The iswalpha() and iswalpha_l() functions shall test whether wc is a
wide-character code representing a character of class alpha in the
current locale, or in the locale represented by locale, respectively;
see XBD Locale.
The iswdigit() and iswdigit_l() functions shall test whether wc is a
wide-character code representing a character of class digit in the
current locale, or in the locale represented by locale, respectively;
see XBD Locale.
The iswalnum() and iswalnum_l() functions shall test whether wc is a
wide-character code representing a character of class alpha or digit
in the current locale, or in the locale represented by locale,
respectively; see XBD Locale.
While I didn't try to actually measure it, these functions don't
look remarkably cheap. Doing char2wchar() twice when we only need
to do it once seems silly, and the libc functions themselves are
probably none too cheap for multibyte characters either.
Hence, I propose the attached. I got rid of some places that were
unnecessarily checking pg_mblen before applying t_iseq(), too.
regards, tom lane
I see this is already committed, but I'm curious, why do t_isalpha and t_isdigit have the pair of /* TODO */ comments? This unfinished business isn't explained anywhere in the file.
Corey Huinker <corey.huinker@gmail.com> writes: > I see this is already committed, but I'm curious, why do t_isalpha and > t_isdigit have the pair of /* TODO */ comments? This unfinished business > isn't explained anywhere in the file. We really ought to be consulting the locale/collation passed to the SQL-level operator or function that's calling these things, instead of hard-wiring the database default. Passing that down would take a large and boring patch, but somebody ought to tackle it someday. regards, tom lane