From 94b77294ac95901f07f1e2a571fad483a7409639 Mon Sep 17 00:00:00 2001 From: Jeff Davis Date: Mon, 12 Jan 2026 08:58:43 -0800 Subject: [PATCH v1 1/4] ILIKE: use CASEFOLD() rather than LOWER(). For non-C locales, we casefold the entire string before performing pattern matching with ILIKE. Previously, casefolding was done with the LOWER() function; now that a proper CASEFOLD() function exists, use that instead. CASEFOLD() is better than LOWER() for case-insensitive comparisons in builtin and ICU locales. For instance, CASEFOLD() transforms a GREEK SMALL LETTER FINAL SIGMA (U+03C2) into GREEK SMALL LETTER SIGMA (U+03C3) so that the two characters match in a case-insensitive comparison; whereas LOWER() does not transform it because it's already lowercase, so they will not match. --- src/backend/utils/adt/like.c | 8 +++----- src/test/regress/expected/collate.utf8.out | 24 ++++++++++++++++++++++ src/test/regress/sql/collate.utf8.sql | 6 ++++++ 3 files changed, 33 insertions(+), 5 deletions(-) diff --git a/src/backend/utils/adt/like.c b/src/backend/utils/adt/like.c index 2143d8658e8..b04c6cc6661 100644 --- a/src/backend/utils/adt/like.c +++ b/src/backend/utils/adt/like.c @@ -190,10 +190,8 @@ Generic_Text_IC_like(text *str, text *pat, Oid collation) errmsg("nondeterministic collations are not supported for ILIKE"))); /* - * For efficiency reasons, in the C locale we don't call lower() on the + * For efficiency reasons, in the C locale we don't call casefold() on the * pattern and text, but instead lowercase each character lazily. - * - * XXX: use casefolding instead? */ if (locale->ctype_is_c) @@ -206,11 +204,11 @@ Generic_Text_IC_like(text *str, text *pat, Oid collation) } else { - pat = DatumGetTextPP(DirectFunctionCall1Coll(lower, collation, + pat = DatumGetTextPP(DirectFunctionCall1Coll(casefold, collation, PointerGetDatum(pat))); p = VARDATA_ANY(pat); plen = VARSIZE_ANY_EXHDR(pat); - str = DatumGetTextPP(DirectFunctionCall1Coll(lower, collation, + str = DatumGetTextPP(DirectFunctionCall1Coll(casefold, collation, PointerGetDatum(str))); s = VARDATA_ANY(str); slen = VARSIZE_ANY_EXHDR(str); diff --git a/src/test/regress/expected/collate.utf8.out b/src/test/regress/expected/collate.utf8.out index 0c3ab5c89b2..3d4292611e2 100644 --- a/src/test/regress/expected/collate.utf8.out +++ b/src/test/regress/expected/collate.utf8.out @@ -169,6 +169,18 @@ select casefold('AbCd 123 #$% ıiIİ ẞ ß DŽDždž Σσς' collate PG_C_UTF8); abcd 123 #$% ıiiİ ß ß dždždž σσσ (1 row) +SELECT U&'ς' ILIKE U&'σ' COLLATE PG_C_UTF8; + ?column? +---------- + t +(1 row) + +SELECT U&'straße' ILIKE U&'STRASSE' COLLATE PG_C_UTF8; + ?column? +---------- + f +(1 row) + -- -- Test PG_UNICODE_FAST -- @@ -338,3 +350,15 @@ select casefold('AbCd 123 #$% ıiIİ ẞ ß DŽDždž Σσς' collate PG_UNICODE_FA abcd 123 #$% ıiii̇ ss ss dždždž σσσ (1 row) +SELECT U&'ς' ILIKE U&'σ' COLLATE PG_UNICODE_FAST; + ?column? +---------- + t +(1 row) + +SELECT U&'straße' ILIKE U&'STRASSE' COLLATE PG_UNICODE_FAST; + ?column? +---------- + t +(1 row) + diff --git a/src/test/regress/sql/collate.utf8.sql b/src/test/regress/sql/collate.utf8.sql index d6d14220ab3..4a5e519cf07 100644 --- a/src/test/regress/sql/collate.utf8.sql +++ b/src/test/regress/sql/collate.utf8.sql @@ -85,6 +85,9 @@ SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_C_UTF8; -- same as above with cases reversed -- case folding select casefold('AbCd 123 #$% ıiIİ ẞ ß DŽDždž Σσς' collate PG_C_UTF8); +SELECT U&'ς' ILIKE U&'σ' COLLATE PG_C_UTF8; +SELECT U&'straße' ILIKE U&'STRASSE' COLLATE PG_C_UTF8; + -- -- Test PG_UNICODE_FAST -- @@ -148,3 +151,6 @@ SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_UNICODE_FAST; -- same as above with cases re -- case folding select casefold('AbCd 123 #$% ıiIİ ẞ ß DŽDždž Σσς' collate PG_UNICODE_FAST); + +SELECT U&'ς' ILIKE U&'σ' COLLATE PG_UNICODE_FAST; +SELECT U&'straße' ILIKE U&'STRASSE' COLLATE PG_UNICODE_FAST; -- 2.43.0