From c6721272200c14931ad757185a3aaeb615c432ed Mon Sep 17 00:00:00 2001 From: Jeff Davis Date: Mon, 24 Apr 2023 15:46:17 -0700 Subject: [PATCH 1/2] Interpret C locales consistently between ICU and libc. Treat a locale named C, C.anything, POSIX, or POSIX.anything as equivalent to the C locale; implemented with built-in semantics (memcmp() for collation and pg_ascii_*() for ctype). Such locales are not passed to the provider at all, so they behave identically regardless of whether the collation or database is declared with provider ICU or libc. Previously, only C and POSIX locales had this behavior (not e.g. "C.UTF-8"), and only if the provider was declared as libc. That caused problems on libc for locales like C.UTF-8, which may have subtly different behavior in some versions of libc; and it caused problems on ICU because newer versions don't recognize C locales. Discussion: https://postgr.es/m/1559006.1685040536@sss.pgh.pa.us Discussion: https://postgr.es/m/c840107b-4cb9-c8e9-abb7-1d8c5e0d51df%40enterprisedb.com Discussion: https://postgr.es/m/87v8hoexdv.fsf@news-spur.riddles.org.uk --- doc/src/sgml/charset.sgml | 3 +- src/backend/commands/collationcmds.c | 42 +++--- src/backend/commands/dbcommands.c | 41 +++--- src/backend/utils/adt/pg_locale.c | 126 +++++++++++++----- src/backend/utils/init/postinit.c | 4 +- src/backend/utils/mb/mbutils.c | 3 +- src/include/utils/pg_locale.h | 1 + .../regress/expected/collate.icu.utf8.out | 6 + src/test/regress/expected/collate.out | 5 + src/test/regress/sql/collate.icu.utf8.sql | 4 + src/test/regress/sql/collate.sql | 5 + 11 files changed, 167 insertions(+), 73 deletions(-) diff --git a/doc/src/sgml/charset.sgml b/doc/src/sgml/charset.sgml index ed84465996..8ba3117557 100644 --- a/doc/src/sgml/charset.sgml +++ b/doc/src/sgml/charset.sgml @@ -136,7 +136,8 @@ initdb --locale=sv_SE If you want the system to behave as if it had no locale support, use the special locale name C, or equivalently - POSIX. 
An encoding may also be appended, for + example C.UTF-8. diff --git a/src/backend/commands/collationcmds.c b/src/backend/commands/collationcmds.c index 2969a2bb21..a451ae8843 100644 --- a/src/backend/commands/collationcmds.c +++ b/src/backend/commands/collationcmds.c @@ -264,26 +264,38 @@ DefineCollation(ParseState *pstate, List *names, List *parameters, bool if_not_e (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), errmsg("parameter \"locale\" must be specified"))); - /* - * During binary upgrade, preserve the locale string. Otherwise, - * canonicalize to a language tag. - */ - if (!IsBinaryUpgrade) + if (locale_name_is_c(colliculocale)) { - char *langtag = icu_language_tag(colliculocale, - icu_validation_level); - - if (langtag && strcmp(colliculocale, langtag) != 0) + if (!collisdeterministic) + ereport(ERROR, + (errmsg("nondeterministic collations not supported for C or POSIX locale"))); + if (collicurules != NULL) + ereport(ERROR, + (errmsg("RULES not supported for C or POSIX locale"))); + } + else + { + /* + * During binary upgrade, preserve the locale + * string. Otherwise, canonicalize to a language tag. 
+ */ + if (!IsBinaryUpgrade) { - ereport(NOTICE, - (errmsg("using standard form \"%s\" for locale \"%s\"", - langtag, colliculocale))); + char *langtag = icu_language_tag(colliculocale, + icu_validation_level); + + if (langtag && strcmp(colliculocale, langtag) != 0) + { + ereport(NOTICE, + (errmsg("using standard form \"%s\" for locale \"%s\"", + langtag, colliculocale))); - colliculocale = langtag; + colliculocale = langtag; + } } - } - icu_validate_locale(colliculocale); + icu_validate_locale(colliculocale); + } } /* diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index 99d4080ea9..601a08ef11 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -1058,27 +1058,36 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("ICU locale must be specified"))); - /* - * During binary upgrade, or when the locale came from the template - * database, preserve locale string. Otherwise, canonicalize to a - * language tag. - */ - if (!IsBinaryUpgrade && dbiculocale != src_iculocale) + if (locale_name_is_c(dbiculocale)) { - char *langtag = icu_language_tag(dbiculocale, - icu_validation_level); - - if (langtag && strcmp(dbiculocale, langtag) != 0) + if (dbicurules != NULL) + ereport(ERROR, + (errmsg("ICU_RULES not supported for C or POSIX locale"))); + } + else + { + /* + * During binary upgrade, or when the locale came from the + * template database, preserve locale string. Otherwise, + * canonicalize to a language tag. 
+ */ + if (!IsBinaryUpgrade && dbiculocale != src_iculocale) { - ereport(NOTICE, - (errmsg("using standard form \"%s\" for locale \"%s\"", - langtag, dbiculocale))); + char *langtag = icu_language_tag(dbiculocale, + icu_validation_level); + + if (langtag && strcmp(dbiculocale, langtag) != 0) + { + ereport(NOTICE, + (errmsg("using standard form \"%s\" for locale \"%s\"", + langtag, dbiculocale))); - dbiculocale = langtag; + dbiculocale = langtag; + } } - } - icu_validate_locale(dbiculocale); + icu_validate_locale(dbiculocale); + } } else { diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c index 31e3b16ae0..2f2734a405 100644 --- a/src/backend/utils/adt/pg_locale.c +++ b/src/backend/utils/adt/pg_locale.c @@ -1239,15 +1239,19 @@ lookup_collation_cache(Oid collation, bool set_flags) datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_collctype); collctype = TextDatumGetCString(datum); - cache_entry->collate_is_c = ((strcmp(collcollate, "C") == 0) || - (strcmp(collcollate, "POSIX") == 0)); - cache_entry->ctype_is_c = ((strcmp(collctype, "C") == 0) || - (strcmp(collctype, "POSIX") == 0)); + cache_entry->collate_is_c = locale_name_is_c(collcollate); + cache_entry->ctype_is_c = locale_name_is_c(collctype); } else { - cache_entry->collate_is_c = false; - cache_entry->ctype_is_c = false; + Datum datum; + const char *colliculocale; + + datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_colliculocale); + colliculocale = TextDatumGetCString(datum); + + cache_entry->collate_is_c = locale_name_is_c(colliculocale); + cache_entry->ctype_is_c = cache_entry->collate_is_c; } cache_entry->flags_valid = true; @@ -1258,6 +1262,22 @@ lookup_collation_cache(Oid collation, bool set_flags) return cache_entry; } +/* + * Check if the locale name should be handled like the C locale. + * + * If so, the locale should be handled with built-in memcmp() and + * pg_ascii_*(); otherwise, the locale should be handled by the collation + * provider. 
+ */ +bool +locale_name_is_c(const char *locale) +{ + if (strcmp(locale, "C") == 0 || strncmp(locale, "C.", 2) == 0 || + strcmp(locale, "POSIX") == 0 || strncmp(locale, "POSIX.", 6) == 0) + return true; + + return false; +} /* * Detect whether collation's LC_COLLATE property is C @@ -1279,23 +1299,30 @@ lc_collate_is_c(Oid collation) if (collation == DEFAULT_COLLATION_OID) { static int result = -1; - char *localeptr; - - if (default_locale.provider == COLLPROVIDER_ICU) - return false; + const char *localeptr; if (result >= 0) return (bool) result; - localeptr = setlocale(LC_COLLATE, NULL); - if (!localeptr) - elog(ERROR, "invalid LC_COLLATE setting"); - - if (strcmp(localeptr, "C") == 0) - result = true; - else if (strcmp(localeptr, "POSIX") == 0) - result = true; + + if (default_locale.provider == COLLPROVIDER_ICU) + { +#ifdef USE_ICU + localeptr = default_locale.info.icu.locale; +#else + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("ICU is not supported in this build"))); +#endif + } else - result = false; + { + localeptr = setlocale(LC_COLLATE, NULL); + if (!localeptr) + elog(ERROR, "invalid LC_COLLATE setting"); + } + + result = locale_name_is_c(localeptr); + return (bool) result; } @@ -1332,23 +1359,30 @@ lc_ctype_is_c(Oid collation) if (collation == DEFAULT_COLLATION_OID) { static int result = -1; - char *localeptr; - - if (default_locale.provider == COLLPROVIDER_ICU) - return false; + const char *localeptr; if (result >= 0) return (bool) result; - localeptr = setlocale(LC_CTYPE, NULL); - if (!localeptr) - elog(ERROR, "invalid LC_CTYPE setting"); - - if (strcmp(localeptr, "C") == 0) - result = true; - else if (strcmp(localeptr, "POSIX") == 0) - result = true; + + if (default_locale.provider == COLLPROVIDER_ICU) + { +#ifdef USE_ICU + localeptr = default_locale.info.icu.locale; +#else + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("ICU is not supported in this build"))); +#endif + } else - result = false; + { + 
localeptr = setlocale(LC_CTYPE, NULL); + if (!localeptr) + elog(ERROR, "invalid LC_CTYPE setting"); + } + + result = locale_name_is_c(localeptr); + return (bool) result; } @@ -1375,7 +1409,13 @@ make_icu_collator(const char *iculocstr, #ifdef USE_ICU UCollator *collator; - collator = pg_ucol_open(iculocstr); + if (locale_name_is_c(iculocstr)) + { + Assert(icurules == NULL); + collator = NULL; + } + else + collator = pg_ucol_open(iculocstr); /* * If rules are specified, we extract the rules of the standard collation, @@ -1525,6 +1565,9 @@ pg_newlocale_from_collation(Oid collid) datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_collctype); collctype = TextDatumGetCString(datum); + Assert(!locale_name_is_c(collcollate)); + Assert(!locale_name_is_c(collctype)); + if (strcmp(collcollate, collctype) == 0) { /* Normal case where they're the same */ @@ -1581,6 +1624,8 @@ pg_newlocale_from_collation(Oid collid) datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_colliculocale); iculocstr = TextDatumGetCString(datum); + Assert(!locale_name_is_c(iculocstr)); + datum = SysCacheGetAttr(COLLOID, tp, Anum_pg_collation_collicurules, &isnull); if (!isnull) icurules = TextDatumGetCString(datum); @@ -1650,6 +1695,9 @@ get_collation_actual_version(char collprovider, const char *collcollate) { char *collversion = NULL; + if (locale_name_is_c(collcollate)) + return NULL; + #ifdef USE_ICU if (collprovider == COLLPROVIDER_ICU) { @@ -1667,10 +1715,7 @@ get_collation_actual_version(char collprovider, const char *collcollate) } else #endif - if (collprovider == COLLPROVIDER_LIBC && - pg_strcasecmp("C", collcollate) != 0 && - pg_strncasecmp("C.", collcollate, 2) != 0 && - pg_strcasecmp("POSIX", collcollate) != 0) + if (collprovider == COLLPROVIDER_LIBC) { #if defined(__GLIBC__) /* Use the glibc version because we don't have anything better. 
*/ @@ -2457,6 +2502,13 @@ pg_ucol_open(const char *loc_str) if (loc_str == NULL) elog(ERROR, "opening default collator is not supported"); + /* + * Must never open special values C or POSIX, which are treated specially + * and not passed to the provider. + */ + if (locale_name_is_c(loc_str)) + elog(ERROR, "unexpected ICU locale string: %s", loc_str); + /* * In ICU versions 54 and earlier, "und" is not a recognized spelling of * the root locale. If the first component of the locale is "und", replace diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index 6856ed99e7..92928133c0 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -419,9 +419,7 @@ CheckMyDatabase(const char *name, bool am_superuser, bool override_allow_connect " which is not recognized by setlocale().", ctype), errhint("Recreate the database with another locale or install the missing locale."))); - if (strcmp(ctype, "C") == 0 || - strcmp(ctype, "POSIX") == 0) - database_ctype_is_c = true; + database_ctype_is_c = locale_name_is_c(ctype); if (dbform->datlocprovider == COLLPROVIDER_ICU) { diff --git a/src/backend/utils/mb/mbutils.c b/src/backend/utils/mb/mbutils.c index 67a1ab2ab2..9a54a952e0 100644 --- a/src/backend/utils/mb/mbutils.c +++ b/src/backend/utils/mb/mbutils.c @@ -39,6 +39,7 @@ #include "mb/pg_wchar.h" #include "utils/builtins.h" #include "utils/memutils.h" +#include "utils/pg_locale.h" #include "utils/syscache.h" #include "varatt.h" @@ -1239,7 +1240,7 @@ pg_bind_textdomain_codeset(const char *domainname) #ifndef WIN32 const char *ctype = setlocale(LC_CTYPE, NULL); - if (pg_strcasecmp(ctype, "C") == 0 || pg_strcasecmp(ctype, "POSIX") == 0) + if (locale_name_is_c(ctype)) #endif if (encoding != PG_SQL_ASCII && raw_pg_bind_textdomain_codeset(domainname, encoding)) diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h index e2a7243542..0e26346546 100644 --- a/src/include/utils/pg_locale.h +++ 
b/src/include/utils/pg_locale.h @@ -54,6 +54,7 @@ extern PGDLLIMPORT bool database_ctype_is_c; extern bool check_locale(int category, const char *locale, char **canonname); extern char *pg_perm_setlocale(int category, const char *locale); +extern bool locale_name_is_c(const char *locale); extern bool lc_collate_is_c(Oid collation); extern bool lc_ctype_is_c(Oid collation); diff --git a/src/test/regress/expected/collate.icu.utf8.out b/src/test/regress/expected/collate.icu.utf8.out index c658ee1404..79ce33abbd 100644 --- a/src/test/regress/expected/collate.icu.utf8.out +++ b/src/test/regress/expected/collate.icu.utf8.out @@ -1043,12 +1043,18 @@ ERROR: ICU locale "nonsense-nowhere" has unknown language "nonsense" HINT: To disable ICU locale validation, set parameter icu_validation_level to DISABLED. CREATE COLLATION testx (provider = icu, locale = '@colStrength=primary;nonsense=yes'); -- fails ERROR: could not convert locale name "@colStrength=primary;nonsense=yes" to language tag: U_ILLEGAL_ARGUMENT_ERROR +CREATE COLLATION testx (provider = icu, locale = 'C', deterministic = false); -- fails +ERROR: nondeterministic collations not supported for C or POSIX locale +CREATE COLLATION testx (provider = icu, locale = 'C', rules = '&V << w <<< W'); -- fails +ERROR: RULES not supported for C or POSIX locale RESET icu_validation_level; CREATE COLLATION testx (provider = icu, locale = '@colStrength=primary;nonsense=yes'); DROP COLLATION testx; WARNING: could not convert locale name "@colStrength=primary;nonsense=yes" to language tag: U_ILLEGAL_ARGUMENT_ERROR CREATE COLLATION testx (provider = icu, locale = 'nonsense-nowhere'); DROP COLLATION testx; WARNING: ICU locale "nonsense-nowhere" has unknown language "nonsense" HINT: To disable ICU locale validation, set parameter icu_validation_level to DISABLED. 
+CREATE COLLATION testx (provider = icu, locale = 'C.UTF-8'); DROP COLLATION testx; +CREATE COLLATION testx (provider = icu, locale = 'POSIX'); DROP COLLATION testx; CREATE COLLATION test4 FROM nonsense; ERROR: collation "nonsense" for encoding "UTF8" does not exist CREATE COLLATION test5 FROM test0; diff --git a/src/test/regress/expected/collate.out b/src/test/regress/expected/collate.out index 0649564485..e2d0a39732 100644 --- a/src/test/regress/expected/collate.out +++ b/src/test/regress/expected/collate.out @@ -649,6 +649,11 @@ EXPLAIN (COSTS OFF) -> Seq Scan on collate_test10 (3 rows) +-- test alternate spellings of special locale C +CREATE COLLATION coll_c_locale ( LOCALE = "C.something" ); +DROP COLLATION coll_c_locale; +CREATE COLLATION coll_c_locale ( LOCALE = "POSIX.something" ); +DROP COLLATION coll_c_locale; -- CREATE/DROP COLLATION CREATE COLLATION mycoll1 FROM "C"; CREATE COLLATION mycoll2 ( LC_COLLATE = "POSIX", LC_CTYPE = "POSIX" ); diff --git a/src/test/regress/sql/collate.icu.utf8.sql b/src/test/regress/sql/collate.icu.utf8.sql index 7bd0901281..adc6b7deec 100644 --- a/src/test/regress/sql/collate.icu.utf8.sql +++ b/src/test/regress/sql/collate.icu.utf8.sql @@ -379,9 +379,13 @@ CREATE COLLATION test3 (provider = icu, lc_collate = 'en_US.utf8'); -- fail, nee SET icu_validation_level = ERROR; CREATE COLLATION testx (provider = icu, locale = 'nonsense-nowhere'); -- fails CREATE COLLATION testx (provider = icu, locale = '@colStrength=primary;nonsense=yes'); -- fails +CREATE COLLATION testx (provider = icu, locale = 'C', deterministic = false); -- fails +CREATE COLLATION testx (provider = icu, locale = 'C', rules = '&V << w <<< W'); -- fails RESET icu_validation_level; CREATE COLLATION testx (provider = icu, locale = '@colStrength=primary;nonsense=yes'); DROP COLLATION testx; CREATE COLLATION testx (provider = icu, locale = 'nonsense-nowhere'); DROP COLLATION testx; +CREATE COLLATION testx (provider = icu, locale = 'C.UTF-8'); DROP COLLATION testx; 
+CREATE COLLATION testx (provider = icu, locale = 'POSIX'); DROP COLLATION testx; CREATE COLLATION test4 FROM nonsense; CREATE COLLATION test5 FROM test0; diff --git a/src/test/regress/sql/collate.sql b/src/test/regress/sql/collate.sql index c3d40fc195..10ff532169 100644 --- a/src/test/regress/sql/collate.sql +++ b/src/test/regress/sql/collate.sql @@ -241,6 +241,11 @@ EXPLAIN (COSTS OFF) EXPLAIN (COSTS OFF) SELECT * FROM collate_test10 ORDER BY x DESC, y COLLATE "C" ASC NULLS FIRST; +-- test alternate spellings of special locale C +CREATE COLLATION coll_c_locale ( LOCALE = "C.something" ); +DROP COLLATION coll_c_locale; +CREATE COLLATION coll_c_locale ( LOCALE = "POSIX.something" ); +DROP COLLATION coll_c_locale; -- CREATE/DROP COLLATION -- 2.34.1