From 23e85920dbcfd1d3e71041f92c4adea589acd4f2 Mon Sep 17 00:00:00 2001 From: Jeff Davis Date: Mon, 8 May 2023 13:48:01 -0700 Subject: [PATCH v5 5/7] ICU: for locale "C", automatically use "none" provider instead. Postgres expects locale C to be optimizable to simple locale-unaware byte operations; while ICU does not recognize the locale "C" at all, and falls back to the root locale. If the user specifies locale "C" when creating a new collation or a new database with the ICU provider, automatically switch it to the "none" provider. If provider is libc, behavior is unchanged. --- doc/src/sgml/charset.sgml | 6 +++ doc/src/sgml/ref/create_collation.sgml | 6 +++ doc/src/sgml/ref/create_database.sgml | 5 +++ doc/src/sgml/ref/createdb.sgml | 5 +++ doc/src/sgml/ref/initdb.sgml | 5 +++ src/backend/commands/collationcmds.c | 17 ++++++++ src/backend/commands/dbcommands.c | 21 ++++++++++ src/bin/initdb/initdb.c | 10 +++++ src/bin/initdb/t/001_initdb.pl | 39 +++++++++++++++++++ src/bin/scripts/createdb.c | 11 ++++++ src/bin/scripts/t/020_createdb.pl | 12 ++++++ .../regress/expected/collate.icu.utf8.out | 12 ++++-- src/test/regress/sql/collate.icu.utf8.sql | 3 ++ 13 files changed, 149 insertions(+), 3 deletions(-) diff --git a/doc/src/sgml/charset.sgml b/doc/src/sgml/charset.sgml index de7c65ae35..5c4f713e8b 100644 --- a/doc/src/sgml/charset.sgml +++ b/doc/src/sgml/charset.sgml @@ -405,6 +405,12 @@ initdb --locale-provider=icu --icu-locale=en change in results. LC_COLLATE and LC_CTYPE can be set independently of the ICU locale. + + The ICU provider does not accept the C + locale. Commands that create collations or database with the + icu provider and ICU locale C use + the provider none instead. + For the ICU provider, results may depend on the version of the ICU diff --git a/doc/src/sgml/ref/create_collation.sgml b/doc/src/sgml/ref/create_collation.sgml index 5489ae7413..1ac41831d8 100644 --- a/doc/src/sgml/ref/create_collation.sgml +++ b/doc/src/sgml/ref/create_collation.sgml @@ -126,6 +126,12 @@ CREATE COLLATION [ IF NOT EXISTS ] name FROM libc is the default. See for details. + + If the provider is icu and the locale is + C or POSIX, the provider is + automatically set to none; as the ICU provider + doesn't support an ICU locale of C. + diff --git a/doc/src/sgml/ref/create_database.sgml b/doc/src/sgml/ref/create_database.sgml index 60b9da0952..c730d02e15 100644 --- a/doc/src/sgml/ref/create_database.sgml +++ b/doc/src/sgml/ref/create_database.sgml @@ -190,6 +190,11 @@ CREATE DATABASE name Specifies the ICU locale ID if the ICU locale provider is used. + + If specified as C or POSIX, the + provider is automatically set to none, as the ICU + provider doesn't support an ICU locale of C. + diff --git a/doc/src/sgml/ref/createdb.sgml b/doc/src/sgml/ref/createdb.sgml index 326a371d34..7c573e848a 100644 --- a/doc/src/sgml/ref/createdb.sgml +++ b/doc/src/sgml/ref/createdb.sgml @@ -154,6 +154,11 @@ PostgreSQL documentation Specifies the ICU locale ID to be used in this database, if the ICU locale provider is selected. + + If specified as C or POSIX, the + provider is automatically set to none, as the ICU + provider doesn't support an ICU locale of C. + diff --git a/doc/src/sgml/ref/initdb.sgml b/doc/src/sgml/ref/initdb.sgml index e604ab48b7..76993acdfe 100644 --- a/doc/src/sgml/ref/initdb.sgml +++ b/doc/src/sgml/ref/initdb.sgml @@ -250,6 +250,11 @@ PostgreSQL documentation Specifies the ICU locale when the ICU provider is used. Locale support is described in . + + If specified as C or POSIX, the + provider is automatically set to none, as the ICU + provider doesn't support an ICU locale of C. + If this option is not specified, the locale is inherited from the environment in which initdb runs. The environment's diff --git a/src/backend/commands/collationcmds.c b/src/backend/commands/collationcmds.c index 267a551818..ed64e17504 100644 --- a/src/backend/commands/collationcmds.c +++ b/src/backend/commands/collationcmds.c @@ -254,6 +254,23 @@ DefineCollation(ParseState *pstate, List *names, List *parameters, bool if_not_e if (lcctypeEl) collctype = defGetString(lcctypeEl); + /* + * Postgres defines the "C" (and equivalently, "POSIX") locales to be + * optimizable to byte operations (memcmp(), pg_ascii_tolower(), + * etc.); transform into the "none" provider. Don't transform during + * binary upgrade. + */ + if (!IsBinaryUpgrade && collprovider == COLLPROVIDER_ICU && + colliculocale && (pg_strcasecmp(colliculocale, "C") == 0 || + pg_strcasecmp(colliculocale, "POSIX") == 0)) + { + ereport(NOTICE, + (errmsg("using locale provider \"none\" for ICU locale \"%s\"", + colliculocale))); + colliculocale = NULL; + collprovider = COLLPROVIDER_NONE; + } + if (collprovider == COLLPROVIDER_LIBC) { if (!collcollate) diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index 9e73f54803..6dc737aebb 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -1043,6 +1043,27 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) check_encoding_locale_matches(encoding, dbcollate, dbctype); + /* + * Postgres defines the "C" (and equivalently, "POSIX") locales to be + * optimizable to byte operations (memcmp(), pg_ascii_tolower(), etc.); + * transform into the "none" provider. + * + * Don't transform during binary upgrade or when both the provider and ICU + * locale are unchanged from the template. + */ + if (!IsBinaryUpgrade && dblocprovider == COLLPROVIDER_ICU && + (src_locprovider != COLLPROVIDER_ICU || + strcmp(dbiculocale, src_iculocale) != 0) && + dbiculocale && (pg_strcasecmp(dbiculocale, "C") == 0 || + pg_strcasecmp(dbiculocale, "POSIX") == 0)) + { + ereport(NOTICE, + (errmsg("using locale provider \"none\" for ICU locale \"%s\"", + dbiculocale))); + dbiculocale = NULL; + dblocprovider = COLLPROVIDER_NONE; + } + if (dblocprovider == COLLPROVIDER_ICU) { if (!(is_encoding_supported_by_icu(encoding))) diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index 4cf6892bee..ea26bf8361 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -2501,6 +2501,16 @@ setlocales(void) lc_messages = locale; } + if (icu_locale && locale_provider == COLLPROVIDER_ICU && + (pg_strcasecmp(icu_locale, "C") == 0 || + pg_strcasecmp(icu_locale, "POSIX") == 0)) + { + pg_log_info("using locale provider \"none\" for ICU locale \"%s\"", + icu_locale); + icu_locale = NULL; + locale_provider = COLLPROVIDER_NONE; + } + /* * canonicalize locale names, and obtain any missing values from our * current environment diff --git a/src/bin/initdb/t/001_initdb.pl b/src/bin/initdb/t/001_initdb.pl index fe6d224e5b..ea92b08511 100644 --- a/src/bin/initdb/t/001_initdb.pl +++ b/src/bin/initdb/t/001_initdb.pl @@ -111,6 +111,45 @@ if ($ENV{with_icu} eq 'yes') ], 'option --icu-locale'); + # transformed to provider=none + command_ok( + [ + 'initdb', '--no-sync', + '--locale-provider=icu', '--icu-locale=C', + "$tempdir/data4a" + ], + 'option --icu-locale=C'); + + # transformed to provider=none + command_ok( + [ + 'initdb', '--no-sync', + '--locale-provider=icu', '--icu-locale=C', + '--locale=C', + "$tempdir/data4b" + ], + 'option --icu-locale=C --locale=C'); + + # transformed to provider=none + command_ok( + [ + 'initdb', '--no-sync', + '--locale-provider=icu', '--icu-locale=C', + '--lc-collate=C', + "$tempdir/data4c" + ], + 'option --icu-locale=C --lc-collate=C'); + + # transformed to provider=none + command_ok( + [ + 'initdb', '--no-sync', + '--locale-provider=icu', '--icu-locale=C', + '--lc-ctype=C', + "$tempdir/data4d" + ], + 'option --icu-locale=C --lc-ctype=C'); + command_fails_like( [ 'initdb', '--no-sync', diff --git a/src/bin/scripts/createdb.c b/src/bin/scripts/createdb.c index 79367d933b..9caf9190cf 100644 --- a/src/bin/scripts/createdb.c +++ b/src/bin/scripts/createdb.c @@ -172,6 +172,17 @@ main(int argc, char *argv[]) lc_collate = locale; } + if (locale_provider && pg_strcasecmp(locale_provider, "icu") == 0 && + icu_locale && + (pg_strcasecmp(icu_locale, "C") == 0 || + pg_strcasecmp(icu_locale, "POSIX") == 0)) + { + pg_log_info("using locale provider \"none\" for ICU locale \"%s\"", + icu_locale); + icu_locale = NULL; + locale_provider = "none"; + } + if (encoding) { if (pg_char_to_encoding(encoding) < 0) diff --git a/src/bin/scripts/t/020_createdb.pl b/src/bin/scripts/t/020_createdb.pl index 5aa658b671..eb3682f0fd 100644 --- a/src/bin/scripts/t/020_createdb.pl +++ b/src/bin/scripts/t/020_createdb.pl @@ -75,6 +75,18 @@ if ($ENV{with_icu} eq 'yes') $node2->command_ok( [ 'createdb', '-T', 'template0', '--icu-locale', 'en-US', 'foobar56' ], 'create database with icu locale from template database with icu provider'); + + # transformed into provider "none" + $node->command_ok( + [ 'createdb', '-T', 'template0', '--locale-provider=icu', '--icu-locale=C', + 'test_none_icu1' ], + 'create database with provider "icu" and ICU_LOCALE="C"'); + + # transformed into provider "none" + $node->command_ok( + [ 'createdb', '-T', 'template0', '--locale-provider=icu', '--icu-locale=C', + '--lc-ctype=C', 'test_none_icu_2' ], + 'create database with provider "icu" and ICU_LOCALE="C" and LC_CTYPE=C'); } else { diff --git a/src/test/regress/expected/collate.icu.utf8.out b/src/test/regress/expected/collate.icu.utf8.out index 12afc3b65a..c0437231ad 100644 --- a/src/test/regress/expected/collate.icu.utf8.out +++ b/src/test/regress/expected/collate.icu.utf8.out @@ -1035,6 +1035,9 @@ BEGIN END $$; RESET client_min_messages; +-- uses "none" provider instead +CREATE COLLATION testc (provider = icu, locale='C'); +NOTICE: using locale provider "none" for ICU locale "C" CREATE COLLATION test3 (provider = icu, lc_collate = 'en_US.utf8'); -- fail, needs "locale" ERROR: parameter "locale" must be specified SET icu_validation_level = ERROR; @@ -1069,7 +1072,8 @@ SELECT collname FROM pg_collation WHERE collname LIKE 'test%' ORDER BY 1; test0 test1 test5 -(3 rows) + testc +(4 rows) ALTER COLLATION test1 RENAME TO test11; ALTER COLLATION test0 RENAME TO test11; -- fail @@ -1090,7 +1094,8 @@ SELECT collname, nspname, obj_description(pg_collation.oid, 'pg_collation') test0 | collate_tests | US English test11 | test_schema | test5 | collate_tests | -(3 rows) + testc | collate_tests | +(4 rows) DROP COLLATION test0, test_schema.test11, test5; DROP COLLATION test0; -- fail @@ -1100,7 +1105,8 @@ NOTICE: collation "test0" does not exist, skipping SELECT collname FROM pg_collation WHERE collname LIKE 'test%'; collname ---------- -(0 rows) + testc +(1 row) DROP SCHEMA test_schema; DROP ROLE regress_test_role; diff --git a/src/test/regress/sql/collate.icu.utf8.sql b/src/test/regress/sql/collate.icu.utf8.sql index 655c965f46..63c29dfe2a 100644 --- a/src/test/regress/sql/collate.icu.utf8.sql +++ b/src/test/regress/sql/collate.icu.utf8.sql @@ -375,6 +375,9 @@ $$; RESET client_min_messages; +-- uses "none" provider instead +CREATE COLLATION testc (provider = icu, locale='C'); + CREATE COLLATION test3 (provider = icu, lc_collate = 'en_US.utf8'); -- fail, needs "locale" SET icu_validation_level = ERROR; CREATE COLLATION testx (provider = icu, locale = 'nonsense-nowhere'); -- fails -- 2.34.1