From 00117afe789db96daf39f6a8e2fa4b68c4469b35 Mon Sep 17 00:00:00 2001 From: Jeff Davis Date: Mon, 1 May 2023 15:38:29 -0700 Subject: [PATCH v14 5/5] Introduce collation provider "builtin" for "C" and "C.UTF-8". The builtin "C" locale is equal (in semantics and implementation) to the libc "C" locale, except that LC_COLLATE and LC_CTYPE can be configured independently. The builtin "C.UTF-8" locale is especially useful. It provides a fast memcmp-based collation (like "C") that supports abbreviated keys, while also providing richer ctype semantics (upper/lower and regexes). The semantics are derived from Unicode by building in lookup tables in the same way as for text normalization. By using built-in semantics, the behavior is stabilized within a Postgres major version, and also matches the behavior of other built-in Unicode functionality, such as normalization. Discussion: https://postgr.es/m/ab925f69-5f9d-f85e-b87c-bd2a44798659@joeconway.com Discussion: https://postgr.es/m/dd9261f4-7a98-4565-93ec-336c1c110d90@manitou-mail.org --- doc/src/sgml/charset.sgml | 88 ++++++++++--- doc/src/sgml/ref/create_collation.sgml | 11 +- doc/src/sgml/ref/create_database.sgml | 8 +- doc/src/sgml/ref/createdb.sgml | 2 +- doc/src/sgml/ref/initdb.sgml | 7 +- src/backend/catalog/pg_collation.c | 5 +- src/backend/commands/collationcmds.c | 91 +++++++++++--- src/backend/commands/dbcommands.c | 105 ++++++++++++---- src/backend/regex/regc_pg_locale.c | 41 ++++++- src/backend/utils/adt/formatting.c | 122 +++++++++++++++++++ src/backend/utils/adt/pg_locale.c | 107 +++++++++++++--- src/backend/utils/init/postinit.c | 27 +++- src/bin/initdb/initdb.c | 16 ++- src/bin/initdb/t/001_initdb.pl | 47 +++++++ src/bin/pg_dump/pg_dump.c | 15 ++- src/bin/pg_upgrade/t/002_pg_upgrade.pl | 25 +++- src/bin/psql/describe.c | 4 +- src/bin/scripts/createdb.c | 2 +- src/bin/scripts/t/020_createdb.pl | 56 +++++++++ src/include/catalog/pg_collation.dat | 4 +- src/include/catalog/pg_collation.h | 3 + 
src/include/utils/pg_locale.h | 7 +- src/test/icu/t/010_database.pl | 18 +-- src/test/regress/expected/collate.out | 24 +++- src/test/regress/expected/collate.utf8.out | 93 ++++++++++++++ src/test/regress/expected/collate.utf8_1.out | 8 ++ src/test/regress/parallel_schedule | 4 +- src/test/regress/sql/collate.sql | 10 ++ src/test/regress/sql/collate.utf8.sql | 47 +++++++ 29 files changed, 885 insertions(+), 112 deletions(-) create mode 100644 src/test/regress/expected/collate.utf8.out create mode 100644 src/test/regress/expected/collate.utf8_1.out create mode 100644 src/test/regress/sql/collate.utf8.sql diff --git a/doc/src/sgml/charset.sgml b/doc/src/sgml/charset.sgml index 74783d148f..51315ceac7 100644 --- a/doc/src/sgml/charset.sgml +++ b/doc/src/sgml/charset.sgml @@ -342,22 +342,14 @@ initdb --locale=sv_SE Locale Providers - PostgreSQL supports multiple locale - providers. This specifies which library supplies the locale - data. One standard provider name is libc, which uses - the locales provided by the operating system C library. These are the - locales used by most tools provided by the operating system. Another - provider is icu, which uses the external - ICUICU library. ICU locales can - only be used if support for ICU was configured when PostgreSQL was built. + A locale provider specifies which library defines the locale behavior for + collations and character classifications. The commands and tools that select the locale settings, as described - above, each have an option to select the locale provider. The examples - shown earlier all use the libc provider, which is the - default. Here is an example to initialize a database cluster using the - ICU provider: + above, each have an option to select the locale provider. 
Here is an + example to initialize a database cluster using the ICU provider: initdb --locale-provider=icu --icu-locale=en @@ -370,12 +362,74 @@ initdb --locale-provider=icu --icu-locale=en - Which locale provider to use depends on individual requirements. For most - basic uses, either provider will give adequate results. For the libc - provider, it depends on what the operating system offers; some operating - systems are better than others. For advanced uses, ICU offers more locale - variants and customization options. + Regardless of the locale provider, the operating system is still used to + provide some locale-aware behavior, such as messages (see ). + + + The available locale providers are listed below. + + + + Builtin + + The builtin provider uses built-in operations. Only + the C and C.UTF-8 locales are + supported for this provider. + + + The collation and character classification behavior is equivalent to + using the libc provider with locale + C, except that LC_COLLATE and + LC_CTYPE can be set independently. + + + + When using the builtin locale provider, behavior may + depend on the database encoding. + + + + + ICU + + The icu provider uses the external + ICUICU + library. PostgreSQL must have been configured + with support. + + + ICU provides collation and character classification behavior that is + independent of the operating system and database encoding, which is + preferable if you expect to transition to other platforms without any + change in results. LC_COLLATE and + LC_CTYPE can be set independently of the ICU locale. + + + + For the ICU provider, results may depend on the version of the ICU + library used, as it is updated to reflect changes in natural language + over time. + + + + + libc + + The libc provider uses the operating system's C + library. The collation and character classification behavior is + controlled by the settings LC_COLLATE and + LC_CTYPE, so they cannot be set independently. 
+ + + + The same locale name may have different behavior on different platforms + when using the libc provider. + + + + diff --git a/doc/src/sgml/ref/create_collation.sgml b/doc/src/sgml/ref/create_collation.sgml index 5cf9777764..85f18cbbe5 100644 --- a/doc/src/sgml/ref/create_collation.sgml +++ b/doc/src/sgml/ref/create_collation.sgml @@ -96,6 +96,11 @@ CREATE COLLATION [ IF NOT EXISTS ] name FROM locale, you cannot specify either of those parameters. + + If provider is builtin, + then locale must be specified and set to + either C or C.UTF-8. + @@ -129,9 +134,9 @@ CREATE COLLATION [ IF NOT EXISTS ] name FROM Specifies the provider to use for locale services associated with this - collation. Possible values are - icuICU - (if the server was built with ICU support) or libc. + collation. Possible values are builtin, + icuICU (if + the server was built with ICU support) or libc. libc is the default. See for details. diff --git a/doc/src/sgml/ref/create_database.sgml b/doc/src/sgml/ref/create_database.sgml index ce7317f81b..6dc3348d1b 100644 --- a/doc/src/sgml/ref/create_database.sgml +++ b/doc/src/sgml/ref/create_database.sgml @@ -162,6 +162,12 @@ CREATE DATABASE name linkend="create-database-lc-ctype"/>, or individually. + + If is + builtin, then locale + must be specified and set to either C or + C.UTF-8. + The other locale settings , name Specifies the provider to use for the default collation in this - database. Possible values are + database. Possible values are builtin, icuICU (if the server was built with ICU support) or libc. By default, the provider is the same as that of the - + Specifies the locale provider for the database's default collation. diff --git a/doc/src/sgml/ref/initdb.sgml b/doc/src/sgml/ref/initdb.sgml index d43c91575c..2192d017b3 100644 --- a/doc/src/sgml/ref/initdb.sgml +++ b/doc/src/sgml/ref/initdb.sgml @@ -286,6 +286,11 @@ PostgreSQL documentation environment that initdb runs in. Locale support is described in . 
+ + If is builtin, + must be specified and set to + C or C.UTF-8. + @@ -315,7 +320,7 @@ PostgreSQL documentation - + This option sets the locale provider for databases created in the new diff --git a/src/backend/catalog/pg_collation.c b/src/backend/catalog/pg_collation.c index b615d2fc7a..2144e11fe8 100644 --- a/src/backend/catalog/pg_collation.c +++ b/src/backend/catalog/pg_collation.c @@ -68,7 +68,10 @@ CollationCreate(const char *collname, Oid collnamespace, Assert(collname); Assert(collnamespace); Assert(collowner); - Assert((collcollate && collctype) || colllocale); + Assert((collprovider == COLLPROVIDER_LIBC && + collcollate && collctype && !colllocale) || + (collprovider != COLLPROVIDER_LIBC && + !collcollate && !collctype && colllocale)); /* * Make sure there is no existing collation of same name & encoding. diff --git a/src/backend/commands/collationcmds.c b/src/backend/commands/collationcmds.c index 19e61b3e5b..3514cd9d3d 100644 --- a/src/backend/commands/collationcmds.c +++ b/src/backend/commands/collationcmds.c @@ -215,7 +215,9 @@ DefineCollation(ParseState *pstate, List *names, List *parameters, bool if_not_e if (collproviderstr) { - if (pg_strcasecmp(collproviderstr, "icu") == 0) + if (pg_strcasecmp(collproviderstr, "builtin") == 0) + collprovider = COLLPROVIDER_BUILTIN; + else if (pg_strcasecmp(collproviderstr, "icu") == 0) collprovider = COLLPROVIDER_ICU; else if (pg_strcasecmp(collproviderstr, "libc") == 0) collprovider = COLLPROVIDER_LIBC; @@ -245,7 +247,17 @@ DefineCollation(ParseState *pstate, List *names, List *parameters, bool if_not_e if (lcctypeEl) collctype = defGetString(lcctypeEl); - if (collprovider == COLLPROVIDER_LIBC) + if (collprovider == COLLPROVIDER_BUILTIN) + { + if (!colllocale) + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("parameter \"locale\" must be specified"))); + + colllocale = builtin_validate_locale(GetDatabaseEncoding(), + colllocale); + } + else if (collprovider == COLLPROVIDER_LIBC) { if 
(!collcollate) ereport(ERROR, @@ -305,7 +317,17 @@ DefineCollation(ParseState *pstate, List *names, List *parameters, bool if_not_e (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), errmsg("ICU rules cannot be specified unless locale provider is ICU"))); - if (collprovider == COLLPROVIDER_ICU) + if (collprovider == COLLPROVIDER_BUILTIN) + { + /* + * Behavior may be different in different encodings, so set + * collencoding to the current database encoding. No validation is + * required, because the "builtin" provider is compatible with any + * encoding. + */ + collencoding = GetDatabaseEncoding(); + } + else if (collprovider == COLLPROVIDER_ICU) { #ifdef USE_ICU /* @@ -334,7 +356,18 @@ DefineCollation(ParseState *pstate, List *names, List *parameters, bool if_not_e } if (!collversion) - collversion = get_collation_actual_version(collprovider, collprovider == COLLPROVIDER_ICU ? colllocale : collcollate); + { + char *locale; + + if (collprovider == COLLPROVIDER_ICU) + locale = colllocale; + else if (collprovider == COLLPROVIDER_LIBC) + locale = collcollate; + else + locale = NULL; /* COLLPROVIDER_BUILTIN */ + + collversion = get_collation_actual_version(collprovider, locale); + } newoid = CollationCreate(collName, collNamespace, @@ -409,6 +442,7 @@ AlterCollation(AlterCollationStmt *stmt) Form_pg_collation collForm; Datum datum; bool isnull; + char *locale; char *oldversion; char *newversion; ObjectAddress address; @@ -435,8 +469,20 @@ AlterCollation(AlterCollationStmt *stmt) datum = SysCacheGetAttr(COLLOID, tup, Anum_pg_collation_collversion, &isnull); oldversion = isnull ? NULL : TextDatumGetCString(datum); - datum = SysCacheGetAttrNotNull(COLLOID, tup, collForm->collprovider == COLLPROVIDER_ICU ? 
Anum_pg_collation_colllocale : Anum_pg_collation_collcollate); - newversion = get_collation_actual_version(collForm->collprovider, TextDatumGetCString(datum)); + if (collForm->collprovider == COLLPROVIDER_ICU) + { + datum = SysCacheGetAttrNotNull(COLLOID, tup, Anum_pg_collation_colllocale); + locale = TextDatumGetCString(datum); + } + else if (collForm->collprovider == COLLPROVIDER_LIBC) + { + datum = SysCacheGetAttrNotNull(COLLOID, tup, Anum_pg_collation_collcollate); + locale = TextDatumGetCString(datum); + } + else + locale = NULL; /* COLLPROVIDER_BUILTIN */ + + newversion = get_collation_actual_version(collForm->collprovider, locale); /* cannot change from NULL to non-NULL or vice versa */ if ((!oldversion && newversion) || (oldversion && !newversion)) @@ -500,11 +546,18 @@ pg_collation_actual_version(PG_FUNCTION_ARGS) provider = ((Form_pg_database) GETSTRUCT(dbtup))->datlocprovider; - datum = SysCacheGetAttrNotNull(DATABASEOID, dbtup, - provider == COLLPROVIDER_ICU ? - Anum_pg_database_datlocale : Anum_pg_database_datcollate); - - locale = TextDatumGetCString(datum); + if (provider == COLLPROVIDER_ICU) + { + datum = SysCacheGetAttrNotNull(DATABASEOID, dbtup, Anum_pg_database_datlocale); + locale = TextDatumGetCString(datum); + } + else if (provider == COLLPROVIDER_LIBC) + { + datum = SysCacheGetAttrNotNull(DATABASEOID, dbtup, Anum_pg_database_datcollate); + locale = TextDatumGetCString(datum); + } + else + locale = NULL; /* COLLPROVIDER_BUILTIN */ ReleaseSysCache(dbtup); } @@ -521,11 +574,19 @@ pg_collation_actual_version(PG_FUNCTION_ARGS) provider = ((Form_pg_collation) GETSTRUCT(colltp))->collprovider; Assert(provider != COLLPROVIDER_DEFAULT); - datum = SysCacheGetAttrNotNull(COLLOID, colltp, - provider == COLLPROVIDER_ICU ? 
- Anum_pg_collation_colllocale : Anum_pg_collation_collcollate); - locale = TextDatumGetCString(datum); + if (provider == COLLPROVIDER_ICU) + { + datum = SysCacheGetAttrNotNull(COLLOID, colltp, Anum_pg_collation_colllocale); + locale = TextDatumGetCString(datum); + } + else if (provider == COLLPROVIDER_LIBC) + { + datum = SysCacheGetAttrNotNull(COLLOID, colltp, Anum_pg_collation_collcollate); + locale = TextDatumGetCString(datum); + } + else + locale = NULL; /* COLLPROVIDER_BUILTIN */ ReleaseSysCache(colltp); } diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index c8732be067..eaec22e771 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -897,6 +897,7 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) { dbcollate = defGetString(dlocale); dbctype = defGetString(dlocale); + dblocale = defGetString(dlocale); } if (dcollate && dcollate->arg) dbcollate = defGetString(dcollate); @@ -910,7 +911,9 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) { char *locproviderstr = defGetString(dlocprovider); - if (pg_strcasecmp(locproviderstr, "icu") == 0) + if (pg_strcasecmp(locproviderstr, "builtin") == 0) + dblocprovider = COLLPROVIDER_BUILTIN; + else if (pg_strcasecmp(locproviderstr, "icu") == 0) dblocprovider = COLLPROVIDER_ICU; else if (pg_strcasecmp(locproviderstr, "libc") == 0) dblocprovider = COLLPROVIDER_LIBC; @@ -1027,14 +1030,9 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) dbctype = src_ctype; if (dblocprovider == '\0') dblocprovider = src_locprovider; - if (dblocale == NULL && dblocprovider == COLLPROVIDER_ICU) - { - if (dlocale && dlocale->arg) - dblocale = defGetString(dlocale); - else - dblocale = src_locale; - } - if (dbicurules == NULL && dblocprovider == COLLPROVIDER_ICU) + if (dblocale == NULL) + dblocale = src_locale; + if (dbicurules == NULL) dbicurules = src_icurules; /* Some encodings are client only */ @@ -1059,6 +1057,20 @@ createdb(ParseState *pstate, const 
CreatedbStmt *stmt) check_encoding_locale_matches(encoding, dbcollate, dbctype); + if (dblocprovider == COLLPROVIDER_BUILTIN) + { + /* + * This would happen if template0 uses the libc provider but the new + * database uses builtin. + */ + if (!dblocale) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("LOCALE must be specified for the builtin provider"))); + + dblocale = builtin_validate_locale(encoding, dblocale); + } + if (dblocprovider == COLLPROVIDER_ICU) { if (!(is_encoding_supported_by_icu(encoding))) @@ -1100,7 +1112,7 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) } else { - if (dblocale) + if (diculocale && diculocale->arg) ereport(ERROR, (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), errmsg("ICU locale cannot be specified unless locale provider is ICU"))); @@ -1111,6 +1123,10 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) errmsg("ICU rules cannot be specified unless locale provider is ICU"))); } + /* for libc, locale comes from datcollate and datctype */ + if (dblocprovider == COLLPROVIDER_LIBC) + dblocale = NULL; + /* * Check that the new encoding and locale settings match the source * database. We insist on this because we simply copy the source data --- @@ -1195,9 +1211,17 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) */ if (src_collversion && !dcollversion) { - char *actual_versionstr; + char *actual_versionstr; + char *locale; - actual_versionstr = get_collation_actual_version(dblocprovider, dblocprovider == COLLPROVIDER_ICU ? 
dblocale : dbcollate); + if (dblocprovider == COLLPROVIDER_ICU) + locale = dblocale; + else if (dblocprovider == COLLPROVIDER_LIBC) + locale = dbcollate; + else + locale = NULL; /* COLLPROVIDER_BUILTIN */ + + actual_versionstr = get_collation_actual_version(dblocprovider, locale); if (!actual_versionstr) ereport(ERROR, (errmsg("template database \"%s\" has a collation version, but no actual collation version could be determined", @@ -1225,7 +1249,18 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) * collation version, which is normally only the case for template0. */ if (dbcollversion == NULL) - dbcollversion = get_collation_actual_version(dblocprovider, dblocprovider == COLLPROVIDER_ICU ? dblocale : dbcollate); + { + char *locale; + + if (dblocprovider == COLLPROVIDER_ICU) + locale = dblocale; + else if (dblocprovider == COLLPROVIDER_LIBC) + locale = dbcollate; + else + locale = NULL; /* COLLPROVIDER_BUILTIN */ + + dbcollversion = get_collation_actual_version(dblocprovider, locale); + } /* Resolve default tablespace for new database */ if (dtablespacename && dtablespacename->arg) @@ -1364,8 +1399,8 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) * block on the unique index, and fail after we commit). */ - Assert((dblocprovider == COLLPROVIDER_ICU && dblocale) || - (dblocprovider != COLLPROVIDER_ICU && !dblocale)); + Assert((dblocprovider != COLLPROVIDER_LIBC && dblocale) || + (dblocprovider == COLLPROVIDER_LIBC && !dblocale)); /* Form tuple */ new_record[Anum_pg_database_oid - 1] = ObjectIdGetDatum(dboid); @@ -2446,6 +2481,7 @@ AlterDatabaseRefreshColl(AlterDatabaseRefreshCollStmt *stmt) ObjectAddress address; Datum datum; bool isnull; + char *locale; char *oldversion; char *newversion; @@ -2472,10 +2508,24 @@ AlterDatabaseRefreshColl(AlterDatabaseRefreshCollStmt *stmt) datum = heap_getattr(tuple, Anum_pg_database_datcollversion, RelationGetDescr(rel), &isnull); oldversion = isnull ? 
NULL : TextDatumGetCString(datum); - datum = heap_getattr(tuple, datForm->datlocprovider == COLLPROVIDER_ICU ? Anum_pg_database_datlocale : Anum_pg_database_datcollate, RelationGetDescr(rel), &isnull); - if (isnull) - elog(ERROR, "unexpected null in pg_database"); - newversion = get_collation_actual_version(datForm->datlocprovider, TextDatumGetCString(datum)); + if (datForm->datlocprovider == COLLPROVIDER_ICU) + { + datum = heap_getattr(tuple, Anum_pg_database_datlocale, RelationGetDescr(rel), &isnull); + if (isnull) + elog(ERROR, "unexpected null in pg_database"); + locale = TextDatumGetCString(datum); + } + else if (datForm->datlocprovider == COLLPROVIDER_LIBC) + { + datum = heap_getattr(tuple, Anum_pg_database_datcollate, RelationGetDescr(rel), &isnull); + if (isnull) + elog(ERROR, "unexpected null in pg_database"); + locale = TextDatumGetCString(datum); + } + else + locale = NULL; /* COLLPROVIDER_BUILTIN */ + + newversion = get_collation_actual_version(datForm->datlocprovider, locale); /* cannot change from NULL to non-NULL or vice versa */ if ((!oldversion && newversion) || (oldversion && !newversion)) @@ -2660,6 +2710,7 @@ pg_database_collation_actual_version(PG_FUNCTION_ARGS) HeapTuple tp; char datlocprovider; Datum datum; + char *locale; char *version; tp = SearchSysCache1(DATABASEOID, ObjectIdGetDatum(dbid)); @@ -2670,8 +2721,20 @@ pg_database_collation_actual_version(PG_FUNCTION_ARGS) datlocprovider = ((Form_pg_database) GETSTRUCT(tp))->datlocprovider; - datum = SysCacheGetAttrNotNull(DATABASEOID, tp, datlocprovider == COLLPROVIDER_ICU ? 
Anum_pg_database_datlocale : Anum_pg_database_datcollate); - version = get_collation_actual_version(datlocprovider, TextDatumGetCString(datum)); + if (datlocprovider == COLLPROVIDER_ICU) + { + datum = SysCacheGetAttrNotNull(DATABASEOID, tp, Anum_pg_database_datlocale); + locale = TextDatumGetCString(datum); + } + else if (datlocprovider == COLLPROVIDER_LIBC) + { + datum = SysCacheGetAttrNotNull(DATABASEOID, tp, Anum_pg_database_datcollate); + locale = TextDatumGetCString(datum); + } + else + locale = NULL; /* COLLPROVIDER_BUILTIN */ + + version = get_collation_actual_version(datlocprovider, locale); ReleaseSysCache(tp); diff --git a/src/backend/regex/regc_pg_locale.c b/src/backend/regex/regc_pg_locale.c index 42d15b6303..0278f45adc 100644 --- a/src/backend/regex/regc_pg_locale.c +++ b/src/backend/regex/regc_pg_locale.c @@ -16,6 +16,8 @@ */ #include "catalog/pg_collation.h" +#include "common/unicode_case.h" +#include "common/unicode_category.h" #include "utils/pg_locale.h" /* @@ -64,6 +66,7 @@ typedef enum { PG_REGEX_LOCALE_C, /* C locale (encoding independent) */ + PG_REGEX_BUILTIN, /* built-in Unicode semantics */ PG_REGEX_LOCALE_WIDE, /* Use functions */ PG_REGEX_LOCALE_1BYTE, /* Use functions */ PG_REGEX_LOCALE_WIDE_L, /* Use locale_t functions */ @@ -75,6 +78,8 @@ static PG_Locale_Strategy pg_regex_strategy; static pg_locale_t pg_regex_locale; static Oid pg_regex_collation; +static bool regex_builtin_cclass_posix = false; + /* * Hard-wired character properties for C locale */ @@ -266,7 +271,15 @@ pg_set_regex_collation(Oid collation) if (GetDatabaseEncoding() == PG_UTF8) { if (pg_regex_locale) - pg_regex_strategy = PG_REGEX_LOCALE_WIDE_L; + { + if (pg_regex_locale->provider == COLLPROVIDER_BUILTIN) + { + pg_regex_strategy = PG_REGEX_BUILTIN; + regex_builtin_cclass_posix = pg_regex_locale->info.builtin.cclass_posix; + } + else + pg_regex_strategy = PG_REGEX_LOCALE_WIDE_L; + } else pg_regex_strategy = PG_REGEX_LOCALE_WIDE; } @@ -290,6 +303,8 @@ 
pg_wc_isdigit(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISDIGIT)); + case PG_REGEX_BUILTIN: + return pg_u_isdigit(c, regex_builtin_cclass_posix); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswdigit((wint_t) c); @@ -322,6 +337,8 @@ pg_wc_isalpha(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISALPHA)); + case PG_REGEX_BUILTIN: + return pg_u_isalpha(c); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswalpha((wint_t) c); @@ -354,6 +371,8 @@ pg_wc_isalnum(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISALNUM)); + case PG_REGEX_BUILTIN: + return pg_u_isalnum(c, regex_builtin_cclass_posix); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswalnum((wint_t) c); @@ -395,6 +414,8 @@ pg_wc_isupper(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISUPPER)); + case PG_REGEX_BUILTIN: + return pg_u_isupper(c); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswupper((wint_t) c); @@ -427,6 +448,8 @@ pg_wc_islower(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISLOWER)); + case PG_REGEX_BUILTIN: + return pg_u_islower(c); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswlower((wint_t) c); @@ -459,6 +482,8 @@ pg_wc_isgraph(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISGRAPH)); + case PG_REGEX_BUILTIN: + return pg_u_isgraph(c); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswgraph((wint_t) c); @@ -491,6 +516,8 @@ pg_wc_isprint(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISPRINT)); + case PG_REGEX_BUILTIN: + 
return pg_u_isprint(c); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswprint((wint_t) c); @@ -523,6 +550,8 @@ pg_wc_ispunct(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISPUNCT)); + case PG_REGEX_BUILTIN: + return pg_u_ispunct(c, regex_builtin_cclass_posix); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswpunct((wint_t) c); @@ -555,6 +584,8 @@ pg_wc_isspace(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISSPACE)); + case PG_REGEX_BUILTIN: + return pg_u_isspace(c); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswspace((wint_t) c); @@ -588,6 +619,8 @@ pg_wc_toupper(pg_wchar c) if (c <= (pg_wchar) 127) return pg_ascii_toupper((unsigned char) c); return c; + case PG_REGEX_BUILTIN: + return unicode_uppercase_simple(c); case PG_REGEX_LOCALE_WIDE: /* force C behavior for ASCII characters, per comments above */ if (c <= (pg_wchar) 127) @@ -628,6 +661,8 @@ pg_wc_tolower(pg_wchar c) if (c <= (pg_wchar) 127) return pg_ascii_tolower((unsigned char) c); return c; + case PG_REGEX_BUILTIN: + return unicode_lowercase_simple(c); case PG_REGEX_LOCALE_WIDE: /* force C behavior for ASCII characters, per comments above */ if (c <= (pg_wchar) 127) @@ -792,6 +827,9 @@ pg_ctype_get_cache(pg_wc_probefunc probefunc, int cclasscode) max_chr = (pg_wchar) MAX_SIMPLE_CHR; #endif break; + case PG_REGEX_BUILTIN: + max_chr = (pg_wchar) MAX_SIMPLE_CHR; + break; case PG_REGEX_LOCALE_WIDE: case PG_REGEX_LOCALE_WIDE_L: max_chr = (pg_wchar) MAX_SIMPLE_CHR; @@ -809,6 +847,7 @@ pg_ctype_get_cache(pg_wc_probefunc probefunc, int cclasscode) max_chr = (pg_wchar) MAX_SIMPLE_CHR; break; default: + Assert(false); max_chr = 0; /* can't get here, but keep compiler quiet */ break; } diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c index d176723d95..cf50b4d951 
100644 --- a/src/backend/utils/adt/formatting.c +++ b/src/backend/utils/adt/formatting.c @@ -77,6 +77,8 @@ #include "catalog/pg_collation.h" #include "catalog/pg_type.h" +#include "common/unicode_case.h" +#include "common/unicode_category.h" #include "mb/pg_wchar.h" #include "nodes/miscnodes.h" #include "parser/scansup.h" @@ -1670,6 +1672,43 @@ str_tolower(const char *buff, size_t nbytes, Oid collid) } else #endif + if (mylocale && mylocale->provider == COLLPROVIDER_BUILTIN) + { + const unsigned char *orig = (unsigned char *) buff; + unsigned char *workspace; + const unsigned char *sp; + unsigned char *rp; + + Assert(GetDatabaseEncoding() == PG_UTF8); + + /* Overflow paranoia */ + if ((nbytes + 1) > (INT_MAX / sizeof(pg_wchar))) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + + /* Output workspace cannot have more codes than input bytes */ + workspace = (unsigned char *)palloc((nbytes + 1) * sizeof(pg_wchar)); + + sp = orig; + rp = workspace; + while (sp - orig < nbytes) + { + pg_wchar u = utf8_to_unicode(sp); + unicode_to_utf8(unicode_lowercase_simple(u), rp); + sp += pg_utf_mblen(sp); + rp += pg_utf_mblen(rp); + } + + *rp = '\0'; + rp++; + + /* shrink buffer and store result */ + result = palloc(rp - workspace); + memcpy(result, workspace, rp - workspace); + pfree(workspace); + } + else { if (pg_database_encoding_max_length() > 1) { @@ -1788,6 +1827,43 @@ str_toupper(const char *buff, size_t nbytes, Oid collid) } else #endif + if (mylocale && mylocale->provider == COLLPROVIDER_BUILTIN) + { + const unsigned char *orig = (unsigned char *) buff; + unsigned char *workspace; + const unsigned char *sp; + unsigned char *rp; + + Assert(GetDatabaseEncoding() == PG_UTF8); + + /* Overflow paranoia */ + if ((nbytes + 1) > (INT_MAX / sizeof(pg_wchar))) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + + /* Output workspace cannot have more codes than input bytes */ + workspace = (unsigned char 
*)palloc((nbytes + 1) * sizeof(pg_wchar)); + + sp = orig; + rp = workspace; + while (sp - orig < nbytes) + { + pg_wchar u = utf8_to_unicode(sp); + unicode_to_utf8(unicode_uppercase_simple(u), rp); + sp += pg_utf_mblen(sp); + rp += pg_utf_mblen(rp); + } + + *rp = '\0'; + rp++; + + /* shrink buffer and store result */ + result = palloc(rp - workspace); + memcpy(result, workspace, rp - workspace); + pfree(workspace); + } + else { if (pg_database_encoding_max_length() > 1) { @@ -1907,6 +1983,52 @@ str_initcap(const char *buff, size_t nbytes, Oid collid) } else #endif + if (mylocale && mylocale->provider == COLLPROVIDER_BUILTIN) + { + const unsigned char *orig = (unsigned char *) buff; + unsigned char *workspace; + const unsigned char *sp; + unsigned char *rp; + + Assert(GetDatabaseEncoding() == PG_UTF8); + + /* Overflow paranoia */ + if ((nbytes + 1) > (INT_MAX / sizeof(pg_wchar))) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + + /* Output workspace cannot have more codes than input bytes */ + workspace = (unsigned char *)palloc((nbytes + 1) * sizeof(pg_wchar)); + + sp = orig; + rp = workspace; + while (sp - orig < nbytes) + { + pg_wchar u1 = utf8_to_unicode(sp); + pg_wchar u2; + + if (wasalnum) + u2 = unicode_lowercase_simple(u1); + else + u2 = unicode_titlecase_simple(u1); + + unicode_to_utf8(u2, rp); + wasalnum = pg_u_isalnum(u2, mylocale->info.builtin.cclass_posix); + + sp += pg_utf_mblen(sp); + rp += pg_utf_mblen(rp); + } + + *rp = '\0'; + rp++; + + /* shrink buffer and store result */ + result = palloc(rp - workspace); + memcpy(result, workspace, rp - workspace); + pfree(workspace); + } + else { if (pg_database_encoding_max_length() > 1) { diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c index 9d8634ec25..4ae0b8eb67 100644 --- a/src/backend/utils/adt/pg_locale.c +++ b/src/backend/utils/adt/pg_locale.c @@ -1269,7 +1269,19 @@ lookup_collation_cache(Oid collation, bool set_flags) elog(ERROR, 
"cache lookup failed for collation %u", collation); collform = (Form_pg_collation) GETSTRUCT(tp); - if (collform->collprovider == COLLPROVIDER_LIBC) + if (collform->collprovider == COLLPROVIDER_BUILTIN) + { + Datum datum; + const char *colllocale; + + datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_colllocale); + colllocale = TextDatumGetCString(datum); + + cache_entry->collate_is_c = true; + cache_entry->ctype_is_c = ((strcmp(colllocale, "C") == 0) || + (strcmp(colllocale, "POSIX") == 0)); + } + else if (collform->collprovider == COLLPROVIDER_LIBC) { Datum datum; const char *collcollate; @@ -1320,16 +1332,29 @@ lc_collate_is_c(Oid collation) if (collation == DEFAULT_COLLATION_OID) { static int result = -1; - char *localeptr; - - if (default_locale.provider == COLLPROVIDER_ICU) - return false; + const char *localeptr; if (result >= 0) return (bool) result; - localeptr = setlocale(LC_COLLATE, NULL); - if (!localeptr) - elog(ERROR, "invalid LC_COLLATE setting"); + + if (default_locale.provider == COLLPROVIDER_BUILTIN) + { + localeptr = default_locale.info.builtin.locale; + } + else if (default_locale.provider == COLLPROVIDER_ICU) + { + result = false; + return (bool) result; + } + else if (default_locale.provider == COLLPROVIDER_LIBC) + { + localeptr = setlocale(LC_CTYPE, NULL); + if (!localeptr) + elog(ERROR, "invalid LC_CTYPE setting"); + } + else + elog(ERROR, "unexpected collation provider '%c'", + default_locale.provider); if (strcmp(localeptr, "C") == 0) result = true; @@ -1373,16 +1398,29 @@ lc_ctype_is_c(Oid collation) if (collation == DEFAULT_COLLATION_OID) { static int result = -1; - char *localeptr; - - if (default_locale.provider == COLLPROVIDER_ICU) - return false; + const char *localeptr; if (result >= 0) return (bool) result; - localeptr = setlocale(LC_CTYPE, NULL); - if (!localeptr) - elog(ERROR, "invalid LC_CTYPE setting"); + + if (default_locale.provider == COLLPROVIDER_BUILTIN) + { + localeptr = default_locale.info.builtin.locale; + } 
+ else if (default_locale.provider == COLLPROVIDER_ICU) + { + result = false; + return (bool) result; + } + else if (default_locale.provider == COLLPROVIDER_LIBC) + { + localeptr = setlocale(LC_CTYPE, NULL); + if (!localeptr) + elog(ERROR, "invalid LC_CTYPE setting"); + } + else + elog(ERROR, "unexpected collation provider '%c'", + default_locale.provider); if (strcmp(localeptr, "C") == 0) result = true; @@ -1390,6 +1428,7 @@ lc_ctype_is_c(Oid collation) result = true; else result = false; + return (bool) result; } @@ -1520,10 +1559,10 @@ pg_newlocale_from_collation(Oid collid) if (collid == DEFAULT_COLLATION_OID) { - if (default_locale.provider == COLLPROVIDER_ICU) - return &default_locale; - else + if (default_locale.provider == COLLPROVIDER_LIBC) return (pg_locale_t) 0; + else + return &default_locale; } cache_entry = lookup_collation_cache(collid, false); @@ -1548,7 +1587,18 @@ pg_newlocale_from_collation(Oid collid) result.provider = collform->collprovider; result.deterministic = collform->collisdeterministic; - if (collform->collprovider == COLLPROVIDER_LIBC) + if (collform->collprovider == COLLPROVIDER_BUILTIN) + { + const char *locstr; + + datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_colllocale); + locstr = TextDatumGetCString(datum); + + result.info.builtin.locale = MemoryContextStrdup(TopMemoryContext, + locstr); + result.info.builtin.cclass_posix = true; + } + else if (collform->collprovider == COLLPROVIDER_LIBC) { const char *collcollate; const char *collctype pg_attribute_unused(); @@ -1627,6 +1677,7 @@ pg_newlocale_from_collation(Oid collid) collversionstr = TextDatumGetCString(datum); + Assert(collform->collprovider != COLLPROVIDER_BUILTIN); datum = SysCacheGetAttrNotNull(COLLOID, tp, collform->collprovider == COLLPROVIDER_ICU ? 
Anum_pg_collation_colllocale : Anum_pg_collation_collcollate); actual_versionstr = get_collation_actual_version(collform->collprovider, @@ -1678,6 +1729,9 @@ get_collation_actual_version(char collprovider, const char *collcollate) { char *collversion = NULL; + if (collprovider == COLLPROVIDER_BUILTIN) + return NULL; /* TODO */ + #ifdef USE_ICU if (collprovider == COLLPROVIDER_ICU) { @@ -2444,6 +2498,21 @@ pg_strnxfrm_prefix(char *dest, size_t destsize, const char *src, return result; } +char * +builtin_validate_locale(int encoding, const char *locale) +{ + if (strcmp(locale, "C") == 0 || strcmp(locale, "POSIX") == 0) + return "C"; + if (encoding == PG_UTF8 && + (strcmp(locale, "C.UTF-8") == 0 || strcmp(locale, "C.UTF8") == 0)) + return "C.UTF-8"; + + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("invalid locale name \"%s\" for builtin provider", locale))); +} + + #ifdef USE_ICU /* diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index 702bad6d8a..f6dc3afbfe 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -318,7 +318,7 @@ CheckMyDatabase(const char *name, bool am_superuser, bool override_allow_connect bool isnull; char *collate; char *ctype; - char *iculocale; + char *datlocale; /* Fetch our pg_database row normally, via syscache */ tup = SearchSysCache1(DATABASEOID, ObjectIdGetDatum(MyDatabaseId)); @@ -423,12 +423,21 @@ CheckMyDatabase(const char *name, bool am_superuser, bool override_allow_connect strcmp(ctype, "POSIX") == 0) database_ctype_is_c = true; - if (dbform->datlocprovider == COLLPROVIDER_ICU) + if (dbform->datlocprovider == COLLPROVIDER_BUILTIN) + { + datum = SysCacheGetAttrNotNull(DATABASEOID, tup, Anum_pg_database_datlocale); + datlocale = TextDatumGetCString(datum); + + default_locale.info.builtin.locale = MemoryContextStrdup( + TopMemoryContext, datlocale); + default_locale.info.builtin.cclass_posix = true; + } + else if (dbform->datlocprovider == 
COLLPROVIDER_ICU) { char *icurules; datum = SysCacheGetAttrNotNull(DATABASEOID, tup, Anum_pg_database_datlocale); - iculocale = TextDatumGetCString(datum); + datlocale = TextDatumGetCString(datum); datum = SysCacheGetAttr(DATABASEOID, tup, Anum_pg_database_daticurules, &isnull); if (!isnull) @@ -436,10 +445,10 @@ CheckMyDatabase(const char *name, bool am_superuser, bool override_allow_connect else icurules = NULL; - make_icu_collator(iculocale, icurules, &default_locale); + make_icu_collator(datlocale, icurules, &default_locale); } else - iculocale = NULL; + datlocale = NULL; default_locale.provider = dbform->datlocprovider; @@ -461,10 +470,16 @@ CheckMyDatabase(const char *name, bool am_superuser, bool override_allow_connect { char *actual_versionstr; char *collversionstr; + char *locale; collversionstr = TextDatumGetCString(datum); - actual_versionstr = get_collation_actual_version(dbform->datlocprovider, dbform->datlocprovider == COLLPROVIDER_ICU ? iculocale : collate); + if (dbform->datlocprovider == COLLPROVIDER_LIBC) + locale = collate; + else + locale = datlocale; + + actual_versionstr = get_collation_actual_version(dbform->datlocprovider, locale); if (!actual_versionstr) /* should not happen */ elog(WARNING, diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index aaf55ce0cd..b075d0e2eb 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -2450,7 +2450,7 @@ usage(const char *progname) " set default locale in the respective category for\n" " new databases (default taken from environment)\n")); printf(_(" --no-locale equivalent to --locale=C\n")); - printf(_(" --locale-provider={libc|icu}\n" + printf(_(" --locale-provider={builtin|libc|icu}\n" " set default locale provider for new databases\n")); printf(_(" --pwfile=FILE read password for the new superuser from file\n")); printf(_(" -T, --text-search-config=CFG\n" @@ -2601,7 +2601,15 @@ setup_locale_encoding(void) { setlocales(); - if (locale_provider == COLLPROVIDER_LIBC && + if 
(locale_provider == COLLPROVIDER_BUILTIN && + strcmp(lc_ctype, "C") == 0 && + strcmp(lc_collate, "C") == 0 && + strcmp(lc_time, "C") == 0 && + strcmp(lc_numeric, "C") == 0 && + strcmp(lc_monetary, "C") == 0 && + strcmp(lc_messages, "C") == 0) + printf(_("The database cluster will be initialized with no locale.\n")); + else if (locale_provider == COLLPROVIDER_LIBC && strcmp(lc_ctype, lc_collate) == 0 && strcmp(lc_ctype, lc_time) == 0 && strcmp(lc_ctype, lc_numeric) == 0 && @@ -3277,7 +3285,9 @@ main(int argc, char *argv[]) "-c debug_discard_caches=1"); break; case 15: - if (strcmp(optarg, "icu") == 0) + if (strcmp(optarg, "builtin") == 0) + locale_provider = COLLPROVIDER_BUILTIN; + else if (strcmp(optarg, "icu") == 0) locale_provider = COLLPROVIDER_ICU; else if (strcmp(optarg, "libc") == 0) locale_provider = COLLPROVIDER_LIBC; diff --git a/src/bin/initdb/t/001_initdb.pl b/src/bin/initdb/t/001_initdb.pl index 7606db1987..5b8a253d02 100644 --- a/src/bin/initdb/t/001_initdb.pl +++ b/src/bin/initdb/t/001_initdb.pl @@ -184,6 +184,53 @@ else 'locale provider ICU fails since no ICU support'); } +command_fails( + [ + 'initdb', '--no-sync', '--locale-provider=builtin', "$tempdir/data6" + ], + 'locale provider builtin fails without --locale' +); + +command_ok( + [ + 'initdb', '--no-sync', '--locale-provider=builtin', '--locale=C', + "$tempdir/data7" + ], + 'locale provider builtin with --locale' +); + +command_ok( + [ + 'initdb', '--no-sync', '--locale-provider=builtin', '--lc-collate=C', + '--locale=C.UTF-8', "$tempdir/data8" + ], + 'locale provider builtin with --lc-collate and --locale=C.UTF-8' +); + +command_ok( + [ + 'initdb', '--no-sync', '--locale-provider=builtin', '--lc-ctype=C', + '--locale=C', "$tempdir/data9" + ], + 'locale provider builtin with --lc-ctype' +); + +command_fails( + [ + 'initdb', '--no-sync', '--locale-provider=builtin', '--icu-locale=en', + "$tempdir/dataX" + ], + 'fails for locale provider builtin with ICU locale' +); + +command_fails( + [ + 
'initdb', '--no-sync', '--locale-provider=builtin', '--icu-rules=""', + "$tempdir/dataX" + ], + 'fails for locale provider builtin with ICU rules' +); + command_fails( [ 'initdb', '--no-sync', '--locale-provider=xyz', "$tempdir/dataX" ], 'fails for invalid locale provider'); diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c index 3d829c99d7..b953db8bea 100644 --- a/src/bin/pg_dump/pg_dump.c +++ b/src/bin/pg_dump/pg_dump.c @@ -3091,7 +3091,9 @@ dumpDatabase(Archive *fout) } appendPQExpBufferStr(creaQry, " LOCALE_PROVIDER = "); - if (datlocprovider[0] == 'c') + if (datlocprovider[0] == 'b') + appendPQExpBufferStr(creaQry, "builtin"); + else if (datlocprovider[0] == 'c') appendPQExpBufferStr(creaQry, "libc"); else if (datlocprovider[0] == 'i') appendPQExpBufferStr(creaQry, "icu"); @@ -13672,7 +13674,9 @@ dumpCollation(Archive *fout, const CollInfo *collinfo) fmtQualifiedDumpable(collinfo)); appendPQExpBufferStr(q, "provider = "); - if (collprovider[0] == 'c') + if (collprovider[0] == 'b') + appendPQExpBufferStr(q, "builtin"); + else if (collprovider[0] == 'c') appendPQExpBufferStr(q, "libc"); else if (collprovider[0] == 'i') appendPQExpBufferStr(q, "icu"); @@ -13693,6 +13697,15 @@ dumpCollation(Archive *fout, const CollInfo *collinfo) /* no locale -- the default collation cannot be reloaded anyway */ } + else if (collprovider[0] == 'b') + { + /* builtin collations keep their name ("C" or "C.UTF-8") in colllocale only */ + if (collcollate || collctype || !colllocale || collicurules) + pg_log_warning("invalid collation \"%s\"", qcollname); + + appendPQExpBufferStr(q, ", locale = "); + appendStringLiteralAH(q, colllocale ? colllocale : "", fout); + } else if (collprovider[0] == 'i') { if (fout->remoteVersion >= 150000) diff --git a/src/bin/pg_upgrade/t/002_pg_upgrade.pl b/src/bin/pg_upgrade/t/002_pg_upgrade.pl index 6f359d72ce..c620a4929f 100644 --- a/src/bin/pg_upgrade/t/002_pg_upgrade.pl +++ b/src/bin/pg_upgrade/t/002_pg_upgrade.pl @@ -129,14 +129,33 @@ if (int($oldnode->pg_version) >= 15 && $ENV{with_icu} eq 'yes') $original_iculocale = "fr-CA"; } +# use builtin provider instead of libc, if
supported +if ($oldnode->pg_version >= 17 && $ENV{with_icu} ne 'yes') +{ + $original_provider = "b"; +} + my @initdb_params = @custom_opts; push @initdb_params, ('--encoding', 'UTF-8'); push @initdb_params, ('--locale', $original_locale); -if ($original_provider eq "i") + +# add --locale-provider, if supported +if ($oldnode->pg_version >= 15) { - push @initdb_params, ('--locale-provider', 'icu'); - push @initdb_params, ('--icu-locale', 'fr-CA'); + if ($original_provider eq "b") + { + push @initdb_params, ('--locale-provider', 'builtin'); + } + elsif ($original_provider eq "i") + { + push @initdb_params, ('--locale-provider', 'icu'); + push @initdb_params, ('--icu-locale', 'fr-CA'); + } + elsif ($original_provider eq "c") + { + push @initdb_params, ('--locale-provider', 'libc'); + } } $node_params{extra} = \@initdb_params; diff --git a/src/bin/psql/describe.c b/src/bin/psql/describe.c index 1122843715..3b327d159a 100644 --- a/src/bin/psql/describe.c +++ b/src/bin/psql/describe.c @@ -926,7 +926,7 @@ listAllDbs(const char *pattern, bool verbose) gettext_noop("Encoding")); if (pset.sversion >= 150000) appendPQExpBuffer(&buf, - " CASE d.datlocprovider WHEN 'c' THEN 'libc' WHEN 'i' THEN 'icu' END AS \"%s\",\n", + " CASE d.datlocprovider WHEN 'b' THEN 'builtin' WHEN 'c' THEN 'libc' WHEN 'i' THEN 'icu' END AS \"%s\",\n", gettext_noop("Locale Provider")); else appendPQExpBuffer(&buf, @@ -4966,7 +4966,7 @@ listCollations(const char *pattern, bool verbose, bool showSystem) if (pset.sversion >= 100000) appendPQExpBuffer(&buf, - " CASE c.collprovider WHEN 'd' THEN 'default' WHEN 'c' THEN 'libc' WHEN 'i' THEN 'icu' END AS \"%s\",\n", + " CASE c.collprovider WHEN 'd' THEN 'default' WHEN 'b' THEN 'builtin' WHEN 'c' THEN 'libc' WHEN 'i' THEN 'icu' END AS \"%s\",\n", gettext_noop("Provider")); else appendPQExpBuffer(&buf, diff --git a/src/bin/scripts/createdb.c b/src/bin/scripts/createdb.c index 9ca86a3e53..8f8995964c 100644 --- a/src/bin/scripts/createdb.c +++
b/src/bin/scripts/createdb.c @@ -296,7 +296,7 @@ help(const char *progname) printf(_(" --lc-ctype=LOCALE LC_CTYPE setting for the database\n")); printf(_(" --icu-locale=LOCALE ICU locale setting for the database\n")); printf(_(" --icu-rules=RULES ICU rules setting for the database\n")); - printf(_(" --locale-provider={libc|icu}\n" + printf(_(" --locale-provider={builtin|libc|icu}\n" " locale provider for the database's default collation\n")); printf(_(" -O, --owner=OWNER database user to own the new database\n")); printf(_(" -S, --strategy=STRATEGY database creation strategy wal_log or file_copy\n")); diff --git a/src/bin/scripts/t/020_createdb.pl b/src/bin/scripts/t/020_createdb.pl index 40291924e5..447cb7d2a3 100644 --- a/src/bin/scripts/t/020_createdb.pl +++ b/src/bin/scripts/t/020_createdb.pl @@ -105,6 +105,62 @@ else 'create database with ICU fails since no ICU support'); } +$node->command_fails( + [ + 'createdb', '-T', 'template0', '--locale-provider=builtin', + 'tbuiltin1' + ], + 'create database with provider "builtin" fails without --locale' +); + +$node->command_ok( + [ + 'createdb', '-T', 'template0', '--locale-provider=builtin', + '--locale=C', 'tbuiltin2' + ], + 'create database with provider "builtin" and locale "C"' +); + +$node->command_ok( + [ + 'createdb', '-T', 'template0', '--locale-provider=builtin', + '--locale=C', '--lc-collate=C', 'tbuiltin3' + ], + 'create database with provider "builtin" and LC_COLLATE=C' +); + +$node->command_ok( + [ + 'createdb', '-T', 'template0', '--locale-provider=builtin', + '--locale=C', '--lc-ctype=C', 'tbuiltin4' + ], + 'create database with provider "builtin" and LC_CTYPE=C' +); + +$node->command_fails( + [ + 'createdb', '-T', 'template0', '--locale-provider=builtin', + '--locale=C', '--icu-locale=en', 'tbuiltin5' + ], + 'create database with provider "builtin" and ICU_LOCALE="en"' +); + +$node->command_fails( + [ + 'createdb', '-T', 'template0', '--locale-provider=builtin', + '--locale=C', '--icu-rules=""', 
'tbuiltin6' + ], + 'create database with provider "builtin" and ICU_RULES=""' +); + +$node->command_fails( + [ + 'createdb', '-T', 'template1', '--locale-provider=builtin', + '--locale=C', 'tbuiltin7' + ], + 'create database with provider "builtin" not matching template' +); + $node->command_fails([ 'createdb', 'foobar1' ], 'fails if database already exists'); diff --git a/src/include/catalog/pg_collation.dat b/src/include/catalog/pg_collation.dat index 2c112cd6bc..977f90518e 100644 --- a/src/include/catalog/pg_collation.dat +++ b/src/include/catalog/pg_collation.dat @@ -24,8 +24,8 @@ collname => 'POSIX', collprovider => 'c', collencoding => '-1', collcollate => 'POSIX', collctype => 'POSIX' }, { oid => '962', descr => 'sorts by Unicode code point', - collname => 'ucs_basic', collprovider => 'c', collencoding => '6', - collcollate => 'C', collctype => 'C' }, + collname => 'ucs_basic', collprovider => 'b', collencoding => '6', + colllocale => 'C' }, { oid => '963', descr => 'sorts using the Unicode Collation Algorithm with default settings', collname => 'unicode', collprovider => 'i', collencoding => '-1', diff --git a/src/include/catalog/pg_collation.h b/src/include/catalog/pg_collation.h index d357c89ae6..09fc991038 100644 --- a/src/include/catalog/pg_collation.h +++ b/src/include/catalog/pg_collation.h @@ -65,6 +65,7 @@ DECLARE_UNIQUE_INDEX_PKEY(pg_collation_oid_index, 3085, CollationOidIndexId, pg_ #ifdef EXPOSE_TO_CLIENT_CODE #define COLLPROVIDER_DEFAULT 'd' +#define COLLPROVIDER_BUILTIN 'b' #define COLLPROVIDER_ICU 'i' #define COLLPROVIDER_LIBC 'c' @@ -73,6 +74,8 @@ collprovider_name(char c) { switch (c) { + case COLLPROVIDER_BUILTIN: + return "builtin"; case COLLPROVIDER_ICU: return "icu"; case COLLPROVIDER_LIBC: diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h index 6447bea8e0..688157b9ac 100644 --- a/src/include/utils/pg_locale.h +++ b/src/include/utils/pg_locale.h @@ -76,6 +76,11 @@ struct pg_locale_struct bool deterministic; 
union { + struct + { + const char *locale; + bool cclass_posix; + } builtin; locale_t lt; #ifdef USE_ICU struct @@ -112,7 +117,7 @@ extern size_t pg_strxfrm_prefix(char *dest, const char *src, size_t destsize, pg_locale_t locale); extern size_t pg_strnxfrm_prefix(char *dest, size_t destsize, const char *src, size_t srclen, pg_locale_t locale); - +extern char *builtin_validate_locale(int encoding, const char *loc_str); extern void icu_validate_locale(const char *loc_str); extern char *icu_language_tag(const char *loc_str, int elevel); diff --git a/src/test/icu/t/010_database.pl b/src/test/icu/t/010_database.pl index 67fc3bbf19..af34cfc1dd 100644 --- a/src/test/icu/t/010_database.pl +++ b/src/test/icu/t/010_database.pl @@ -63,14 +63,14 @@ is( $node1->psql( 0, "C locale works for ICU"); -# Test that LOCALE works for ICU locales if LC_COLLATE and LC_CTYPE -# are specified -is( $node1->psql( - 'postgres', - q{CREATE DATABASE dbicu2 LOCALE_PROVIDER icu LOCALE '@colStrength=primary' - LC_COLLATE='C' LC_CTYPE='C' TEMPLATE template0 ENCODING UTF8} - ), - 0, - "LOCALE works for ICU locales if LC_COLLATE and LC_CTYPE are specified"); +my ($ret, $stdout, $stderr) = $node1->psql('postgres', + q{CREATE DATABASE dbicu LOCALE_PROVIDER builtin LOCALE 'C' TEMPLATE dbicu} +); +isnt($ret, 0, + "locale provider must match template: exit code not 0"); +like( + $stderr, + qr/ERROR: new locale provider \(builtin\) does not match locale provider of the template database \(icu\)/, + "locale provider must match template: error message"); done_testing(); diff --git a/src/test/regress/expected/collate.out b/src/test/regress/expected/collate.out index 0649564485..ece4a8e99d 100644 --- a/src/test/regress/expected/collate.out +++ b/src/test/regress/expected/collate.out @@ -650,6 +650,26 @@ EXPLAIN (COSTS OFF) (3 rows) -- CREATE/DROP COLLATION +CREATE COLLATION builtin_c ( PROVIDER = builtin, LOCALE = "C" ); +CREATE COLLATION builtin_posix ( PROVIDER = builtin, LOCALE = "POSIX" ); +SELECT b FROM 
collate_test1 ORDER BY b COLLATE builtin_c; + b +----- + ABD + Abc + abc + bbc +(4 rows) + +CREATE COLLATION builtin2 ( PROVIDER = builtin ); -- fails +ERROR: parameter "locale" must be specified +CREATE COLLATION builtin2 ( PROVIDER = builtin, LOCALE = "en_US" ); -- fails +ERROR: invalid locale name "en_US" for builtin provider +CREATE COLLATION builtin2 ( PROVIDER = builtin, LC_CTYPE = "C", LC_COLLATE = "C" ); -- fails +ERROR: parameter "locale" must be specified +CREATE COLLATION builtin2 ( PROVIDER = builtin, LOCALE = "POSIX", LC_CTYPE = "POSIX" ); -- fails +ERROR: conflicting or redundant options +DETAIL: LOCALE cannot be specified together with LC_COLLATE or LC_CTYPE. CREATE COLLATION mycoll1 FROM "C"; CREATE COLLATION mycoll2 ( LC_COLLATE = "POSIX", LC_CTYPE = "POSIX" ); CREATE COLLATION mycoll3 FROM "default"; -- intentionally unsupported @@ -754,7 +774,7 @@ DETAIL: FROM cannot be specified together with any other options. -- must get rid of them. -- DROP SCHEMA collate_tests CASCADE; -NOTICE: drop cascades to 19 other objects +NOTICE: drop cascades to 21 other objects DETAIL: drop cascades to table collate_test1 drop cascades to table collate_test_like drop cascades to table collate_test2 @@ -771,6 +791,8 @@ drop cascades to function dup(anyelement) drop cascades to table collate_test20 drop cascades to table collate_test21 drop cascades to table collate_test22 +drop cascades to collation builtin_c +drop cascades to collation builtin_posix drop cascades to collation mycoll2 drop cascades to table collate_test23 drop cascades to view collate_on_int diff --git a/src/test/regress/expected/collate.utf8.out b/src/test/regress/expected/collate.utf8.out new file mode 100644 index 0000000000..0c332f7896 --- /dev/null +++ b/src/test/regress/expected/collate.utf8.out @@ -0,0 +1,93 @@ +/* + * This test is for collations and character operations when using the + * builtin provider with the C.UTF-8 locale. 
+ */ +/* skip test if not UTF8 server encoding */ +SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset +\if :skip_test +\quit +\endif +SET client_encoding TO UTF8; +-- +-- Test builtin provider with C.UTF-8 locale. +-- +CREATE COLLATION builtin_c_utf8 (PROVIDER = builtin, LOCALE = 'C.UTF-8'); +CREATE TABLE builtin_test ( + t TEXT COLLATE builtin_c_utf8 +); +INSERT INTO builtin_test VALUES + ('abc DEF'), + ('DŽxxx Džxxx džxxx'), + ('xxxDŽ xxxDž xxxdž'); +SELECT t, lower(t), initcap(t), upper(t) FROM builtin_test; + t | lower | initcap | upper +----------------+----------------+----------------+---------------- + abc DEF | abc def | Abc Def | ABC DEF + DŽxxx Džxxx džxxx | džxxx džxxx džxxx | Džxxx Džxxx Džxxx | DŽXXX DŽXXX DŽXXX + xxxDŽ xxxDž xxxdž | xxxdž xxxdž xxxdž | Xxxdž Xxxdž Xxxdž | XXXDŽ XXXDŽ XXXDŽ +(3 rows) + +DROP TABLE builtin_test; +-- character classes +select 'xyz' ~ '[[:alnum:]]' collate builtin_c_utf8; + ?column? +---------- + t +(1 row) + +select 'xyz' !~ '[[:upper:]]' collate builtin_c_utf8; + ?column? +---------- + t +(1 row) + +select '@' !~ '[[:alnum:]]' collate builtin_c_utf8; + ?column? +---------- + t +(1 row) + +select '@' ~ '[[:punct:]]' collate builtin_c_utf8; -- symbols are punctuation in posix + ?column? +---------- + t +(1 row) + +select 'a8a' ~ '[[:digit:]]' collate builtin_c_utf8; + ?column? +---------- + t +(1 row) + +select '൧' !~ '\d' collate builtin_c_utf8; -- only 0-9 considered digits in posix + ?column? +---------- + t +(1 row) + +-- case mapping +select 'xYz' ~* 'XyZ' collate builtin_c_utf8; + ?column? +---------- + t +(1 row) + +select 'xAb' ~* '[W-Y]' collate builtin_c_utf8; + ?column? +---------- + t +(1 row) + +select 'xAb' !~* '[c-d]' collate builtin_c_utf8; + ?column? +---------- + t +(1 row) + +select 'Δ' ~* '[α-λ]' collate builtin_c_utf8; + ?column? 
+---------- + t +(1 row) + +DROP COLLATION builtin_c_utf8; diff --git a/src/test/regress/expected/collate.utf8_1.out b/src/test/regress/expected/collate.utf8_1.out new file mode 100644 index 0000000000..e73fdf50c3 --- /dev/null +++ b/src/test/regress/expected/collate.utf8_1.out @@ -0,0 +1,8 @@ +/* + * This test is for collations and character operations when using the + * builtin provider with the C.UTF-8 locale. + */ +/* skip test if not UTF8 server encoding */ +SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset +\if :skip_test +\quit diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index f0987ff537..292bc54932 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -78,9 +78,9 @@ test: brin_bloom brin_multi # psql depends on create_am # amutils depends on geometry, create_index_spgist, hash_index, brin # ---------- -test: create_table_like alter_generic alter_operator misc async dbsize merge misc_functions sysviews tsrf tid tidscan tidrangescan collate.icu.utf8 incremental_sort create_role +test: create_table_like alter_generic alter_operator misc async dbsize merge misc_functions sysviews tsrf tid tidscan tidrangescan collate.utf8 collate.icu.utf8 incremental_sort create_role -# collate.*.utf8 tests cannot be run in parallel with each other +# collate.linux.utf8 and collate.icu.utf8 tests cannot be run in parallel with each other test: rules psql psql_crosstab amutils stats_ext collate.linux.utf8 collate.windows.win1252 # ---------- diff --git a/src/test/regress/sql/collate.sql b/src/test/regress/sql/collate.sql index c3d40fc195..01d5c69fe4 100644 --- a/src/test/regress/sql/collate.sql +++ b/src/test/regress/sql/collate.sql @@ -244,6 +244,16 @@ EXPLAIN (COSTS OFF) -- CREATE/DROP COLLATION +CREATE COLLATION builtin_c ( PROVIDER = builtin, LOCALE = "C" ); +CREATE COLLATION builtin_posix ( PROVIDER = builtin, LOCALE = "POSIX" ); + +SELECT b FROM collate_test1 ORDER BY b COLLATE 
builtin_c; + +CREATE COLLATION builtin2 ( PROVIDER = builtin ); -- fails +CREATE COLLATION builtin2 ( PROVIDER = builtin, LOCALE = "en_US" ); -- fails +CREATE COLLATION builtin2 ( PROVIDER = builtin, LC_CTYPE = "C", LC_COLLATE = "C" ); -- fails +CREATE COLLATION builtin2 ( PROVIDER = builtin, LOCALE = "POSIX", LC_CTYPE = "POSIX" ); -- fails + CREATE COLLATION mycoll1 FROM "C"; CREATE COLLATION mycoll2 ( LC_COLLATE = "POSIX", LC_CTYPE = "POSIX" ); CREATE COLLATION mycoll3 FROM "default"; -- intentionally unsupported diff --git a/src/test/regress/sql/collate.utf8.sql b/src/test/regress/sql/collate.utf8.sql new file mode 100644 index 0000000000..dadd8f8032 --- /dev/null +++ b/src/test/regress/sql/collate.utf8.sql @@ -0,0 +1,47 @@ +/* + * This test is for collations and character operations when using the + * builtin provider with the C.UTF-8 locale. + */ + +/* skip test if not UTF8 server encoding */ +SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset +\if :skip_test +\quit +\endif + +SET client_encoding TO UTF8; + +-- +-- Test builtin provider with C.UTF-8 locale. 
+-- +CREATE COLLATION builtin_c_utf8 (PROVIDER = builtin, LOCALE = 'C.UTF-8'); + +CREATE TABLE builtin_test ( + t TEXT COLLATE builtin_c_utf8 +); +INSERT INTO builtin_test VALUES + ('abc DEF'), + ('DŽxxx Džxxx džxxx'), + ('xxxDŽ xxxDž xxxdž'); + +SELECT t, lower(t), initcap(t), upper(t) FROM builtin_test; + +DROP TABLE builtin_test; + +-- character classes + +select 'xyz' ~ '[[:alnum:]]' collate builtin_c_utf8; +select 'xyz' !~ '[[:upper:]]' collate builtin_c_utf8; +select '@' !~ '[[:alnum:]]' collate builtin_c_utf8; +select '@' ~ '[[:punct:]]' collate builtin_c_utf8; -- symbols are punctuation in posix +select 'a8a' ~ '[[:digit:]]' collate builtin_c_utf8; +select '൧' !~ '\d' collate builtin_c_utf8; -- only 0-9 considered digits in posix + +-- case mapping + +select 'xYz' ~* 'XyZ' collate builtin_c_utf8; +select 'xAb' ~* '[W-Y]' collate builtin_c_utf8; +select 'xAb' !~* '[c-d]' collate builtin_c_utf8; +select 'Δ' ~* '[α-λ]' collate builtin_c_utf8; + +DROP COLLATION builtin_c_utf8; -- 2.34.1