From 5ffd8d5d3a327b86c8170a8125344e9b1aeb028d Mon Sep 17 00:00:00 2001 From: Jeff Davis Date: Mon, 1 May 2023 15:38:29 -0700 Subject: [PATCH v15 5/5] Introduce collation provider "builtin" for "C" and "C.UTF-8". The builtin "C" locale is equal (in semantics and implementation) to the libc "C" locale. The builtin "C.UTF-8" locale is especially useful. It provides a fast memcmp-based collation (like "C") that supports abbreviated keys, while also providing richer ctype semantics (upper/lower and regexes). The semantics are derived from Unicode by building in lookup tables in the same way as for text normalization. By using built-in semantics, the behavior is stabilized within a Postgres major version, and also matches the behavior of other built-in Unicode functionality, such as normalization. Discussion: https://postgr.es/m/ab925f69-5f9d-f85e-b87c-bd2a44798659@joeconway.com Discussion: https://postgr.es/m/dd9261f4-7a98-4565-93ec-336c1c110d90@manitou-mail.org --- doc/src/sgml/charset.sgml | 88 +++++++-- doc/src/sgml/ref/create_collation.sgml | 11 +- doc/src/sgml/ref/create_database.sgml | 8 +- doc/src/sgml/ref/createdb.sgml | 2 +- doc/src/sgml/ref/initdb.sgml | 7 +- src/backend/catalog/pg_collation.c | 5 +- src/backend/commands/collationcmds.c | 93 +++++++-- src/backend/commands/dbcommands.c | 123 +++++++++--- src/backend/regex/regc_pg_locale.c | 41 +++- src/backend/utils/adt/formatting.c | 193 +++++++++++++++++++ src/backend/utils/adt/pg_locale.c | 130 +++++++++++-- src/backend/utils/init/postinit.c | 27 ++- src/bin/initdb/initdb.c | 30 ++- src/bin/initdb/t/001_initdb.pl | 55 ++++++ src/bin/pg_dump/pg_dump.c | 15 +- src/bin/pg_upgrade/t/002_pg_upgrade.pl | 74 +++++-- src/bin/psql/describe.c | 4 +- src/bin/scripts/createdb.c | 18 +- src/bin/scripts/t/020_createdb.pl | 72 +++++++ src/include/catalog/pg_collation.dat | 9 +- src/include/catalog/pg_collation.h | 3 + src/include/utils/pg_locale.h | 7 +- src/test/icu/t/010_database.pl | 18 +- 
src/test/regress/expected/collate.out | 24 ++- src/test/regress/expected/collate.utf8.out | 97 ++++++++++ src/test/regress/expected/collate.utf8_1.out | 8 + src/test/regress/parallel_schedule | 4 +- src/test/regress/sql/collate.sql | 10 + src/test/regress/sql/collate.utf8.sql | 45 +++++ 29 files changed, 1082 insertions(+), 139 deletions(-) create mode 100644 src/test/regress/expected/collate.utf8.out create mode 100644 src/test/regress/expected/collate.utf8_1.out create mode 100644 src/test/regress/sql/collate.utf8.sql diff --git a/doc/src/sgml/charset.sgml b/doc/src/sgml/charset.sgml index 74783d148f..1553deea20 100644 --- a/doc/src/sgml/charset.sgml +++ b/doc/src/sgml/charset.sgml @@ -342,22 +342,14 @@ initdb --locale=sv_SE Locale Providers - PostgreSQL supports multiple locale - providers. This specifies which library supplies the locale - data. One standard provider name is libc, which uses - the locales provided by the operating system C library. These are the - locales used by most tools provided by the operating system. Another - provider is icu, which uses the external - ICUICU library. ICU locales can - only be used if support for ICU was configured when PostgreSQL was built. + A locale provider specifies which library defines the locale behavior for + collations and character classifications. The commands and tools that select the locale settings, as described - above, each have an option to select the locale provider. The examples - shown earlier all use the libc provider, which is the - default. Here is an example to initialize a database cluster using the - ICU provider: + above, each have an option to select the locale provider. Here is an + example to initialize a database cluster using the ICU provider: initdb --locale-provider=icu --icu-locale=en @@ -370,12 +362,74 @@ initdb --locale-provider=icu --icu-locale=en - Which locale provider to use depends on individual requirements. For most - basic uses, either provider will give adequate results. 
For the libc - provider, it depends on what the operating system offers; some operating - systems are better than others. For advanced uses, ICU offers more locale - variants and customization options. + Regardless of the locale provider, the operating system is still used to + provide some locale-aware behavior, such as messages (see ). + + + The available locale providers are listed below. + + + + Builtin + + The builtin provider uses built-in operations. Only + the C and C.UTF-8 locales are + supported for this provider. + + + The C locale behavior is identical to the + C locale in the libc provider. When using this locale, + the behavior may depend on the database encoding. + + + The C.UTF-8 locale is available only when the + database encoding is UTF-8, and the behavior is based + on Unicode. The collation uses the code point values only. The regular + expression character classes are based on the "POSIX Compatible" + semantics, and the case mapping is the "simple" variant. + + + + ICU + + The icu provider uses the external + ICUICU + library. PostgreSQL must have been configured + with support. + + + ICU provides collation and character classification behavior that is + independent of the operating system and database encoding, which is + preferable if you expect to transition to other platforms without any + change in results. LC_COLLATE and + LC_CTYPE can be set independently of the ICU locale. + + + + For the ICU provider, results may depend on the version of the ICU + library used, as it is updated to reflect changes in natural language + over time. + + + + + libc + + The libc provider uses the operating system's C + library. The collation and character classification behavior is + controlled by the settings LC_COLLATE and + LC_CTYPE, so they cannot be set independently. + + + + The same locale name may have different behavior on different platforms + when using the libc provider. 
+ + + + diff --git a/doc/src/sgml/ref/create_collation.sgml b/doc/src/sgml/ref/create_collation.sgml index 5cf9777764..85f18cbbe5 100644 --- a/doc/src/sgml/ref/create_collation.sgml +++ b/doc/src/sgml/ref/create_collation.sgml @@ -96,6 +96,11 @@ CREATE COLLATION [ IF NOT EXISTS ] name FROM locale, you cannot specify either of those parameters. + + If provider is builtin, + then locale must be specified and set to + either C or C.UTF-8. + @@ -129,9 +134,9 @@ CREATE COLLATION [ IF NOT EXISTS ] name FROM Specifies the provider to use for locale services associated with this - collation. Possible values are - icuICU - (if the server was built with ICU support) or libc. + collation. Possible values are builtin, + icuICU (if + the server was built with ICU support) or libc. libc is the default. See for details. diff --git a/doc/src/sgml/ref/create_database.sgml b/doc/src/sgml/ref/create_database.sgml index ce7317f81b..6dc3348d1b 100644 --- a/doc/src/sgml/ref/create_database.sgml +++ b/doc/src/sgml/ref/create_database.sgml @@ -162,6 +162,12 @@ CREATE DATABASE name linkend="create-database-lc-ctype"/>, or individually. + + If is + builtin, then locale + must be specified and set to either C or + C.UTF-8. + The other locale settings , name Specifies the provider to use for the default collation in this - database. Possible values are + database. Possible values are builtin, icuICU (if the server was built with ICU support) or libc. By default, the provider is the same as that of the - + Specifies the locale provider for the database's default collation. diff --git a/doc/src/sgml/ref/initdb.sgml b/doc/src/sgml/ref/initdb.sgml index d43c91575c..2192d017b3 100644 --- a/doc/src/sgml/ref/initdb.sgml +++ b/doc/src/sgml/ref/initdb.sgml @@ -286,6 +286,11 @@ PostgreSQL documentation environment that initdb runs in. Locale support is described in . + + If is builtin, + must be specified and set to + C or C.UTF-8. 
+ @@ -315,7 +320,7 @@ PostgreSQL documentation - + This option sets the locale provider for databases created in the new diff --git a/src/backend/catalog/pg_collation.c b/src/backend/catalog/pg_collation.c index b615d2fc7a..2144e11fe8 100644 --- a/src/backend/catalog/pg_collation.c +++ b/src/backend/catalog/pg_collation.c @@ -68,7 +68,10 @@ CollationCreate(const char *collname, Oid collnamespace, Assert(collname); Assert(collnamespace); Assert(collowner); - Assert((collcollate && collctype) || colllocale); + Assert((collprovider == COLLPROVIDER_LIBC && + collcollate && collctype && !colllocale) || + (collprovider != COLLPROVIDER_LIBC && + !collcollate && !collctype && colllocale)); /* * Make sure there is no existing collation of same name & encoding. diff --git a/src/backend/commands/collationcmds.c b/src/backend/commands/collationcmds.c index 19e61b3e5b..505b8ae86d 100644 --- a/src/backend/commands/collationcmds.c +++ b/src/backend/commands/collationcmds.c @@ -68,7 +68,7 @@ DefineCollation(ParseState *pstate, List *names, List *parameters, bool if_not_e DefElem *versionEl = NULL; char *collcollate; char *collctype; - char *colllocale; + const char *colllocale; char *collicurules; bool collisdeterministic; int collencoding; @@ -215,7 +215,9 @@ DefineCollation(ParseState *pstate, List *names, List *parameters, bool if_not_e if (collproviderstr) { - if (pg_strcasecmp(collproviderstr, "icu") == 0) + if (pg_strcasecmp(collproviderstr, "builtin") == 0) + collprovider = COLLPROVIDER_BUILTIN; + else if (pg_strcasecmp(collproviderstr, "icu") == 0) collprovider = COLLPROVIDER_ICU; else if (pg_strcasecmp(collproviderstr, "libc") == 0) collprovider = COLLPROVIDER_LIBC; @@ -245,7 +247,17 @@ DefineCollation(ParseState *pstate, List *names, List *parameters, bool if_not_e if (lcctypeEl) collctype = defGetString(lcctypeEl); - if (collprovider == COLLPROVIDER_LIBC) + if (collprovider == COLLPROVIDER_BUILTIN) + { + if (!colllocale) + ereport(ERROR, + 
(errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("parameter \"locale\" must be specified"))); + + colllocale = builtin_validate_locale(GetDatabaseEncoding(), + colllocale); + } + else if (collprovider == COLLPROVIDER_LIBC) { if (!collcollate) ereport(ERROR, @@ -305,7 +317,17 @@ DefineCollation(ParseState *pstate, List *names, List *parameters, bool if_not_e (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), errmsg("ICU rules cannot be specified unless locale provider is ICU"))); - if (collprovider == COLLPROVIDER_ICU) + if (collprovider == COLLPROVIDER_BUILTIN) + { + /* + * Behavior may be different in different encodings, so set + * collencoding to the current database encoding. No validation is + * required, because the "builtin" provider is compatible with any + * encoding. + */ + collencoding = GetDatabaseEncoding(); + } + else if (collprovider == COLLPROVIDER_ICU) { #ifdef USE_ICU /* @@ -334,7 +356,18 @@ DefineCollation(ParseState *pstate, List *names, List *parameters, bool if_not_e } if (!collversion) - collversion = get_collation_actual_version(collprovider, collprovider == COLLPROVIDER_ICU ? colllocale : collcollate); + { + const char *locale; + + if (collprovider == COLLPROVIDER_ICU) + locale = colllocale; + else if (collprovider == COLLPROVIDER_LIBC) + locale = collcollate; + else + locale = NULL; /* COLLPROVIDER_BUILTIN */ + + collversion = get_collation_actual_version(collprovider, locale); + } newoid = CollationCreate(collName, collNamespace, @@ -409,6 +442,7 @@ AlterCollation(AlterCollationStmt *stmt) Form_pg_collation collForm; Datum datum; bool isnull; + char *locale; char *oldversion; char *newversion; ObjectAddress address; @@ -435,8 +469,20 @@ AlterCollation(AlterCollationStmt *stmt) datum = SysCacheGetAttr(COLLOID, tup, Anum_pg_collation_collversion, &isnull); oldversion = isnull ? NULL : TextDatumGetCString(datum); - datum = SysCacheGetAttrNotNull(COLLOID, tup, collForm->collprovider == COLLPROVIDER_ICU ? 
Anum_pg_collation_colllocale : Anum_pg_collation_collcollate); - newversion = get_collation_actual_version(collForm->collprovider, TextDatumGetCString(datum)); + if (collForm->collprovider == COLLPROVIDER_ICU) + { + datum = SysCacheGetAttrNotNull(COLLOID, tup, Anum_pg_collation_colllocale); + locale = TextDatumGetCString(datum); + } + else if (collForm->collprovider == COLLPROVIDER_LIBC) + { + datum = SysCacheGetAttrNotNull(COLLOID, tup, Anum_pg_collation_collcollate); + locale = TextDatumGetCString(datum); + } + else + locale = NULL; /* COLLPROVIDER_BUILTIN */ + + newversion = get_collation_actual_version(collForm->collprovider, locale); /* cannot change from NULL to non-NULL or vice versa */ if ((!oldversion && newversion) || (oldversion && !newversion)) @@ -500,11 +546,18 @@ pg_collation_actual_version(PG_FUNCTION_ARGS) provider = ((Form_pg_database) GETSTRUCT(dbtup))->datlocprovider; - datum = SysCacheGetAttrNotNull(DATABASEOID, dbtup, - provider == COLLPROVIDER_ICU ? - Anum_pg_database_datlocale : Anum_pg_database_datcollate); - - locale = TextDatumGetCString(datum); + if (provider == COLLPROVIDER_ICU) + { + datum = SysCacheGetAttrNotNull(DATABASEOID, dbtup, Anum_pg_database_datlocale); + locale = TextDatumGetCString(datum); + } + else if (provider == COLLPROVIDER_LIBC) + { + datum = SysCacheGetAttrNotNull(DATABASEOID, dbtup, Anum_pg_database_datcollate); + locale = TextDatumGetCString(datum); + } + else + locale = NULL; /* COLLPROVIDER_BUILTIN */ ReleaseSysCache(dbtup); } @@ -521,11 +574,19 @@ pg_collation_actual_version(PG_FUNCTION_ARGS) provider = ((Form_pg_collation) GETSTRUCT(colltp))->collprovider; Assert(provider != COLLPROVIDER_DEFAULT); - datum = SysCacheGetAttrNotNull(COLLOID, colltp, - provider == COLLPROVIDER_ICU ? 
- Anum_pg_collation_colllocale : Anum_pg_collation_collcollate); - locale = TextDatumGetCString(datum); + if (provider == COLLPROVIDER_ICU) + { + datum = SysCacheGetAttrNotNull(COLLOID, colltp, Anum_pg_collation_colllocale); + locale = TextDatumGetCString(datum); + } + else if (provider == COLLPROVIDER_LIBC) + { + datum = SysCacheGetAttrNotNull(COLLOID, colltp, Anum_pg_collation_collcollate); + locale = TextDatumGetCString(datum); + } + else + locale = NULL; /* COLLPROVIDER_BUILTIN */ ReleaseSysCache(colltp); } diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index c8732be067..b04ff25562 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -698,6 +698,7 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) DefElem *dtemplate = NULL; DefElem *dencoding = NULL; DefElem *dlocale = NULL; + DefElem *dbuiltinlocale = NULL; DefElem *dcollate = NULL; DefElem *dctype = NULL; DefElem *diculocale = NULL; @@ -713,7 +714,7 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) const char *dbtemplate = NULL; char *dbcollate = NULL; char *dbctype = NULL; - char *dblocale = NULL; + const char *dblocale = NULL; char *dbicurules = NULL; char dblocprovider = '\0'; char *canonname; @@ -762,6 +763,12 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) errorConflictingDefElem(defel, pstate); dlocale = defel; } + else if (strcmp(defel->defname, "builtin_locale") == 0) + { + if (dbuiltinlocale) + errorConflictingDefElem(defel, pstate); + dbuiltinlocale = defel; + } else if (strcmp(defel->defname, "lc_collate") == 0) { if (dcollate) @@ -897,7 +904,10 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) { dbcollate = defGetString(dlocale); dbctype = defGetString(dlocale); + dblocale = defGetString(dlocale); } + if (dbuiltinlocale && dbuiltinlocale->arg) + dblocale = defGetString(dbuiltinlocale); if (dcollate && dcollate->arg) dbcollate = defGetString(dcollate); if (dctype && dctype->arg) @@ -910,7 
+920,9 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) { char *locproviderstr = defGetString(dlocprovider); - if (pg_strcasecmp(locproviderstr, "icu") == 0) + if (pg_strcasecmp(locproviderstr, "builtin") == 0) + dblocprovider = COLLPROVIDER_BUILTIN; + else if (pg_strcasecmp(locproviderstr, "icu") == 0) dblocprovider = COLLPROVIDER_ICU; else if (pg_strcasecmp(locproviderstr, "libc") == 0) dblocprovider = COLLPROVIDER_LIBC; @@ -1027,14 +1039,9 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) dbctype = src_ctype; if (dblocprovider == '\0') dblocprovider = src_locprovider; - if (dblocale == NULL && dblocprovider == COLLPROVIDER_ICU) - { - if (dlocale && dlocale->arg) - dblocale = defGetString(dlocale); - else - dblocale = src_locale; - } - if (dbicurules == NULL && dblocprovider == COLLPROVIDER_ICU) + if (dblocale == NULL) + dblocale = src_locale; + if (dbicurules == NULL) dbicurules = src_icurules; /* Some encodings are client only */ @@ -1059,6 +1066,27 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) check_encoding_locale_matches(encoding, dbcollate, dbctype); + if (dblocprovider == COLLPROVIDER_BUILTIN) + { + /* + * This would happen if template0 uses the libc provider but the new + * database uses builtin. 
+ */ + if (!dblocale) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("LOCALE must be specified for the builtin provider"))); + + dblocale = builtin_validate_locale(encoding, dblocale); + } + else + { + if (dbuiltinlocale && dbuiltinlocale->arg) + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("BUILTIN_LOCALE cannot be specified unless locale provider is builtin"))); + } + if (dblocprovider == COLLPROVIDER_ICU) { if (!(is_encoding_supported_by_icu(encoding))) @@ -1100,7 +1128,7 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) } else { - if (dblocale) + if (diculocale && diculocale->arg) ereport(ERROR, (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), errmsg("ICU locale cannot be specified unless locale provider is ICU"))); @@ -1111,6 +1139,10 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) errmsg("ICU rules cannot be specified unless locale provider is ICU"))); } + /* for libc, locale comes from datcollate and datctype */ + if (dblocprovider == COLLPROVIDER_LIBC) + dblocale = NULL; + /* * Check that the new encoding and locale settings match the source * database. We insist on this because we simply copy the source data --- @@ -1195,9 +1227,17 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) */ if (src_collversion && !dcollversion) { - char *actual_versionstr; + char *actual_versionstr; + const char *locale; - actual_versionstr = get_collation_actual_version(dblocprovider, dblocprovider == COLLPROVIDER_ICU ? 
dblocale : dbcollate); + if (dblocprovider == COLLPROVIDER_ICU) + locale = dblocale; + else if (dblocprovider == COLLPROVIDER_LIBC) + locale = dbcollate; + else + locale = NULL; /* COLLPROVIDER_BUILTIN */ + + actual_versionstr = get_collation_actual_version(dblocprovider, locale); if (!actual_versionstr) ereport(ERROR, (errmsg("template database \"%s\" has a collation version, but no actual collation version could be determined", @@ -1225,7 +1265,18 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) * collation version, which is normally only the case for template0. */ if (dbcollversion == NULL) - dbcollversion = get_collation_actual_version(dblocprovider, dblocprovider == COLLPROVIDER_ICU ? dblocale : dbcollate); + { + const char *locale; + + if (dblocprovider == COLLPROVIDER_ICU) + locale = dblocale; + else if (dblocprovider == COLLPROVIDER_LIBC) + locale = dbcollate; + else + locale = NULL; /* COLLPROVIDER_BUILTIN */ + + dbcollversion = get_collation_actual_version(dblocprovider, locale); + } /* Resolve default tablespace for new database */ if (dtablespacename && dtablespacename->arg) @@ -1364,8 +1415,8 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) * block on the unique index, and fail after we commit). */ - Assert((dblocprovider == COLLPROVIDER_ICU && dblocale) || - (dblocprovider != COLLPROVIDER_ICU && !dblocale)); + Assert((dblocprovider != COLLPROVIDER_LIBC && dblocale) || + (dblocprovider == COLLPROVIDER_LIBC && !dblocale)); /* Form tuple */ new_record[Anum_pg_database_oid - 1] = ObjectIdGetDatum(dboid); @@ -2446,6 +2497,7 @@ AlterDatabaseRefreshColl(AlterDatabaseRefreshCollStmt *stmt) ObjectAddress address; Datum datum; bool isnull; + char *locale; char *oldversion; char *newversion; @@ -2472,10 +2524,24 @@ AlterDatabaseRefreshColl(AlterDatabaseRefreshCollStmt *stmt) datum = heap_getattr(tuple, Anum_pg_database_datcollversion, RelationGetDescr(rel), &isnull); oldversion = isnull ? 
NULL : TextDatumGetCString(datum); - datum = heap_getattr(tuple, datForm->datlocprovider == COLLPROVIDER_ICU ? Anum_pg_database_datlocale : Anum_pg_database_datcollate, RelationGetDescr(rel), &isnull); - if (isnull) - elog(ERROR, "unexpected null in pg_database"); - newversion = get_collation_actual_version(datForm->datlocprovider, TextDatumGetCString(datum)); + if (datForm->datlocprovider == COLLPROVIDER_ICU) + { + datum = heap_getattr(tuple, Anum_pg_database_datlocale, RelationGetDescr(rel), &isnull); + if (isnull) + elog(ERROR, "unexpected null in pg_database"); + locale = TextDatumGetCString(datum); + } + else if (datForm->datlocprovider == COLLPROVIDER_LIBC) + { + datum = heap_getattr(tuple, Anum_pg_database_datcollate, RelationGetDescr(rel), &isnull); + if (isnull) + elog(ERROR, "unexpected null in pg_database"); + locale = TextDatumGetCString(datum); + } + else + locale = NULL; /* COLLPROVIDER_BUILTIN */ + + newversion = get_collation_actual_version(datForm->datlocprovider, locale); /* cannot change from NULL to non-NULL or vice versa */ if ((!oldversion && newversion) || (oldversion && !newversion)) @@ -2660,6 +2726,7 @@ pg_database_collation_actual_version(PG_FUNCTION_ARGS) HeapTuple tp; char datlocprovider; Datum datum; + char *locale; char *version; tp = SearchSysCache1(DATABASEOID, ObjectIdGetDatum(dbid)); @@ -2670,8 +2737,20 @@ pg_database_collation_actual_version(PG_FUNCTION_ARGS) datlocprovider = ((Form_pg_database) GETSTRUCT(tp))->datlocprovider; - datum = SysCacheGetAttrNotNull(DATABASEOID, tp, datlocprovider == COLLPROVIDER_ICU ? 
Anum_pg_database_datlocale : Anum_pg_database_datcollate); - version = get_collation_actual_version(datlocprovider, TextDatumGetCString(datum)); + if (datlocprovider == COLLPROVIDER_ICU) + { + datum = SysCacheGetAttrNotNull(DATABASEOID, tp, Anum_pg_database_datlocale); + locale = TextDatumGetCString(datum); + } + else if (datlocprovider == COLLPROVIDER_LIBC) + { + datum = SysCacheGetAttrNotNull(DATABASEOID, tp, Anum_pg_database_datcollate); + locale = TextDatumGetCString(datum); + } + else + locale = NULL; /* COLLPROVIDER_BUILTIN */ + + version = get_collation_actual_version(datlocprovider, locale); ReleaseSysCache(tp); diff --git a/src/backend/regex/regc_pg_locale.c b/src/backend/regex/regc_pg_locale.c index 42d15b6303..0278f45adc 100644 --- a/src/backend/regex/regc_pg_locale.c +++ b/src/backend/regex/regc_pg_locale.c @@ -16,6 +16,8 @@ */ #include "catalog/pg_collation.h" +#include "common/unicode_case.h" +#include "common/unicode_category.h" #include "utils/pg_locale.h" /* @@ -64,6 +66,7 @@ typedef enum { PG_REGEX_LOCALE_C, /* C locale (encoding independent) */ + PG_REGEX_BUILTIN, /* built-in Unicode semantics */ PG_REGEX_LOCALE_WIDE, /* Use functions */ PG_REGEX_LOCALE_1BYTE, /* Use functions */ PG_REGEX_LOCALE_WIDE_L, /* Use locale_t functions */ @@ -75,6 +78,8 @@ static PG_Locale_Strategy pg_regex_strategy; static pg_locale_t pg_regex_locale; static Oid pg_regex_collation; +static bool regex_builtin_cclass_posix = false; + /* * Hard-wired character properties for C locale */ @@ -266,7 +271,15 @@ pg_set_regex_collation(Oid collation) if (GetDatabaseEncoding() == PG_UTF8) { if (pg_regex_locale) - pg_regex_strategy = PG_REGEX_LOCALE_WIDE_L; + { + if (pg_regex_locale->provider == COLLPROVIDER_BUILTIN) + { + pg_regex_strategy = PG_REGEX_BUILTIN; + regex_builtin_cclass_posix = pg_regex_locale->info.builtin.cclass_posix; + } + else + pg_regex_strategy = PG_REGEX_LOCALE_WIDE_L; + } else pg_regex_strategy = PG_REGEX_LOCALE_WIDE; } @@ -290,6 +303,8 @@ 
pg_wc_isdigit(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISDIGIT)); + case PG_REGEX_BUILTIN: + return pg_u_isdigit(c, regex_builtin_cclass_posix); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswdigit((wint_t) c); @@ -322,6 +337,8 @@ pg_wc_isalpha(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISALPHA)); + case PG_REGEX_BUILTIN: + return pg_u_isalpha(c); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswalpha((wint_t) c); @@ -354,6 +371,8 @@ pg_wc_isalnum(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISALNUM)); + case PG_REGEX_BUILTIN: + return pg_u_isalnum(c, regex_builtin_cclass_posix); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswalnum((wint_t) c); @@ -395,6 +414,8 @@ pg_wc_isupper(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISUPPER)); + case PG_REGEX_BUILTIN: + return pg_u_isupper(c); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswupper((wint_t) c); @@ -427,6 +448,8 @@ pg_wc_islower(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISLOWER)); + case PG_REGEX_BUILTIN: + return pg_u_islower(c); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswlower((wint_t) c); @@ -459,6 +482,8 @@ pg_wc_isgraph(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISGRAPH)); + case PG_REGEX_BUILTIN: + return pg_u_isgraph(c); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswgraph((wint_t) c); @@ -491,6 +516,8 @@ pg_wc_isprint(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISPRINT)); + case PG_REGEX_BUILTIN: + 
return pg_u_isprint(c); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswprint((wint_t) c); @@ -523,6 +550,8 @@ pg_wc_ispunct(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISPUNCT)); + case PG_REGEX_BUILTIN: + return pg_u_ispunct(c, regex_builtin_cclass_posix); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswpunct((wint_t) c); @@ -555,6 +584,8 @@ pg_wc_isspace(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISSPACE)); + case PG_REGEX_BUILTIN: + return pg_u_isspace(c); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswspace((wint_t) c); @@ -588,6 +619,8 @@ pg_wc_toupper(pg_wchar c) if (c <= (pg_wchar) 127) return pg_ascii_toupper((unsigned char) c); return c; + case PG_REGEX_BUILTIN: + return unicode_uppercase_simple(c); case PG_REGEX_LOCALE_WIDE: /* force C behavior for ASCII characters, per comments above */ if (c <= (pg_wchar) 127) @@ -628,6 +661,8 @@ pg_wc_tolower(pg_wchar c) if (c <= (pg_wchar) 127) return pg_ascii_tolower((unsigned char) c); return c; + case PG_REGEX_BUILTIN: + return unicode_lowercase_simple(c); case PG_REGEX_LOCALE_WIDE: /* force C behavior for ASCII characters, per comments above */ if (c <= (pg_wchar) 127) @@ -792,6 +827,9 @@ pg_ctype_get_cache(pg_wc_probefunc probefunc, int cclasscode) max_chr = (pg_wchar) MAX_SIMPLE_CHR; #endif break; + case PG_REGEX_BUILTIN: + max_chr = (pg_wchar) MAX_SIMPLE_CHR; + break; case PG_REGEX_LOCALE_WIDE: case PG_REGEX_LOCALE_WIDE_L: max_chr = (pg_wchar) MAX_SIMPLE_CHR; @@ -809,6 +847,7 @@ pg_ctype_get_cache(pg_wc_probefunc probefunc, int cclasscode) max_chr = (pg_wchar) MAX_SIMPLE_CHR; break; default: + Assert(false); max_chr = 0; /* can't get here, but keep compiler quiet */ break; } diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c index d176723d95..c1f04eb03a 
100644 --- a/src/backend/utils/adt/formatting.c +++ b/src/backend/utils/adt/formatting.c @@ -77,6 +77,8 @@ #include "catalog/pg_collation.h" #include "catalog/pg_type.h" +#include "common/unicode_case.h" +#include "common/unicode_category.h" #include "mb/pg_wchar.h" #include "nodes/miscnodes.h" #include "parser/scansup.h" @@ -1670,6 +1672,67 @@ str_tolower(const char *buff, size_t nbytes, Oid collid) } else #endif + if (mylocale && mylocale->provider == COLLPROVIDER_BUILTIN) + { + const unsigned char *orig = (unsigned char *) buff; + unsigned char *workspace; + size_t workspace_size = nbytes + 1; + const unsigned char *sp; + unsigned char *rp; + + Assert(GetDatabaseEncoding() == PG_UTF8); + + /* Output workspace cannot have more codes than input bytes */ + workspace = (unsigned char *)palloc(workspace_size); + + sp = orig; + rp = workspace; + while (sp - orig < nbytes) + { + pg_wchar u1 = utf8_to_unicode(sp); + pg_wchar u2 = unicode_lowercase_simple(u1); + + /* + * If we can't fit 4 more bytes, and the next character to + * write is multibyte, reallocate buffer to maximum size we + * will need. 
+ */ + if (rp - workspace > workspace_size - 4 && u2 >= 0x80) + { + int written = rp - workspace; + + /* Overflow paranoia */ + if ((nbytes + 1) > (INT_MAX / sizeof(pg_wchar))) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + + workspace_size = (nbytes + 1) * sizeof(pg_wchar); + workspace = repalloc(workspace, workspace_size); + rp = workspace + written; + } + + unicode_to_utf8(u2, rp); + sp += pg_utf_mblen(sp); + rp += pg_utf_mblen(rp); + } + + *rp = '\0'; + rp++; + + if (workspace_size == rp - workspace) + { + result = (char *) workspace; + } + else + { + /* shrink buffer and store result */ + result = palloc(rp - workspace); + memcpy(result, workspace, rp - workspace); + pfree(workspace); + } + } + else { if (pg_database_encoding_max_length() > 1) { @@ -1788,6 +1851,67 @@ str_toupper(const char *buff, size_t nbytes, Oid collid) } else #endif + if (mylocale && mylocale->provider == COLLPROVIDER_BUILTIN) + { + const unsigned char *orig = (unsigned char *) buff; + unsigned char *workspace; + size_t workspace_size = nbytes + 1; + const unsigned char *sp; + unsigned char *rp; + + Assert(GetDatabaseEncoding() == PG_UTF8); + + /* Output workspace cannot have more codes than input bytes */ + workspace = (unsigned char *)palloc(workspace_size); + + sp = orig; + rp = workspace; + while (sp - orig < nbytes) + { + pg_wchar u1 = utf8_to_unicode(sp); + pg_wchar u2 = unicode_uppercase_simple(u1); + + /* + * If we can't fit 4 more bytes, and the next character to + * write is multibyte, reallocate buffer to maximum size we + * will need. 
+ */ + if (rp - workspace > workspace_size - 4 && u2 >= 0x80) + { + int written = rp - workspace; + + /* Overflow paranoia */ + if ((nbytes + 1) > (INT_MAX / sizeof(pg_wchar))) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + + workspace_size = (nbytes + 1) * sizeof(pg_wchar); + workspace = repalloc(workspace, workspace_size); + rp = workspace + written; + } + + unicode_to_utf8(u2, rp); + sp += pg_utf_mblen(sp); + rp += pg_utf_mblen(rp); + } + + *rp = '\0'; + rp++; + + if (workspace_size == rp - workspace) + { + result = (char *) workspace; + } + else + { + /* shrink buffer and store result */ + result = palloc(rp - workspace); + memcpy(result, workspace, rp - workspace); + pfree(workspace); + } + } + else { if (pg_database_encoding_max_length() > 1) { @@ -1907,6 +2031,75 @@ str_initcap(const char *buff, size_t nbytes, Oid collid) } else #endif + if (mylocale && mylocale->provider == COLLPROVIDER_BUILTIN) + { + const unsigned char *orig = (unsigned char *) buff; + unsigned char *workspace; + size_t workspace_size = nbytes + 1; + const unsigned char *sp; + unsigned char *rp; + + Assert(GetDatabaseEncoding() == PG_UTF8); + + /* Output workspace cannot have more codes than input bytes */ + workspace = (unsigned char *)palloc(workspace_size); + + sp = orig; + rp = workspace; + while (sp - orig < nbytes) + { + pg_wchar u1 = utf8_to_unicode(sp); + pg_wchar u2; + + if (wasalnum) + u2 = unicode_lowercase_simple(u1); + else + u2 = unicode_titlecase_simple(u1); + + /* + * If we can't fit 4 more bytes, and the next character to + * write is multibyte, reallocate buffer to maximum size we + * will need. 
+ */ + if ((size_t) (rp - workspace) + 4 > workspace_size && u2 >= 0x80) + { + int written = rp - workspace; + + /* Overflow paranoia */ + if ((nbytes + 1) > (INT_MAX / sizeof(pg_wchar))) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + + workspace_size = (nbytes + 1) * sizeof(pg_wchar); + workspace = repalloc(workspace, workspace_size); + rp = workspace + written; + } + + unicode_to_utf8(u2, rp); + + wasalnum = pg_u_isalnum(u2, mylocale->info.builtin.cclass_posix); + + sp += pg_utf_mblen(sp); + rp += pg_utf_mblen(rp); + } + + *rp = '\0'; + rp++; + + if (workspace_size == rp - workspace) + { + result = (char *) workspace; + } + else + { + /* shrink buffer and store result */ + result = palloc(rp - workspace); + memcpy(result, workspace, rp - workspace); + pfree(workspace); + } + } + else { if (pg_database_encoding_max_length() > 1) { diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c index 9d8634ec25..7067e29c2d 100644 --- a/src/backend/utils/adt/pg_locale.c +++ b/src/backend/utils/adt/pg_locale.c @@ -1269,7 +1269,19 @@ lookup_collation_cache(Oid collation, bool set_flags) elog(ERROR, "cache lookup failed for collation %u", collation); collform = (Form_pg_collation) GETSTRUCT(tp); - if (collform->collprovider == COLLPROVIDER_LIBC) + if (collform->collprovider == COLLPROVIDER_BUILTIN) + { + Datum datum; + const char *colllocale; + + datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_colllocale); + colllocale = TextDatumGetCString(datum); + + cache_entry->collate_is_c = true; + cache_entry->ctype_is_c = ((strcmp(colllocale, "C") == 0) || + (strcmp(colllocale, "POSIX") == 0)); + } + else if (collform->collprovider == COLLPROVIDER_LIBC) { Datum datum; const char *collcollate; @@ -1320,16 +1332,30 @@ lc_collate_is_c(Oid collation) if (collation == DEFAULT_COLLATION_OID) { static int result = -1; - char *localeptr; - - if (default_locale.provider == COLLPROVIDER_ICU) - return false; + const char 
*localeptr; if (result >= 0) return (bool) result; - localeptr = setlocale(LC_COLLATE, NULL); - if (!localeptr) - elog(ERROR, "invalid LC_COLLATE setting"); + + if (default_locale.provider == COLLPROVIDER_BUILTIN) + { + result = true; + return (bool) result; + } + else if (default_locale.provider == COLLPROVIDER_ICU) + { + result = false; + return (bool) result; + } + else if (default_locale.provider == COLLPROVIDER_LIBC) + { + localeptr = setlocale(LC_COLLATE, NULL); + if (!localeptr) + elog(ERROR, "invalid LC_COLLATE setting"); + } + else + elog(ERROR, "unexpected collation provider '%c'", + default_locale.provider); if (strcmp(localeptr, "C") == 0) result = true; @@ -1373,16 +1399,29 @@ lc_ctype_is_c(Oid collation) if (collation == DEFAULT_COLLATION_OID) { static int result = -1; - char *localeptr; - - if (default_locale.provider == COLLPROVIDER_ICU) - return false; + const char *localeptr; if (result >= 0) return (bool) result; - localeptr = setlocale(LC_CTYPE, NULL); - if (!localeptr) - elog(ERROR, "invalid LC_CTYPE setting"); + + if (default_locale.provider == COLLPROVIDER_BUILTIN) + { + localeptr = default_locale.info.builtin.locale; + } + else if (default_locale.provider == COLLPROVIDER_ICU) + { + result = false; + return (bool) result; + } + else if (default_locale.provider == COLLPROVIDER_LIBC) + { + localeptr = setlocale(LC_CTYPE, NULL); + if (!localeptr) + elog(ERROR, "invalid LC_CTYPE setting"); + } + else + elog(ERROR, "unexpected collation provider '%c'", + default_locale.provider); if (strcmp(localeptr, "C") == 0) result = true; @@ -1390,6 +1429,7 @@ lc_ctype_is_c(Oid collation) result = true; else result = false; + return (bool) result; } @@ -1520,10 +1560,10 @@ pg_newlocale_from_collation(Oid collid) if (collid == DEFAULT_COLLATION_OID) { - if (default_locale.provider == COLLPROVIDER_ICU) - return &default_locale; - else + if (default_locale.provider == COLLPROVIDER_LIBC) return (pg_locale_t) 0; + else + return &default_locale; } cache_entry = 
lookup_collation_cache(collid, false); @@ -1548,7 +1588,18 @@ pg_newlocale_from_collation(Oid collid) result.provider = collform->collprovider; result.deterministic = collform->collisdeterministic; - if (collform->collprovider == COLLPROVIDER_LIBC) + if (collform->collprovider == COLLPROVIDER_BUILTIN) + { + const char *locstr; + + datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_colllocale); + locstr = TextDatumGetCString(datum); + + result.info.builtin.locale = MemoryContextStrdup(TopMemoryContext, + locstr); + result.info.builtin.cclass_posix = true; + } + else if (collform->collprovider == COLLPROVIDER_LIBC) { const char *collcollate; const char *collctype pg_attribute_unused(); @@ -1627,6 +1678,7 @@ pg_newlocale_from_collation(Oid collid) collversionstr = TextDatumGetCString(datum); + Assert(collform->collprovider != COLLPROVIDER_BUILTIN); datum = SysCacheGetAttrNotNull(COLLOID, tp, collform->collprovider == COLLPROVIDER_ICU ? Anum_pg_collation_colllocale : Anum_pg_collation_collcollate); actual_versionstr = get_collation_actual_version(collform->collprovider, @@ -1678,6 +1730,14 @@ get_collation_actual_version(char collprovider, const char *collcollate) { char *collversion = NULL; + /* + * The only two supported locales (C and C.UTF-8) are both based on memcmp + * and do not change. (The ctype behavior can change, but the versioning + * does not track that.) 
+ */ + if (collprovider == COLLPROVIDER_BUILTIN) + return NULL; + #ifdef USE_ICU if (collprovider == COLLPROVIDER_ICU) { @@ -2444,6 +2504,38 @@ pg_strnxfrm_prefix(char *dest, size_t destsize, const char *src, return result; } +const char * +builtin_validate_locale(int encoding, const char *locale) +{ + const char *canonical_name = NULL; + int required_encoding = -1; + + if (strcmp(locale, "C") == 0 || strcmp(locale, "POSIX") == 0) + { + canonical_name = "C"; + } + else if (strcmp(locale, "C.UTF-8") == 0 || strcmp(locale, "C.UTF8") == 0) + { + required_encoding = PG_UTF8; + canonical_name = "C.UTF-8"; + } + + if (!canonical_name) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("invalid locale name \"%s\" for builtin provider", + locale))); + + if (required_encoding >= 0 && encoding != required_encoding) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("encoding \"%s\" does not match locale \"%s\"", + pg_encoding_to_char(encoding), locale))); + + return canonical_name; +} + + #ifdef USE_ICU /* diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index 702bad6d8a..f6dc3afbfe 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -318,7 +318,7 @@ CheckMyDatabase(const char *name, bool am_superuser, bool override_allow_connect bool isnull; char *collate; char *ctype; - char *iculocale; + char *datlocale; /* Fetch our pg_database row normally, via syscache */ tup = SearchSysCache1(DATABASEOID, ObjectIdGetDatum(MyDatabaseId)); @@ -423,12 +423,21 @@ CheckMyDatabase(const char *name, bool am_superuser, bool override_allow_connect strcmp(ctype, "POSIX") == 0) database_ctype_is_c = true; - if (dbform->datlocprovider == COLLPROVIDER_ICU) + if (dbform->datlocprovider == COLLPROVIDER_BUILTIN) + { + datum = SysCacheGetAttrNotNull(DATABASEOID, tup, Anum_pg_database_datlocale); + datlocale = TextDatumGetCString(datum); + + default_locale.info.builtin.locale = MemoryContextStrdup( + 
TopMemoryContext, datlocale); + default_locale.info.builtin.cclass_posix = true; + } + else if (dbform->datlocprovider == COLLPROVIDER_ICU) { char *icurules; datum = SysCacheGetAttrNotNull(DATABASEOID, tup, Anum_pg_database_datlocale); - iculocale = TextDatumGetCString(datum); + datlocale = TextDatumGetCString(datum); datum = SysCacheGetAttr(DATABASEOID, tup, Anum_pg_database_daticurules, &isnull); if (!isnull) @@ -436,10 +445,10 @@ CheckMyDatabase(const char *name, bool am_superuser, bool override_allow_connect else icurules = NULL; - make_icu_collator(iculocale, icurules, &default_locale); + make_icu_collator(datlocale, icurules, &default_locale); } else - iculocale = NULL; + datlocale = NULL; default_locale.provider = dbform->datlocprovider; @@ -461,10 +470,16 @@ CheckMyDatabase(const char *name, bool am_superuser, bool override_allow_connect { char *actual_versionstr; char *collversionstr; + char *locale; collversionstr = TextDatumGetCString(datum); - actual_versionstr = get_collation_actual_version(dbform->datlocprovider, dbform->datlocprovider == COLLPROVIDER_ICU ? 
iculocale : collate); + if (dbform->datlocprovider == COLLPROVIDER_LIBC) + locale = collate; + else + locale = datlocale; + + actual_versionstr = get_collation_actual_version(dbform->datlocprovider, locale); if (!actual_versionstr) /* should not happen */ elog(WARNING, diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index eea40b930f..fab9b4c131 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -2450,7 +2450,7 @@ usage(const char *progname) " set default locale in the respective category for\n" " new databases (default taken from environment)\n")); printf(_(" --no-locale equivalent to --locale=C\n")); - printf(_(" --locale-provider={libc|icu}\n" + printf(_(" --locale-provider={builtin|libc|icu}\n" " set default locale provider for new databases\n")); printf(_(" --pwfile=FILE read password for the new superuser from file\n")); printf(_(" -T, --text-search-config=CFG\n" @@ -2601,7 +2601,15 @@ setup_locale_encoding(void) { setlocales(); - if (locale_provider == COLLPROVIDER_LIBC && + if (locale_provider == COLLPROVIDER_BUILTIN && + strcmp(lc_ctype, "C") == 0 && + strcmp(lc_collate, "C") == 0 && + strcmp(lc_time, "C") == 0 && + strcmp(lc_numeric, "C") == 0 && + strcmp(lc_monetary, "C") == 0 && + strcmp(lc_messages, "C") == 0) + printf(_("The database cluster will be initialized with no locale.\n")); + else if (locale_provider == COLLPROVIDER_LIBC && strcmp(lc_ctype, lc_collate) == 0 && strcmp(lc_ctype, lc_time) == 0 && strcmp(lc_ctype, lc_numeric) == 0 && @@ -3107,9 +3115,10 @@ main(int argc, char *argv[]) {"allow-group-access", no_argument, NULL, 'g'}, {"discard-caches", no_argument, NULL, 14}, {"locale-provider", required_argument, NULL, 15}, - {"icu-locale", required_argument, NULL, 16}, - {"icu-rules", required_argument, NULL, 17}, - {"sync-method", required_argument, NULL, 18}, + {"builtin-locale", required_argument, NULL, 16}, + {"icu-locale", required_argument, NULL, 17}, + {"icu-rules", required_argument, NULL, 18}, + 
{"sync-method", required_argument, NULL, 19}, {NULL, 0, NULL, 0} }; @@ -3277,7 +3286,9 @@ main(int argc, char *argv[]) "-c debug_discard_caches=1"); break; case 15: - if (strcmp(optarg, "icu") == 0) + if (strcmp(optarg, "builtin") == 0) + locale_provider = COLLPROVIDER_BUILTIN; + else if (strcmp(optarg, "icu") == 0) locale_provider = COLLPROVIDER_ICU; else if (strcmp(optarg, "libc") == 0) locale_provider = COLLPROVIDER_LIBC; @@ -3286,12 +3297,15 @@ main(int argc, char *argv[]) break; case 16: datlocale = pg_strdup(optarg); - icu_locale_specified = true; break; case 17: - icu_rules = pg_strdup(optarg); + datlocale = pg_strdup(optarg); + icu_locale_specified = true; break; case 18: + icu_rules = pg_strdup(optarg); + break; + case 19: if (!parse_sync_method(optarg, &sync_method)) exit(1); break; diff --git a/src/bin/initdb/t/001_initdb.pl b/src/bin/initdb/t/001_initdb.pl index 7606db1987..1701506abd 100644 --- a/src/bin/initdb/t/001_initdb.pl +++ b/src/bin/initdb/t/001_initdb.pl @@ -184,6 +184,61 @@ else 'locale provider ICU fails since no ICU support'); } +command_fails( + [ + 'initdb', '--no-sync', '--locale-provider=builtin', "$tempdir/data6" + ], + 'locale provider builtin fails without --locale' +); + +command_ok( + [ + 'initdb', '--no-sync', '--locale-provider=builtin', '--locale=C', + "$tempdir/data7" + ], + 'locale provider builtin with --locale' +); + +command_ok( + [ + 'initdb', '--no-sync', '--locale-provider=builtin', '-E UTF-8', + '--builtin-locale=C.UTF-8', "$tempdir/data8" + ], + 'locale provider builtin with -E UTF-8 --builtin-locale=C.UTF-8' +); + +command_fails( + [ + 'initdb', '--no-sync', '--locale-provider=builtin', '-E SQL_ASCII', + '--builtin-locale=C.UTF-8', "$tempdir/data9" + ], + 'locale provider builtin with --builtin-locale=C.UTF-8 fails for SQL_ASCII' +); + +command_ok( + [ + 'initdb', '--no-sync', '--locale-provider=builtin', '--lc-ctype=C', + '--locale=C', "$tempdir/data10" + ], + 'locale provider builtin with --lc-ctype' +); + 
+command_fails( + [ + 'initdb', '--no-sync', '--locale-provider=builtin', '--icu-locale=en', + "$tempdir/dataX" + ], + 'fails for locale provider builtin with ICU locale' +); + +command_fails( + [ + 'initdb', '--no-sync', '--locale-provider=builtin', '--icu-rules=""', + "$tempdir/dataX" + ], + 'fails for locale provider builtin with ICU rules' +); + command_fails( [ 'initdb', '--no-sync', '--locale-provider=xyz', "$tempdir/dataX" ], 'fails for invalid locale provider'); diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c index 3d829c99d7..b953db8bea 100644 --- a/src/bin/pg_dump/pg_dump.c +++ b/src/bin/pg_dump/pg_dump.c @@ -3091,7 +3091,9 @@ dumpDatabase(Archive *fout) } appendPQExpBufferStr(creaQry, " LOCALE_PROVIDER = "); - if (datlocprovider[0] == 'c') + if (datlocprovider[0] == 'b') + appendPQExpBufferStr(creaQry, "builtin"); + else if (datlocprovider[0] == 'c') appendPQExpBufferStr(creaQry, "libc"); else if (datlocprovider[0] == 'i') appendPQExpBufferStr(creaQry, "icu"); @@ -13672,7 +13674,9 @@ dumpCollation(Archive *fout, const CollInfo *collinfo) fmtQualifiedDumpable(collinfo)); appendPQExpBufferStr(q, "provider = "); - if (collprovider[0] == 'c') + if (collprovider[0] == 'b') + appendPQExpBufferStr(q, "builtin"); + else if (collprovider[0] == 'c') appendPQExpBufferStr(q, "libc"); else if (collprovider[0] == 'i') appendPQExpBufferStr(q, "icu"); @@ -13693,6 +13697,13 @@ dumpCollation(Archive *fout, const CollInfo *collinfo) /* no locale -- the default collation cannot be reloaded anyway */ } + else if (collprovider[0] == 'b') + { + if (collcollate || collctype || !colllocale || collicurules) + pg_log_warning("invalid collation \"%s\"", qcollname); + appendPQExpBufferStr(q, ", locale = "); + appendStringLiteralAH(q, colllocale ? colllocale : "", fout); + } else if (collprovider[0] == 'i') { if (fout->remoteVersion >= 150000) diff --git a/src/bin/pg_upgrade/t/002_pg_upgrade.pl b/src/bin/pg_upgrade/t/002_pg_upgrade.pl index 6f359d72ce..37a4e9d334 100644 --- a/src/bin/pg_upgrade/t/002_pg_upgrade.pl +++ 
b/src/bin/pg_upgrade/t/002_pg_upgrade.pl @@ -108,35 +108,71 @@ if ($oldnode->pg_version >= 11) # can test that pg_upgrade copies the locale settings of template0 # from the old to the new cluster. -my $original_encoding = "6"; # UTF-8 -my $original_provider = "c"; -my $original_locale = "C"; -my $original_iculocale = ""; -my $provider_field = "'c' AS datlocprovider"; -my $iculocale_field = "NULL AS datlocale"; -if (int($oldnode->pg_version) >= 15 && $ENV{with_icu} eq 'yes') +my %encoding_number = ('UTF-8' => 6, 'SQL_ASCII' => 0); +my $provider_field; +my $datlocale_field; +my $original_encoding; +my $original_provider; +my $original_datcollate = "C"; +my $original_datctype = "C"; +my $original_datlocale; + +if (int($oldnode->pg_version) >= 15) { $provider_field = "datlocprovider"; if (int($oldnode->pg_version) >= 17) { - $iculocale_field = "datlocale"; + $datlocale_field = "datlocale"; } else { - $iculocale_field = "daticulocale AS datlocale"; + $datlocale_field = "daticulocale AS datlocale"; } +} +else +{ + $provider_field = "'c' AS datlocprovider"; + $datlocale_field = "NULL AS datlocale"; +} + +if (int($oldnode->pg_version) >= 17) +{ + $original_encoding = "UTF-8"; + $original_provider = "b"; + $original_datlocale = "C.UTF-8"; +} +elsif (int($oldnode->pg_version) >= 15 && $ENV{with_icu} eq 'yes') +{ + $original_encoding = "UTF-8"; $original_provider = "i"; - $original_iculocale = "fr-CA"; + $original_datlocale = "fr-CA"; +} +else +{ + $original_encoding = "SQL_ASCII"; + $original_provider = "c"; + $original_datlocale = ""; } my @initdb_params = @custom_opts; -push @initdb_params, ('--encoding', 'UTF-8'); -push @initdb_params, ('--locale', $original_locale); -if ($original_provider eq "i") +push @initdb_params, ('--encoding', $original_encoding); +push @initdb_params, ('--lc-collate', $original_datcollate); +push @initdb_params, ('--lc-ctype', $original_datctype); + +# add --locale-provider, if supported +my %provider_name = ('b' => 'builtin', 'i' => 
'icu', 'c' => 'libc'); +if ($oldnode->pg_version >= 15) { - push @initdb_params, ('--locale-provider', 'icu'); - push @initdb_params, ('--icu-locale', 'fr-CA'); + push @initdb_params, ('--locale-provider', $provider_name{$original_provider}); + if ($original_provider eq 'b') + { + push @initdb_params, ('--builtin-locale', $original_datlocale); + } + elsif ($original_provider eq 'i') + { + push @initdb_params, ('--icu-locale', $original_datlocale); + } } $node_params{extra} = \@initdb_params; @@ -146,10 +182,10 @@ $oldnode->start; my $result; $result = $oldnode->safe_psql( 'postgres', - "SELECT encoding, $provider_field, datcollate, datctype, $iculocale_field + "SELECT encoding, $provider_field, datcollate, datctype, $datlocale_field FROM pg_database WHERE datname='template0'"); is( $result, - "$original_encoding|$original_provider|$original_locale|$original_locale|$original_iculocale", + "$encoding_number{$original_encoding}|$original_provider|$original_datcollate|$original_datctype|$original_datlocale", "check locales in original cluster"); # The default location of the source code is the root of this directory. @@ -429,10 +465,10 @@ if (-d $log_path) # Test that upgraded cluster has original locale settings. $result = $newnode->safe_psql( 'postgres', - "SELECT encoding, $provider_field, datcollate, datctype, $iculocale_field + "SELECT encoding, $provider_field, datcollate, datctype, $datlocale_field FROM pg_database WHERE datname='template0'"); is( $result, - "$original_encoding|$original_provider|$original_locale|$original_locale|$original_iculocale", + "$encoding_number{$original_encoding}|$original_provider|$original_datcollate|$original_datctype|$original_datlocale", "check that locales in new cluster match original cluster"); # Second dump from the upgraded instance. 
diff --git a/src/bin/psql/describe.c b/src/bin/psql/describe.c index 1122843715..3b327d159a 100644 --- a/src/bin/psql/describe.c +++ b/src/bin/psql/describe.c @@ -926,7 +926,7 @@ listAllDbs(const char *pattern, bool verbose) gettext_noop("Encoding")); if (pset.sversion >= 150000) appendPQExpBuffer(&buf, - " CASE d.datlocprovider WHEN 'c' THEN 'libc' WHEN 'i' THEN 'icu' END AS \"%s\",\n", + " CASE d.datlocprovider WHEN 'b' THEN 'builtin' WHEN 'c' THEN 'libc' WHEN 'i' THEN 'icu' END AS \"%s\",\n", gettext_noop("Locale Provider")); else appendPQExpBuffer(&buf, @@ -4966,7 +4966,7 @@ listCollations(const char *pattern, bool verbose, bool showSystem) if (pset.sversion >= 100000) appendPQExpBuffer(&buf, - " CASE c.collprovider WHEN 'd' THEN 'default' WHEN 'c' THEN 'libc' WHEN 'i' THEN 'icu' END AS \"%s\",\n", + " CASE c.collprovider WHEN 'd' THEN 'default' WHEN 'b' THEN 'builtin' WHEN 'c' THEN 'libc' WHEN 'i' THEN 'icu' END AS \"%s\",\n", gettext_noop("Provider")); else appendPQExpBuffer(&buf, diff --git a/src/bin/scripts/createdb.c b/src/bin/scripts/createdb.c index 9ca86a3e53..78d729106a 100644 --- a/src/bin/scripts/createdb.c +++ b/src/bin/scripts/createdb.c @@ -40,8 +40,9 @@ main(int argc, char *argv[]) {"locale", required_argument, NULL, 'l'}, {"maintenance-db", required_argument, NULL, 3}, {"locale-provider", required_argument, NULL, 4}, - {"icu-locale", required_argument, NULL, 5}, - {"icu-rules", required_argument, NULL, 6}, + {"builtin-locale", required_argument, NULL, 5}, + {"icu-locale", required_argument, NULL, 6}, + {"icu-rules", required_argument, NULL, 7}, {NULL, 0, NULL, 0} }; @@ -67,6 +68,7 @@ main(int argc, char *argv[]) char *lc_ctype = NULL; char *locale = NULL; char *locale_provider = NULL; + char *builtin_locale = NULL; char *icu_locale = NULL; char *icu_rules = NULL; @@ -134,9 +136,12 @@ main(int argc, char *argv[]) locale_provider = pg_strdup(optarg); break; case 5: - icu_locale = pg_strdup(optarg); + builtin_locale = pg_strdup(optarg); break; case 
6: + icu_locale = pg_strdup(optarg); + break; + case 7: icu_rules = pg_strdup(optarg); break; default: @@ -216,6 +221,11 @@ main(int argc, char *argv[]) appendPQExpBufferStr(&sql, " LOCALE "); appendStringLiteralConn(&sql, locale, conn); } + if (builtin_locale) + { + appendPQExpBufferStr(&sql, " BUILTIN_LOCALE "); + appendStringLiteralConn(&sql, builtin_locale, conn); + } if (lc_collate) { appendPQExpBufferStr(&sql, " LC_COLLATE "); @@ -296,7 +306,7 @@ help(const char *progname) printf(_(" --lc-ctype=LOCALE LC_CTYPE setting for the database\n")); printf(_(" --icu-locale=LOCALE ICU locale setting for the database\n")); printf(_(" --icu-rules=RULES ICU rules setting for the database\n")); - printf(_(" --locale-provider={libc|icu}\n" + printf(_(" --locale-provider={builtin|libc|icu}\n" " locale provider for the database's default collation\n")); printf(_(" -O, --owner=OWNER database user to own the new database\n")); printf(_(" -S, --strategy=STRATEGY database creation strategy wal_log or file_copy\n")); diff --git a/src/bin/scripts/t/020_createdb.pl b/src/bin/scripts/t/020_createdb.pl index 40291924e5..feee9cf85d 100644 --- a/src/bin/scripts/t/020_createdb.pl +++ b/src/bin/scripts/t/020_createdb.pl @@ -105,6 +105,78 @@ else 'create database with ICU fails since no ICU support'); } +$node->command_fails( + [ + 'createdb', '-T', 'template0', '--locale-provider=builtin', + 'tbuiltin1' + ], + 'create database with provider "builtin" fails without --locale' +); + +$node->command_ok( + [ + 'createdb', '-T', 'template0', '--locale-provider=builtin', + '--locale=C', 'tbuiltin2' + ], + 'create database with provider "builtin" and locale "C"' +); + +$node->command_ok( + [ + 'createdb', '-T', 'template0', '--locale-provider=builtin', + '--locale=C', '--lc-collate=C', 'tbuiltin3' + ], + 'create database with provider "builtin" and LC_COLLATE=C' +); + +$node->command_ok( + [ + 'createdb', '-T', 'template0', '--locale-provider=builtin', + '--locale=C', '--lc-ctype=C', 'tbuiltin4' 
+ ], + 'create database with provider "builtin" and LC_CTYPE=C' +); + +$node->command_ok( + [ + 'createdb', '-T', 'template0', '--locale-provider=builtin', + '-E UTF-8', '--builtin-locale=C.UTF8', 'tbuiltin5' + ], + 'create database with provider "builtin" and --builtin-locale C.UTF-8' +); + +$node->command_fails( + [ + 'createdb', '-T', 'template0', '--locale-provider=builtin', + '-E LATIN1', '--builtin-locale=C.UTF-8', 'tbuiltin6' + ], + 'create database with provider "builtin" and --builtin-locale C.UTF-8' +); + +$node->command_fails( + [ + 'createdb', '-T', 'template0', '--locale-provider=builtin', + '--locale=C', '--icu-locale=en', 'tbuiltin7' + ], + 'create database with provider "builtin" and ICU_LOCALE="en"' +); + +$node->command_fails( + [ + 'createdb', '-T', 'template0', '--locale-provider=builtin', + '--locale=C', '--icu-rules=""', 'tbuiltin8' + ], + 'create database with provider "builtin" and ICU_RULES=""' +); + +$node->command_fails( + [ + 'createdb', '-T', 'template1', '--locale-provider=builtin', + '--locale=C', 'tbuiltin9' + ], + 'create database with provider "builtin" not matching template' +); + $node->command_fails([ 'createdb', 'foobar1' ], 'fails if database already exists'); diff --git a/src/include/catalog/pg_collation.dat b/src/include/catalog/pg_collation.dat index 2c112cd6bc..fa071ce68a 100644 --- a/src/include/catalog/pg_collation.dat +++ b/src/include/catalog/pg_collation.dat @@ -23,12 +23,15 @@ descr => 'standard POSIX collation', collname => 'POSIX', collprovider => 'c', collencoding => '-1', collcollate => 'POSIX', collctype => 'POSIX' }, -{ oid => '962', descr => 'sorts by Unicode code point', - collname => 'ucs_basic', collprovider => 'c', collencoding => '6', - collcollate => 'C', collctype => 'C' }, +{ oid => '962', descr => 'sorts by Unicode code point, C character semantics', + collname => 'ucs_basic', collprovider => 'b', collencoding => '6', + colllocale => 'C' }, { oid => '963', descr => 'sorts using the Unicode Collation 
Algorithm with default settings', collname => 'unicode', collprovider => 'i', collencoding => '-1', colllocale => 'und' }, +{ oid => '970', descr => 'sorts by Unicode code point; Unicode & POSIX character semantics', + collname => 'c_utf8', collprovider => 'b', collencoding => '6', + colllocale => 'C.UTF8' }, ] diff --git a/src/include/catalog/pg_collation.h b/src/include/catalog/pg_collation.h index d357c89ae6..09fc991038 100644 --- a/src/include/catalog/pg_collation.h +++ b/src/include/catalog/pg_collation.h @@ -65,6 +65,7 @@ DECLARE_UNIQUE_INDEX_PKEY(pg_collation_oid_index, 3085, CollationOidIndexId, pg_ #ifdef EXPOSE_TO_CLIENT_CODE #define COLLPROVIDER_DEFAULT 'd' +#define COLLPROVIDER_BUILTIN 'b' #define COLLPROVIDER_ICU 'i' #define COLLPROVIDER_LIBC 'c' @@ -73,6 +74,8 @@ collprovider_name(char c) { switch (c) { + case COLLPROVIDER_BUILTIN: + return "builtin"; case COLLPROVIDER_ICU: return "icu"; case COLLPROVIDER_LIBC: diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h index 6447bea8e0..22dc38f42e 100644 --- a/src/include/utils/pg_locale.h +++ b/src/include/utils/pg_locale.h @@ -76,6 +76,11 @@ struct pg_locale_struct bool deterministic; union { + struct + { + const char *locale; + bool cclass_posix; + } builtin; locale_t lt; #ifdef USE_ICU struct @@ -112,7 +117,7 @@ extern size_t pg_strxfrm_prefix(char *dest, const char *src, size_t destsize, pg_locale_t locale); extern size_t pg_strnxfrm_prefix(char *dest, size_t destsize, const char *src, size_t srclen, pg_locale_t locale); - +extern const char *builtin_validate_locale(int encoding, const char *loc_str); extern void icu_validate_locale(const char *loc_str); extern char *icu_language_tag(const char *loc_str, int elevel); diff --git a/src/test/icu/t/010_database.pl b/src/test/icu/t/010_database.pl index 67fc3bbf19..af34cfc1dd 100644 --- a/src/test/icu/t/010_database.pl +++ b/src/test/icu/t/010_database.pl @@ -63,14 +63,14 @@ is( $node1->psql( 0, "C locale works for ICU"); -# Test that 
LOCALE works for ICU locales if LC_COLLATE and LC_CTYPE -# are specified -is( $node1->psql( - 'postgres', - q{CREATE DATABASE dbicu2 LOCALE_PROVIDER icu LOCALE '@colStrength=primary' - LC_COLLATE='C' LC_CTYPE='C' TEMPLATE template0 ENCODING UTF8} - ), - 0, - "LOCALE works for ICU locales if LC_COLLATE and LC_CTYPE are specified"); +my ($ret, $stdout, $stderr) = $node1->psql('postgres', + q{CREATE DATABASE dbicu LOCALE_PROVIDER builtin LOCALE 'C' TEMPLATE dbicu} +); +isnt($ret, 0, + "locale provider must match template: exit code not 0"); +like( + $stderr, + qr/ERROR: new locale provider \(builtin\) does not match locale provider of the template database \(icu\)/, + "locale provider must match template: error message"); done_testing(); diff --git a/src/test/regress/expected/collate.out b/src/test/regress/expected/collate.out index 0649564485..ece4a8e99d 100644 --- a/src/test/regress/expected/collate.out +++ b/src/test/regress/expected/collate.out @@ -650,6 +650,26 @@ EXPLAIN (COSTS OFF) (3 rows) -- CREATE/DROP COLLATION +CREATE COLLATION builtin_c ( PROVIDER = builtin, LOCALE = "C" ); +CREATE COLLATION builtin_posix ( PROVIDER = builtin, LOCALE = "POSIX" ); +SELECT b FROM collate_test1 ORDER BY b COLLATE builtin_c; + b +----- + ABD + Abc + abc + bbc +(4 rows) + +CREATE COLLATION builtin2 ( PROVIDER = builtin ); -- fails +ERROR: parameter "locale" must be specified +CREATE COLLATION builtin2 ( PROVIDER = builtin, LOCALE = "en_US" ); -- fails +ERROR: invalid locale name "en_US" for builtin provider +CREATE COLLATION builtin2 ( PROVIDER = builtin, LC_CTYPE = "C", LC_COLLATE = "C" ); -- fails +ERROR: parameter "locale" must be specified +CREATE COLLATION builtin2 ( PROVIDER = builtin, LOCALE = "POSIX", LC_CTYPE = "POSIX" ); -- fails +ERROR: conflicting or redundant options +DETAIL: LOCALE cannot be specified together with LC_COLLATE or LC_CTYPE. 
CREATE COLLATION mycoll1 FROM "C"; CREATE COLLATION mycoll2 ( LC_COLLATE = "POSIX", LC_CTYPE = "POSIX" ); CREATE COLLATION mycoll3 FROM "default"; -- intentionally unsupported @@ -754,7 +774,7 @@ DETAIL: FROM cannot be specified together with any other options. -- must get rid of them. -- DROP SCHEMA collate_tests CASCADE; -NOTICE: drop cascades to 19 other objects +NOTICE: drop cascades to 21 other objects DETAIL: drop cascades to table collate_test1 drop cascades to table collate_test_like drop cascades to table collate_test2 @@ -771,6 +791,8 @@ drop cascades to function dup(anyelement) drop cascades to table collate_test20 drop cascades to table collate_test21 drop cascades to table collate_test22 +drop cascades to collation builtin_c +drop cascades to collation builtin_posix drop cascades to collation mycoll2 drop cascades to table collate_test23 drop cascades to view collate_on_int diff --git a/src/test/regress/expected/collate.utf8.out b/src/test/regress/expected/collate.utf8.out new file mode 100644 index 0000000000..f4a9d26978 --- /dev/null +++ b/src/test/regress/expected/collate.utf8.out @@ -0,0 +1,97 @@ +/* + * This test is for collations and character operations when using the + * builtin provider with the C.UTF-8 locale. + */ +/* skip test if not UTF8 server encoding */ +SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset +\if :skip_test +\quit +\endif +SET client_encoding TO UTF8; +-- +-- Test preinstalled C_UTF8 collation. 
+-- +CREATE TABLE builtin_test ( + t TEXT COLLATE C_UTF8 +); +INSERT INTO builtin_test VALUES + ('abc DEF'), + ('ábc DÉF'), + ('DŽxxDŽ džxxDž Džxxdž'); +SELECT t, lower(t), initcap(t), upper(t) FROM builtin_test; + t | lower | initcap | upper +----------------+----------------+----------------+---------------- + abc DEF | abc def | Abc Def | ABC DEF + ábc DÉF | ábc déf | Ábc Déf | ÁBC DÉF + DŽxxDŽ džxxDž Džxxdž | džxxdž džxxdž džxxdž | Džxxdž Džxxdž Džxxdž | DŽXXDŽ DŽXXDŽ DŽXXDŽ +(3 rows) + +DROP TABLE builtin_test; +-- character classes +SELECT 'xyz' ~ '[[:alnum:]]' COLLATE C_UTF8; + ?column? +---------- + t +(1 row) + +SELECT 'xyz' !~ '[[:upper:]]' COLLATE C_UTF8; + ?column? +---------- + t +(1 row) + +SELECT '@' !~ '[[:alnum:]]' COLLATE C_UTF8; + ?column? +---------- + t +(1 row) + +SELECT '@' ~ '[[:punct:]]' COLLATE C_UTF8; -- symbols are punctuation in posix + ?column? +---------- + t +(1 row) + +SELECT 'a8a' ~ '[[:digit:]]' COLLATE C_UTF8; + ?column? +---------- + t +(1 row) + +SELECT '൧' !~ '\d' COLLATE C_UTF8; -- only 0-9 considered digits in posix + ?column? +---------- + t +(1 row) + +-- case mapping +SELECT 'xYz' ~* 'XyZ' COLLATE C_UTF8; + ?column? +---------- + t +(1 row) + +SELECT 'xAb' ~* '[W-Y]' COLLATE C_UTF8; + ?column? +---------- + t +(1 row) + +SELECT 'xAb' !~* '[c-d]' COLLATE C_UTF8; + ?column? +---------- + t +(1 row) + +SELECT 'Δ' ~* '[α-λ]' COLLATE C_UTF8; + ?column? +---------- + t +(1 row) + +SELECT 'δ' ~* '[Γ-Λ]' COLLATE C_UTF8; -- same as above with cases reversed + ?column? +---------- + t +(1 row) + diff --git a/src/test/regress/expected/collate.utf8_1.out b/src/test/regress/expected/collate.utf8_1.out new file mode 100644 index 0000000000..e73fdf50c3 --- /dev/null +++ b/src/test/regress/expected/collate.utf8_1.out @@ -0,0 +1,8 @@ +/* + * This test is for collations and character operations when using the + * builtin provider with the C.UTF-8 locale. 
+ */ +/* skip test if not UTF8 server encoding */ +SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset +\if :skip_test +\quit diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index f0987ff537..292bc54932 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -78,9 +78,9 @@ test: brin_bloom brin_multi # psql depends on create_am # amutils depends on geometry, create_index_spgist, hash_index, brin # ---------- -test: create_table_like alter_generic alter_operator misc async dbsize merge misc_functions sysviews tsrf tid tidscan tidrangescan collate.icu.utf8 incremental_sort create_role +test: create_table_like alter_generic alter_operator misc async dbsize merge misc_functions sysviews tsrf tid tidscan tidrangescan collate.utf8 collate.icu.utf8 incremental_sort create_role -# collate.*.utf8 tests cannot be run in parallel with each other +# collate.linux.utf8 and collate.icu.utf8 tests cannot be run in parallel with each other test: rules psql psql_crosstab amutils stats_ext collate.linux.utf8 collate.windows.win1252 # ---------- diff --git a/src/test/regress/sql/collate.sql b/src/test/regress/sql/collate.sql index c3d40fc195..01d5c69fe4 100644 --- a/src/test/regress/sql/collate.sql +++ b/src/test/regress/sql/collate.sql @@ -244,6 +244,16 @@ EXPLAIN (COSTS OFF) -- CREATE/DROP COLLATION +CREATE COLLATION builtin_c ( PROVIDER = builtin, LOCALE = "C" ); +CREATE COLLATION builtin_posix ( PROVIDER = builtin, LOCALE = "POSIX" ); + +SELECT b FROM collate_test1 ORDER BY b COLLATE builtin_c; + +CREATE COLLATION builtin2 ( PROVIDER = builtin ); -- fails +CREATE COLLATION builtin2 ( PROVIDER = builtin, LOCALE = "en_US" ); -- fails +CREATE COLLATION builtin2 ( PROVIDER = builtin, LC_CTYPE = "C", LC_COLLATE = "C" ); -- fails +CREATE COLLATION builtin2 ( PROVIDER = builtin, LOCALE = "POSIX", LC_CTYPE = "POSIX" ); -- fails + CREATE COLLATION mycoll1 FROM "C"; CREATE COLLATION mycoll2 ( LC_COLLATE = 
"POSIX", LC_CTYPE = "POSIX" ); CREATE COLLATION mycoll3 FROM "default"; -- intentionally unsupported diff --git a/src/test/regress/sql/collate.utf8.sql b/src/test/regress/sql/collate.utf8.sql new file mode 100644 index 0000000000..0f30c5704d --- /dev/null +++ b/src/test/regress/sql/collate.utf8.sql @@ -0,0 +1,45 @@ +/* + * This test is for collations and character operations when using the + * builtin provider with the C.UTF-8 locale. + */ + +/* skip test if not UTF8 server encoding */ +SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset +\if :skip_test +\quit +\endif + +SET client_encoding TO UTF8; + +-- +-- Test preinstalled C_UTF8 collation. +-- + +CREATE TABLE builtin_test ( + t TEXT COLLATE C_UTF8 +); +INSERT INTO builtin_test VALUES + ('abc DEF'), + ('ábc DÉF'), + ('DŽxxDŽ džxxDž Džxxdž'); + +SELECT t, lower(t), initcap(t), upper(t) FROM builtin_test; + +DROP TABLE builtin_test; + +-- character classes + +SELECT 'xyz' ~ '[[:alnum:]]' COLLATE C_UTF8; +SELECT 'xyz' !~ '[[:upper:]]' COLLATE C_UTF8; +SELECT '@' !~ '[[:alnum:]]' COLLATE C_UTF8; +SELECT '@' ~ '[[:punct:]]' COLLATE C_UTF8; -- symbols are punctuation in posix +SELECT 'a8a' ~ '[[:digit:]]' COLLATE C_UTF8; +SELECT '൧' !~ '\d' COLLATE C_UTF8; -- only 0-9 considered digits in posix + +-- case mapping + +SELECT 'xYz' ~* 'XyZ' COLLATE C_UTF8; +SELECT 'xAb' ~* '[W-Y]' COLLATE C_UTF8; +SELECT 'xAb' !~* '[c-d]' COLLATE C_UTF8; +SELECT 'Δ' ~* '[α-λ]' COLLATE C_UTF8; +SELECT 'δ' ~* '[Γ-Λ]' COLLATE C_UTF8; -- same as above with cases reversed -- 2.34.1