From fb66a4f669ccb0b6ea3c89723f733c3f9c294f6d Mon Sep 17 00:00:00 2001 From: Jeff Davis Date: Wed, 6 Mar 2024 18:15:33 -0800 Subject: [PATCH v24 5/5] Support PG_UNICODE_FAST locale in the builtin collation provider. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ------ CATVERSION ------ The PG_UNICODE_FAST locale uses code point sort order (fast, memcmp-based) combined with Unicode character semantics. The character semantics are based on Unicode full case mapping. Full case mapping can map a single code point to multiple code points, such as "ß" uppercasing to "SS". Additionally, full case mapping handles context-sensitive mappings like the "final sigma", and it uses titlecase mappings such as "Dž" when titlecasing (rather than plain uppercase mappings). Importantly, the uppercasing of "ß" as "SS" is specifically mentioned by the SQL standard. In Postgres, UCS_BASIC uses plain ASCII semantics for case mapping and pattern matching, so if we changed it to use the PG_UNICODE_FAST locale, it would offer better compliance with the standard. For now, though, this patch does not change the behavior of UCS_BASIC. 
Discussion: https://postgr.es/m/27bb0e52-801d-4f73-a0a4-02cfdd4a9ada@eisentraut.org --- doc/src/sgml/charset.sgml | 17 +++ src/backend/regex/regc_pg_locale.c | 11 +- src/backend/utils/adt/formatting.c | 22 ++- src/backend/utils/adt/pg_locale.c | 6 + src/backend/utils/init/postinit.c | 1 + src/bin/initdb/initdb.c | 6 +- src/include/catalog/pg_collation.dat | 3 + src/include/utils/pg_locale.h | 1 + src/test/regress/expected/collate.utf8.out | 160 +++++++++++++++++++++ src/test/regress/sql/collate.utf8.sql | 60 ++++++++ 10 files changed, 276 insertions(+), 11 deletions(-) diff --git a/doc/src/sgml/charset.sgml b/doc/src/sgml/charset.sgml index 55bbb20dac..fc520138a6 100644 --- a/doc/src/sgml/charset.sgml +++ b/doc/src/sgml/charset.sgml @@ -886,6 +886,23 @@ SELECT * FROM test1 ORDER BY a || b COLLATE "fr_FR"; + + pg_unicode_fast + + + This collation sorts by Unicode code point values rather than natural + language order. For the functions lower, + initcap, and upper it uses + Unicode full case mapping. For pattern matching (including regular + expressions), it uses the Standard variant of Unicode Compatibility + Properties. Behavior is efficient and stable within a + Postgres major version. It is only + available for encoding UTF8. 
+ + + + pg_c_utf8 diff --git a/src/backend/regex/regc_pg_locale.c b/src/backend/regex/regc_pg_locale.c index 85f3238eb0..24c5200cef 100644 --- a/src/backend/regex/regc_pg_locale.c +++ b/src/backend/regex/regc_pg_locale.c @@ -78,6 +78,8 @@ static PG_Locale_Strategy pg_regex_strategy; static pg_locale_t pg_regex_locale; static Oid pg_regex_collation; +static bool regex_builtin_cclass_posix = false; + /* * Hard-wired character properties for C locale */ @@ -271,7 +273,10 @@ pg_set_regex_collation(Oid collation) if (pg_regex_locale) { if (pg_regex_locale->provider == COLLPROVIDER_BUILTIN) + { pg_regex_strategy = PG_REGEX_BUILTIN; + regex_builtin_cclass_posix = !pg_regex_locale->info.builtin.casemap_full; + } else pg_regex_strategy = PG_REGEX_LOCALE_WIDE_L; } @@ -299,7 +304,7 @@ pg_wc_isdigit(pg_wchar c) return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISDIGIT)); case PG_REGEX_BUILTIN: - return pg_u_isdigit(c, true); + return pg_u_isdigit(c, regex_builtin_cclass_posix); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswdigit((wint_t) c); @@ -367,7 +372,7 @@ pg_wc_isalnum(pg_wchar c) return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISALNUM)); case PG_REGEX_BUILTIN: - return pg_u_isalnum(c, true); + return pg_u_isalnum(c, regex_builtin_cclass_posix); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswalnum((wint_t) c); @@ -546,7 +551,7 @@ pg_wc_ispunct(pg_wchar c) return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISPUNCT)); case PG_REGEX_BUILTIN: - return pg_u_ispunct(c, true); + return pg_u_ispunct(c, regex_builtin_cclass_posix); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswpunct((wint_t) c); diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c index 97ccf23583..47601e75ba 100644 --- a/src/backend/utils/adt/formatting.c +++ b/src/backend/utils/adt/formatting.c @@ -1695,13 +1695,15 @@ 
str_tolower(const char *buff, size_t nbytes, Oid collid) dstsize = srclen + 1; result = palloc(dstsize); - needed = unicode_strlower(dst, dstsize, src, srclen, false); + needed = unicode_strlower(dst, dstsize, src, srclen, + mylocale->info.builtin.casemap_full); if (needed + 1 > dstsize) { /* grow buffer if needed and retry */ dstsize = needed + 1; dst = repalloc(dst, dstsize); - needed = unicode_strlower(dst, dstsize, src, srclen, false); + needed = unicode_strlower(dst, dstsize, src, srclen, + mylocale->info.builtin.casemap_full); Assert(needed + 1 == dstsize); } @@ -1842,13 +1844,15 @@ str_toupper(const char *buff, size_t nbytes, Oid collid) dstsize = srclen + 1; result = palloc(dstsize); - needed = unicode_strupper(dst, dstsize, src, srclen, false); + needed = unicode_strupper(dst, dstsize, src, srclen, + mylocale->info.builtin.casemap_full); if (needed + 1 > dstsize) { /* grow buffer if needed and retry */ dstsize = needed + 1; dst = repalloc(dst, dstsize); - needed = unicode_strupper(dst, dstsize, src, srclen, false); + needed = unicode_strupper(dst, dstsize, src, srclen, + mylocale->info.builtin.casemap_full); Assert(needed + 1 == dstsize); } @@ -1927,6 +1931,7 @@ struct WordBoundaryState size_t offset; bool init; bool prev_alnum; + bool posix; }; /* @@ -1943,7 +1948,7 @@ initcap_wbnext(void *state) { pg_wchar u = utf8_to_unicode((unsigned char *) wbstate->str + wbstate->offset); - bool curr_alnum = pg_u_isalnum(u, true); + bool curr_alnum = pg_u_isalnum(u, wbstate->posix); if (!wbstate->init || curr_alnum != wbstate->prev_alnum) { @@ -2030,6 +2035,7 @@ str_initcap(const char *buff, size_t nbytes, Oid collid) .offset = 0, .init = false, .prev_alnum = false, + .posix = !mylocale->info.builtin.casemap_full, }; Assert(GetDatabaseEncoding() == PG_UTF8); @@ -2038,7 +2044,8 @@ str_initcap(const char *buff, size_t nbytes, Oid collid) dstsize = srclen + 1; result = palloc(dstsize); - needed = unicode_strtitle(dst, dstsize, src, srclen, false, + needed = 
unicode_strtitle(dst, dstsize, src, srclen, + mylocale->info.builtin.casemap_full, initcap_wbnext, &wbstate); if (needed + 1 > dstsize) { @@ -2049,7 +2056,8 @@ str_initcap(const char *buff, size_t nbytes, Oid collid) /* grow buffer if needed and retry */ dstsize = needed + 1; dst = repalloc(dst, dstsize); - needed = unicode_strtitle(dst, dstsize, src, srclen, false, + needed = unicode_strtitle(dst, dstsize, src, srclen, + mylocale->info.builtin.casemap_full, initcap_wbnext, &wbstate); Assert(needed + 1 == dstsize); } diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c index a5aeabce94..ea88325f1c 100644 --- a/src/backend/utils/adt/pg_locale.c +++ b/src/backend/utils/adt/pg_locale.c @@ -1596,6 +1596,7 @@ pg_newlocale_from_collation(Oid collid) result.info.builtin.locale = MemoryContextStrdup(TopMemoryContext, locstr); + result.info.builtin.casemap_full = (strcmp(locstr, "PG_UNICODE_FAST") == 0); } else if (collform->collprovider == COLLPROVIDER_LIBC) { @@ -2520,6 +2521,11 @@ builtin_validate_locale(int encoding, const char *locale) required_encoding = PG_UTF8; canonical_name = "C.UTF-8"; } + else if (strcmp(locale, "PG_UNICODE_FAST") == 0) + { + required_encoding = PG_UTF8; + canonical_name = "PG_UNICODE_FAST"; + } if (!canonical_name) ereport(ERROR, diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index 0805398e24..9a357e3f90 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -432,6 +432,7 @@ CheckMyDatabase(const char *name, bool am_superuser, bool override_allow_connect default_locale.info.builtin.locale = MemoryContextStrdup( TopMemoryContext, datlocale); + default_locale.info.builtin.casemap_full = (strcmp(datlocale, "PG_UNICODE_FAST") == 0); } else if (dbform->datlocprovider == COLLPROVIDER_ICU) { diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index ac33508d32..804cef35cb 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ 
-2408,6 +2408,8 @@ setlocales(void) else if (strcmp(datlocale, "C.UTF-8") == 0 || strcmp(datlocale, "C.UTF8") == 0) canonname = "C.UTF-8"; + else if (strcmp(datlocale, "PG_UNICODE_FAST") == 0) + canonname = "PG_UNICODE_FAST"; else pg_fatal("invalid locale name \"%s\" for builtin provider", datlocale); @@ -2703,7 +2705,9 @@ setup_locale_encoding(void) if (locale_provider == COLLPROVIDER_BUILTIN) { - if (strcmp(datlocale, "C.UTF-8") == 0 && encodingid != PG_UTF8) + if ((strcmp(datlocale, "C.UTF-8") == 0 || + strcmp(datlocale, "PG_UNICODE_FAST") == 0) && + encodingid != PG_UTF8) pg_fatal("builtin provider locale \"%s\" requires encoding \"%s\"", datlocale, "UTF-8"); } diff --git a/src/include/catalog/pg_collation.dat b/src/include/catalog/pg_collation.dat index b95a89491d..a187b3dcef 100644 --- a/src/include/catalog/pg_collation.dat +++ b/src/include/catalog/pg_collation.dat @@ -33,5 +33,8 @@ { oid => '811', descr => 'sorts by Unicode code point; Unicode & POSIX character semantics', collname => 'pg_c_utf8', collprovider => 'b', collencoding => '6', colllocale => 'C.UTF-8' }, +{ oid => '812', descr => 'sorts by Unicode code point; Unicode character semantics', + collname => 'pg_unicode_fast', collprovider => 'b', collencoding => '6', + colllocale => 'PG_UNICODE_FAST' }, ] diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h index 3d949d5112..b924fdf653 100644 --- a/src/include/utils/pg_locale.h +++ b/src/include/utils/pg_locale.h @@ -79,6 +79,7 @@ struct pg_locale_struct struct { const char *locale; + bool casemap_full; } builtin; locale_t lt; #ifdef USE_ICU diff --git a/src/test/regress/expected/collate.utf8.out b/src/test/regress/expected/collate.utf8.out index eff0ef21ac..63327a8fdd 100644 --- a/src/test/regress/expected/collate.utf8.out +++ b/src/test/regress/expected/collate.utf8.out @@ -134,3 +134,163 @@ SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_C_UTF8; -- same as above with cases reversed t (1 row) +-- +-- Test PG_UNICODE_FAST +-- +CREATE COLLATION 
regress_pg_unicode_fast ( + provider = builtin, locale = 'unicode'); -- fails +ERROR: invalid locale name "unicode" for builtin provider +CREATE COLLATION regress_pg_unicode_fast ( + provider = builtin, locale = 'PG_UNICODE_FAST'); +CREATE TABLE test_pg_unicode_fast ( + t TEXT COLLATE PG_UNICODE_FAST +); +INSERT INTO test_pg_unicode_fast VALUES + ('abc DEF 123abc'), + ('ábc sßs ßss DÉF'), + ('DŽxxDŽ džxxDž Džxxdž'), + ('ȺȺȺ'), + ('ⱥⱥⱥ'), + ('ⱥȺ'); +SELECT + t, lower(t), initcap(t), upper(t), + length(convert_to(t, 'UTF8')) AS t_bytes, + length(convert_to(lower(t), 'UTF8')) AS lower_t_bytes, + length(convert_to(initcap(t), 'UTF8')) AS initcap_t_bytes, + length(convert_to(upper(t), 'UTF8')) AS upper_t_bytes + FROM test_pg_unicode_fast; + t | lower | initcap | upper | t_bytes | lower_t_bytes | initcap_t_bytes | upper_t_bytes +-----------------+-----------------+------------------+-------------------+---------+---------------+-----------------+--------------- + abc DEF 123abc | abc def 123abc | Abc Def 123Abc | ABC DEF 123ABC | 14 | 14 | 14 | 14 + ábc sßs ßss DÉF | ábc sßs ßss déf | Ábc Sßs Ssss Déf | ÁBC SSSS SSSS DÉF | 19 | 19 | 19 | 19 + DŽxxDŽ džxxDž Džxxdž | džxxdž džxxdž džxxdž | Džxxdž Džxxdž Džxxdž | DŽXXDŽ DŽXXDŽ DŽXXDŽ | 20 | 20 | 20 | 20 + ȺȺȺ | ⱥⱥⱥ | Ⱥⱥⱥ | ȺȺȺ | 6 | 9 | 8 | 6 + ⱥⱥⱥ | ⱥⱥⱥ | Ⱥⱥⱥ | ȺȺȺ | 9 | 9 | 8 | 6 + ⱥȺ | ⱥⱥ | Ⱥⱥ | ȺȺ | 5 | 6 | 5 | 4 +(6 rows) + +DROP TABLE test_pg_unicode_fast; +-- test Final_Sigma +SELECT lower('ΑΣ' COLLATE PG_UNICODE_FAST); -- 0391 03A3 + lower +------- + ας +(1 row) + +SELECT lower('ΑΣ0' COLLATE PG_UNICODE_FAST); -- 0391 03A3 0030 + lower +------- + ας0 +(1 row) + +SELECT lower('ἈΣ̓' COLLATE PG_UNICODE_FAST); -- 0391 0343 03A3 0343 + lower +------- + ἀς̓ +(1 row) + +SELECT lower('ᾼΣͅ' COLLATE PG_UNICODE_FAST); -- 0391 0345 03A3 0345 + lower +------- + ᾳςͅ +(1 row) + +-- test !Final_Sigma +SELECT lower('Σ' COLLATE PG_UNICODE_FAST); -- 03A3 + lower +------- + σ +(1 row) + +SELECT lower('0Σ' COLLATE PG_UNICODE_FAST); -- 
0030 03A3 + lower +------- + 0σ +(1 row) + +SELECT lower('ΑΣΑ' COLLATE PG_UNICODE_FAST); -- 0391 03A3 0391 + lower +------- + ασα +(1 row) + +SELECT lower('ἈΣ̓Α' COLLATE PG_UNICODE_FAST); -- 0391 0343 03A3 0343 0391 + lower +------- + ἀσ̓α +(1 row) + +SELECT lower('ᾼΣͅΑ' COLLATE PG_UNICODE_FAST); -- 0391 0345 03A3 0345 0391 + lower +------- + ᾳσͅα +(1 row) + +-- properties +SELECT 'xyz' ~ '[[:alnum:]]' COLLATE PG_UNICODE_FAST; + ?column? +---------- + t +(1 row) + +SELECT 'xyz' !~ '[[:upper:]]' COLLATE PG_UNICODE_FAST; + ?column? +---------- + t +(1 row) + +SELECT '@' !~ '[[:alnum:]]' COLLATE PG_UNICODE_FAST; + ?column? +---------- + t +(1 row) + +SELECT '=' !~ '[[:punct:]]' COLLATE PG_UNICODE_FAST; -- symbols are not punctuation + ?column? +---------- + t +(1 row) + +SELECT 'a8a' ~ '[[:digit:]]' COLLATE PG_UNICODE_FAST; + ?column? +---------- + t +(1 row) + +SELECT '൧' ~ '\d' COLLATE PG_UNICODE_FAST; + ?column? +---------- + t +(1 row) + +-- case mapping +SELECT 'xYz' ~* 'XyZ' COLLATE PG_UNICODE_FAST; + ?column? +---------- + t +(1 row) + +SELECT 'xAb' ~* '[W-Y]' COLLATE PG_UNICODE_FAST; + ?column? +---------- + t +(1 row) + +SELECT 'xAb' !~* '[c-d]' COLLATE PG_UNICODE_FAST; + ?column? +---------- + t +(1 row) + +SELECT 'Δ' ~* '[γ-λ]' COLLATE PG_UNICODE_FAST; + ?column? +---------- + t +(1 row) + +SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_UNICODE_FAST; -- same as above with cases reversed + ?column? 
+---------- + t +(1 row) + diff --git a/src/test/regress/sql/collate.utf8.sql b/src/test/regress/sql/collate.utf8.sql index 1f5f9ef491..c63928d38e 100644 --- a/src/test/regress/sql/collate.utf8.sql +++ b/src/test/regress/sql/collate.utf8.sql @@ -65,3 +65,63 @@ SELECT 'xAb' ~* '[W-Y]' COLLATE PG_C_UTF8; SELECT 'xAb' !~* '[c-d]' COLLATE PG_C_UTF8; SELECT 'Δ' ~* '[γ-λ]' COLLATE PG_C_UTF8; SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_C_UTF8; -- same as above with cases reversed + +-- +-- Test PG_UNICODE_FAST +-- + +CREATE COLLATION regress_pg_unicode_fast ( + provider = builtin, locale = 'unicode'); -- fails +CREATE COLLATION regress_pg_unicode_fast ( + provider = builtin, locale = 'PG_UNICODE_FAST'); + +CREATE TABLE test_pg_unicode_fast ( + t TEXT COLLATE PG_UNICODE_FAST +); +INSERT INTO test_pg_unicode_fast VALUES + ('abc DEF 123abc'), + ('ábc sßs ßss DÉF'), + ('DŽxxDŽ džxxDž Džxxdž'), + ('ȺȺȺ'), + ('ⱥⱥⱥ'), + ('ⱥȺ'); + +SELECT + t, lower(t), initcap(t), upper(t), + length(convert_to(t, 'UTF8')) AS t_bytes, + length(convert_to(lower(t), 'UTF8')) AS lower_t_bytes, + length(convert_to(initcap(t), 'UTF8')) AS initcap_t_bytes, + length(convert_to(upper(t), 'UTF8')) AS upper_t_bytes + FROM test_pg_unicode_fast; + +DROP TABLE test_pg_unicode_fast; + +-- test Final_Sigma +SELECT lower('ΑΣ' COLLATE PG_UNICODE_FAST); -- 0391 03A3 +SELECT lower('ΑΣ0' COLLATE PG_UNICODE_FAST); -- 0391 03A3 0030 +SELECT lower('ἈΣ̓' COLLATE PG_UNICODE_FAST); -- 0391 0343 03A3 0343 +SELECT lower('ᾼΣͅ' COLLATE PG_UNICODE_FAST); -- 0391 0345 03A3 0345 + +-- test !Final_Sigma +SELECT lower('Σ' COLLATE PG_UNICODE_FAST); -- 03A3 +SELECT lower('0Σ' COLLATE PG_UNICODE_FAST); -- 0030 03A3 +SELECT lower('ΑΣΑ' COLLATE PG_UNICODE_FAST); -- 0391 03A3 0391 +SELECT lower('ἈΣ̓Α' COLLATE PG_UNICODE_FAST); -- 0391 0343 03A3 0343 0391 +SELECT lower('ᾼΣͅΑ' COLLATE PG_UNICODE_FAST); -- 0391 0345 03A3 0345 0391 + +-- properties + +SELECT 'xyz' ~ '[[:alnum:]]' COLLATE PG_UNICODE_FAST; +SELECT 'xyz' !~ '[[:upper:]]' COLLATE 
PG_UNICODE_FAST; +SELECT '@' !~ '[[:alnum:]]' COLLATE PG_UNICODE_FAST; +SELECT '=' !~ '[[:punct:]]' COLLATE PG_UNICODE_FAST; -- symbols are not punctuation +SELECT 'a8a' ~ '[[:digit:]]' COLLATE PG_UNICODE_FAST; +SELECT '൧' ~ '\d' COLLATE PG_UNICODE_FAST; + +-- case mapping + +SELECT 'xYz' ~* 'XyZ' COLLATE PG_UNICODE_FAST; +SELECT 'xAb' ~* '[W-Y]' COLLATE PG_UNICODE_FAST; +SELECT 'xAb' !~* '[c-d]' COLLATE PG_UNICODE_FAST; +SELECT 'Δ' ~* '[γ-λ]' COLLATE PG_UNICODE_FAST; +SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_UNICODE_FAST; -- same as above with cases reversed -- 2.34.1