From afefb373f8777f772c34f1b2daec88a33de74eb1 Mon Sep 17 00:00:00 2001 From: Jeff Davis Date: Mon, 19 Feb 2024 14:43:15 -0800 Subject: [PATCH v19 6/6] Add builtin collation objects PG_C_UTF8 and PG_UNICODE_FAST. --- doc/src/sgml/charset.sgml | 34 ++++++++++++++++ src/include/catalog/pg_collation.dat | 6 +++ src/test/regress/expected/collate.utf8.out | 45 ++++++++++----------- src/test/regress/sql/collate.utf8.sql | 47 ++++++++++------------ 4 files changed, 82 insertions(+), 50 deletions(-) diff --git a/doc/src/sgml/charset.sgml b/doc/src/sgml/charset.sgml index 2abd898115..47f137e1f0 100644 --- a/doc/src/sgml/charset.sgml +++ b/doc/src/sgml/charset.sgml @@ -874,6 +874,40 @@ SELECT * FROM test1 ORDER BY a || b COLLATE "fr_FR"; + + PG_C_UTF8 + + + This collation sorts by Unicode code point. Behavior is efficient and + stable within a PostgreSQL major version. + For the functions lower, + initcap, and upper it uses + Unicode simple case mapping. For pattern matching (including regular + expressions), it uses the POSIX Compatible variant of Unicode Compatibility + Properties. This collation is only available for encoding + UTF8. + + + + + + PG_UNICODE_FAST + + + This collation sorts by Unicode code point. Behavior is efficient and + stable within a PostgreSQL major version. + For the functions lower, + initcap, and upper it uses + Unicode full case mapping. For pattern matching (including regular + expressions), it uses the Standard variant of Unicode Compatibility + Properties. This collation is only available for encoding + UTF8. 
+ + + + C (equivalent to POSIX) diff --git a/src/include/catalog/pg_collation.dat b/src/include/catalog/pg_collation.dat index 938432e8a4..a187b3dcef 100644 --- a/src/include/catalog/pg_collation.dat +++ b/src/include/catalog/pg_collation.dat @@ -30,5 +30,11 @@ descr => 'sorts using the Unicode Collation Algorithm with default settings', collname => 'unicode', collprovider => 'i', collencoding => '-1', colllocale => 'und' }, +{ oid => '811', descr => 'sorts by Unicode code point; Unicode & POSIX character semantics', + collname => 'pg_c_utf8', collprovider => 'b', collencoding => '6', + colllocale => 'C.UTF-8' }, +{ oid => '812', descr => 'sorts by Unicode code point; Unicode character semantics', + collname => 'pg_unicode_fast', collprovider => 'b', collencoding => '6', + colllocale => 'PG_UNICODE_FAST' }, ] diff --git a/src/test/regress/expected/collate.utf8.out b/src/test/regress/expected/collate.utf8.out index ecf5305105..14cbc0f870 100644 --- a/src/test/regress/expected/collate.utf8.out +++ b/src/test/regress/expected/collate.utf8.out @@ -11,10 +11,8 @@ SET client_encoding TO UTF8; -- -- Test builtin PG_UNICODE_FAST locale. 
-- -CREATE COLLATION BUILTIN_UNICODE_FAST - ( provider = builtin, locale = 'PG_UNICODE_FAST' ); CREATE TABLE builtin_test1 ( - t TEXT COLLATE BUILTIN_UNICODE_FAST + t TEXT COLLATE PG_UNICODE_FAST ); INSERT INTO builtin_test1 VALUES ('abc DEF'), @@ -42,130 +40,129 @@ SELECT DROP TABLE builtin_test1; -- test Final_Sigma -SELECT lower('ΑΣ' COLLATE BUILTIN_UNICODE_FAST); -- 0391 03A3 +SELECT lower('ΑΣ' COLLATE PG_UNICODE_FAST); -- 0391 03A3 lower ------- ας (1 row) -SELECT lower('ΑΣ0' COLLATE BUILTIN_UNICODE_FAST); -- 0391 03A3 0030 +SELECT lower('ΑΣ0' COLLATE PG_UNICODE_FAST); -- 0391 03A3 0030 lower ------- ας0 (1 row) -SELECT lower('ἈΣ̓' COLLATE BUILTIN_UNICODE_FAST); -- 0391 0343 03A3 0343 +SELECT lower('ἈΣ̓' COLLATE PG_UNICODE_FAST); -- 0391 0343 03A3 0343 lower ------- ἀς̓ (1 row) -SELECT lower('ᾼΣͅ' COLLATE BUILTIN_UNICODE_FAST); -- 0391 0345 03A3 0345 +SELECT lower('ᾼΣͅ' COLLATE PG_UNICODE_FAST); -- 0391 0345 03A3 0345 lower ------- ᾳςͅ (1 row) -- test !Final_Sigma -SELECT lower('Σ' COLLATE BUILTIN_UNICODE_FAST); -- 03A3 +SELECT lower('Σ' COLLATE PG_UNICODE_FAST); -- 03A3 lower ------- σ (1 row) -SELECT lower('0Σ' COLLATE BUILTIN_UNICODE_FAST); -- 0030 03A3 +SELECT lower('0Σ' COLLATE PG_UNICODE_FAST); -- 0030 03A3 lower ------- 0σ (1 row) -SELECT lower('ΑΣΑ' COLLATE BUILTIN_UNICODE_FAST); -- 0391 03A3 0391 +SELECT lower('ΑΣΑ' COLLATE PG_UNICODE_FAST); -- 0391 03A3 0391 lower ------- ασα (1 row) -SELECT lower('ἈΣ̓Α' COLLATE BUILTIN_UNICODE_FAST); -- 0391 0343 03A3 0343 0391 +SELECT lower('ἈΣ̓Α' COLLATE PG_UNICODE_FAST); -- 0391 0343 03A3 0343 0391 lower ------- ἀσ̓α (1 row) -SELECT lower('ᾼΣͅΑ' COLLATE BUILTIN_UNICODE_FAST); -- 0391 0345 03A3 0345 0391 +SELECT lower('ᾼΣͅΑ' COLLATE PG_UNICODE_FAST); -- 0391 0345 03A3 0345 0391 lower ------- ᾳσͅα (1 row) -- properties -SELECT 'xyz' ~ '[[:alnum:]]' COLLATE BUILTIN_UNICODE_FAST; +SELECT 'xyz' ~ '[[:alnum:]]' COLLATE PG_UNICODE_FAST; ?column? 
---------- t (1 row) -SELECT 'xyz' !~ '[[:upper:]]' COLLATE BUILTIN_UNICODE_FAST; +SELECT 'xyz' !~ '[[:upper:]]' COLLATE PG_UNICODE_FAST; ?column? ---------- t (1 row) -SELECT '@' !~ '[[:alnum:]]' COLLATE BUILTIN_UNICODE_FAST; +SELECT '@' !~ '[[:alnum:]]' COLLATE PG_UNICODE_FAST; ?column? ---------- t (1 row) -SELECT '=' !~ '[[:punct:]]' COLLATE BUILTIN_UNICODE_FAST; -- symbols are not punctuation +SELECT '=' !~ '[[:punct:]]' COLLATE PG_UNICODE_FAST; -- symbols are not punctuation ?column? ---------- t (1 row) -SELECT 'a8a' ~ '[[:digit:]]' COLLATE BUILTIN_UNICODE_FAST; +SELECT 'a8a' ~ '[[:digit:]]' COLLATE PG_UNICODE_FAST; ?column? ---------- t (1 row) -SELECT '൧' ~ '\d' COLLATE BUILTIN_UNICODE_FAST; +SELECT '൧' ~ '\d' COLLATE PG_UNICODE_FAST; ?column? ---------- t (1 row) -- case mapping -SELECT 'xYz' ~* 'XyZ' COLLATE BUILTIN_UNICODE_FAST; +SELECT 'xYz' ~* 'XyZ' COLLATE PG_UNICODE_FAST; ?column? ---------- t (1 row) -SELECT 'xAb' ~* '[W-Y]' COLLATE BUILTIN_UNICODE_FAST; +SELECT 'xAb' ~* '[W-Y]' COLLATE PG_UNICODE_FAST; ?column? ---------- t (1 row) -SELECT 'xAb' !~* '[c-d]' COLLATE BUILTIN_UNICODE_FAST; +SELECT 'xAb' !~* '[c-d]' COLLATE PG_UNICODE_FAST; ?column? ---------- t (1 row) -SELECT 'Δ' ~* '[α-λ]' COLLATE BUILTIN_UNICODE_FAST; +SELECT 'Δ' ~* '[α-λ]' COLLATE PG_UNICODE_FAST; ?column? ---------- t (1 row) -SELECT 'δ' ~* '[Γ-Λ]' COLLATE BUILTIN_UNICODE_FAST; -- same as above with cases reversed +SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_UNICODE_FAST; -- same as above with cases reversed ?column? ---------- t (1 row) -DROP COLLATION BUILTIN_UNICODE_FAST; -- -- Test builtin C.UTF-8 locale. -- diff --git a/src/test/regress/sql/collate.utf8.sql b/src/test/regress/sql/collate.utf8.sql index c0da1e7cb4..0c799b24a2 100644 --- a/src/test/regress/sql/collate.utf8.sql +++ b/src/test/regress/sql/collate.utf8.sql @@ -15,11 +15,8 @@ SET client_encoding TO UTF8; -- Test builtin PG_UNICODE_FAST locale. 
-- -CREATE COLLATION BUILTIN_UNICODE_FAST - ( provider = builtin, locale = 'PG_UNICODE_FAST' ); - CREATE TABLE builtin_test1 ( - t TEXT COLLATE BUILTIN_UNICODE_FAST + t TEXT COLLATE PG_UNICODE_FAST ); INSERT INTO builtin_test1 VALUES ('abc DEF'), @@ -40,36 +37,34 @@ SELECT DROP TABLE builtin_test1; -- test Final_Sigma -SELECT lower('ΑΣ' COLLATE BUILTIN_UNICODE_FAST); -- 0391 03A3 -SELECT lower('ΑΣ0' COLLATE BUILTIN_UNICODE_FAST); -- 0391 03A3 0030 -SELECT lower('ἈΣ̓' COLLATE BUILTIN_UNICODE_FAST); -- 0391 0343 03A3 0343 -SELECT lower('ᾼΣͅ' COLLATE BUILTIN_UNICODE_FAST); -- 0391 0345 03A3 0345 +SELECT lower('ΑΣ' COLLATE PG_UNICODE_FAST); -- 0391 03A3 +SELECT lower('ΑΣ0' COLLATE PG_UNICODE_FAST); -- 0391 03A3 0030 +SELECT lower('ἈΣ̓' COLLATE PG_UNICODE_FAST); -- 0391 0343 03A3 0343 +SELECT lower('ᾼΣͅ' COLLATE PG_UNICODE_FAST); -- 0391 0345 03A3 0345 -- test !Final_Sigma -SELECT lower('Σ' COLLATE BUILTIN_UNICODE_FAST); -- 03A3 -SELECT lower('0Σ' COLLATE BUILTIN_UNICODE_FAST); -- 0030 03A3 -SELECT lower('ΑΣΑ' COLLATE BUILTIN_UNICODE_FAST); -- 0391 03A3 0391 -SELECT lower('ἈΣ̓Α' COLLATE BUILTIN_UNICODE_FAST); -- 0391 0343 03A3 0343 0391 -SELECT lower('ᾼΣͅΑ' COLLATE BUILTIN_UNICODE_FAST); -- 0391 0345 03A3 0345 0391 +SELECT lower('Σ' COLLATE PG_UNICODE_FAST); -- 03A3 +SELECT lower('0Σ' COLLATE PG_UNICODE_FAST); -- 0030 03A3 +SELECT lower('ΑΣΑ' COLLATE PG_UNICODE_FAST); -- 0391 03A3 0391 +SELECT lower('ἈΣ̓Α' COLLATE PG_UNICODE_FAST); -- 0391 0343 03A3 0343 0391 +SELECT lower('ᾼΣͅΑ' COLLATE PG_UNICODE_FAST); -- 0391 0345 03A3 0345 0391 -- properties -SELECT 'xyz' ~ '[[:alnum:]]' COLLATE BUILTIN_UNICODE_FAST; -SELECT 'xyz' !~ '[[:upper:]]' COLLATE BUILTIN_UNICODE_FAST; -SELECT '@' !~ '[[:alnum:]]' COLLATE BUILTIN_UNICODE_FAST; -SELECT '=' !~ '[[:punct:]]' COLLATE BUILTIN_UNICODE_FAST; -- symbols are not punctuation -SELECT 'a8a' ~ '[[:digit:]]' COLLATE BUILTIN_UNICODE_FAST; -SELECT '൧' ~ '\d' COLLATE BUILTIN_UNICODE_FAST; +SELECT 'xyz' ~ '[[:alnum:]]' COLLATE 
PG_UNICODE_FAST; +SELECT 'xyz' !~ '[[:upper:]]' COLLATE PG_UNICODE_FAST; +SELECT '@' !~ '[[:alnum:]]' COLLATE PG_UNICODE_FAST; +SELECT '=' !~ '[[:punct:]]' COLLATE PG_UNICODE_FAST; -- symbols are not punctuation +SELECT 'a8a' ~ '[[:digit:]]' COLLATE PG_UNICODE_FAST; +SELECT '൧' ~ '\d' COLLATE PG_UNICODE_FAST; -- case mapping -SELECT 'xYz' ~* 'XyZ' COLLATE BUILTIN_UNICODE_FAST; -SELECT 'xAb' ~* '[W-Y]' COLLATE BUILTIN_UNICODE_FAST; -SELECT 'xAb' !~* '[c-d]' COLLATE BUILTIN_UNICODE_FAST; -SELECT 'Δ' ~* '[α-λ]' COLLATE BUILTIN_UNICODE_FAST; -SELECT 'δ' ~* '[Γ-Λ]' COLLATE BUILTIN_UNICODE_FAST; -- same as above with cases reversed - -DROP COLLATION BUILTIN_UNICODE_FAST; +SELECT 'xYz' ~* 'XyZ' COLLATE PG_UNICODE_FAST; +SELECT 'xAb' ~* '[W-Y]' COLLATE PG_UNICODE_FAST; +SELECT 'xAb' !~* '[c-d]' COLLATE PG_UNICODE_FAST; +SELECT 'Δ' ~* '[α-λ]' COLLATE PG_UNICODE_FAST; +SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_UNICODE_FAST; -- same as above with cases reversed -- -- Test builtin C.UTF-8 locale. -- 2.34.1