Re: Empty string in lexeme for tsvector - Mailing list pgsql-hackers
From | Tom Lane |
---|---|
Subject | Re: Empty string in lexeme for tsvector |
Date | |
Msg-id | 258839.1635972057@sss.pgh.pa.us Whole thread Raw |
In response to | Re: Empty string in lexeme for tsvector (Jean-Christophe Arnu <jcarnu@gmail.com>) |
List | pgsql-hackers |
Jean-Christophe Arnu <jcarnu@gmail.com> writes: >> (By the same token, I think there's a good argument for >> tsvector_delete_arr to just ignore nulls, not throw an error. >> That's a somewhat orthogonal issue, though.) > Nulls are now ignored in tsvector_delete_arr(). I think setweight() with an array should handle this the same as ts_delete() with an array, so the attached v6 does it like that. > I also wonder if we should not also consider changing COPY FROM behaviour > on empty string lexemes. > Current version is just crashing on empty string lexemes. Should > we allow them anyway as COPY FROM input (it seems weird not to be able > to re-import dumped data) or "skipping them" just like array_to_tsvector() > does in the patched version (that may lead to database content changes) or > finally should we not change COPY behaviour ? No, I don't think so. tsvector's restriction against empty lexemes was certainly intentional from the beginning, so I wouldn't be surprised if we'd run into semantic difficulties if we remove it. Moreover, we're going on fourteen years with that restriction and we've had few complaints, so there's no field demand to loosen it. It's clearly just an oversight that array_to_tsvector() failed to enforce the restriction. I polished the docs and tests a bit more, too. I think the attached is committable -- any objections? regards, tom lane diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index 4b49dff2ff..24447c0017 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -12920,8 +12920,10 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple <returnvalue>tsvector</returnvalue> </para> <para> - Converts an array of lexemes to a <type>tsvector</type>. - The given strings are used as-is without further processing. + Converts an array of text strings to a <type>tsvector</type>. + The given strings are used as lexemes as-is, without further + processing. Array elements must not be empty strings + or <literal>NULL</literal>. </para> <para> <literal>array_to_tsvector('{fat,cat,rat}'::text[])</literal> @@ -13104,6 +13106,9 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple Assigns the specified <parameter>weight</parameter> to elements of the <parameter>vector</parameter> that are listed in <parameter>lexemes</parameter>. + The strings in <parameter>lexemes</parameter> are taken as lexemes + as-is, without further processing. Strings that do not match any + lexeme in <parameter>vector</parameter> are ignored. </para> <para> <literal>setweight('fat:2,4 cat:3 rat:5,6B'::tsvector, 'A', '{cat,rat}')</literal> @@ -13265,6 +13270,8 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple <para> Removes any occurrence of the given <parameter>lexeme</parameter> from the <parameter>vector</parameter>. + The <parameter>lexeme</parameter> string is treated as a lexeme as-is, + without further processing. </para> <para> <literal>ts_delete('fat:2,4 cat:3 rat:5A'::tsvector, 'fat')</literal> @@ -13281,6 +13288,9 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple Removes any occurrences of the lexemes in <parameter>lexemes</parameter> from the <parameter>vector</parameter>. + The strings in <parameter>lexemes</parameter> are taken as lexemes + as-is, without further processing. Strings that do not match any + lexeme in <parameter>vector</parameter> are ignored. </para> <para> <literal>ts_delete('fat:2,4 cat:3 rat:5A'::tsvector, ARRAY['fat','rat'])</literal> diff --git a/src/backend/utils/adt/tsvector_op.c b/src/backend/utils/adt/tsvector_op.c index 9236ebcc8f..11ccb5297c 100644 --- a/src/backend/utils/adt/tsvector_op.c +++ b/src/backend/utils/adt/tsvector_op.c @@ -322,10 +322,9 @@ tsvector_setweight_by_filter(PG_FUNCTION_ARGS) int lex_len, lex_pos; + /* Ignore null array elements, they surely don't match */ if (nulls[i]) - ereport(ERROR, - (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), - errmsg("lexeme array may not contain nulls"))); + continue; lex = VARDATA(dlexemes[i]); lex_len = VARSIZE(dlexemes[i]) - VARHDRSZ; @@ -602,10 +601,9 @@ tsvector_delete_arr(PG_FUNCTION_ARGS) int lex_len, lex_pos; + /* Ignore null array elements, they surely don't match */ if (nulls[i]) - ereport(ERROR, - (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), - errmsg("lexeme array may not contain nulls"))); + continue; lex = VARDATA(dlexemes[i]); lex_len = VARSIZE(dlexemes[i]) - VARHDRSZ; @@ -761,13 +759,21 @@ array_to_tsvector(PG_FUNCTION_ARGS) deconstruct_array(v, TEXTOID, -1, false, TYPALIGN_INT, &dlexemes, &nulls, &nitems); - /* Reject nulls (maybe we should just ignore them, instead?) */ + /* + * Reject nulls and zero length strings (maybe we should just ignore them, + * instead?) + */ for (i = 0; i < nitems; i++) { if (nulls[i]) ereport(ERROR, (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), errmsg("lexeme array may not contain nulls"))); + + if (VARSIZE(dlexemes[i]) - VARHDRSZ == 0) + ereport(ERROR, + (errcode(ERRCODE_ZERO_LENGTH_CHARACTER_STRING), + errmsg("lexeme array may not contain empty strings"))); } /* Sort and de-dup, because this is required for a valid tsvector. */ diff --git a/src/test/regress/expected/tstypes.out b/src/test/regress/expected/tstypes.out index 2601e312df..92c1c6e10b 100644 --- a/src/test/regress/expected/tstypes.out +++ b/src/test/regress/expected/tstypes.out @@ -85,6 +85,10 @@ SELECT 'a:3A b:2a'::tsvector || 'ba:1234 a:1B'; 'a':3A,4B 'b':2A 'ba':1237 (1 row) +SELECT $$'' '1' '2'$$::tsvector; -- error, empty lexeme is not allowed +ERROR: syntax error in tsvector: "'' '1' '2'" +LINE 1: SELECT $$'' '1' '2'$$::tsvector; + ^ --Base tsquery test SELECT '1'::tsquery; tsquery @@ -1258,8 +1262,12 @@ SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceshi 'base' 'hidden' 'strike' (1 row) -SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel', NULL]); -ERROR: lexeme array may not contain nulls +SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel', '', NULL]); + ts_delete +-------------------------- + 'base' 'hidden' 'strike' +(1 row) + SELECT unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector); unnest --------------------------------------------- @@ -1328,8 +1336,11 @@ SELECT array_to_tsvector(ARRAY['base','hidden','rebel','spaceship','strike']); 'base' 'hidden' 'rebel' 'spaceship' 'strike' (1 row) +-- null and empty string are disallowed, since we mustn't make an empty lexeme SELECT array_to_tsvector(ARRAY['base','hidden','rebel','spaceship', NULL]); ERROR: lexeme array may not contain nulls +SELECT array_to_tsvector(ARRAY['base','hidden','rebel','spaceship', '']); +ERROR: lexeme array may not contain empty strings -- array_to_tsvector must sort and de-dup SELECT array_to_tsvector(ARRAY['foo','bar','baz','bar']); array_to_tsvector @@ -1367,14 +1378,12 @@ SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', ' 'a':1C,3C 'asd':1C 'w':5,6,12B,13A 'zxc':81C,222C,567C (1 row) -SELECT setweight('a asd w:5,6,12B,13A zxc'::tsvector, 'c', '{a,zxc}'); +SELECT setweight('a asd w:5,6,12B,13A zxc'::tsvector, 'c', ARRAY['a', 'zxc', '', NULL]); setweight --------------------------------- 'a' 'asd' 'w':5,6,12B,13A 'zxc' (1 row) -SELECT setweight('a asd w:5,6,12B,13A zxc'::tsvector, 'c', ARRAY['a', 'zxc', NULL]); -ERROR: lexeme array may not contain nulls SELECT ts_filter('base:7A empir:17 evil:15 first:11 galact:16 hidden:6A rebel:1A spaceship:2A strike:3A victori:12 won:9'::tsvector,'{a}'); ts_filter ------------------------------------------------------------- diff --git a/src/test/regress/sql/tstypes.sql b/src/test/regress/sql/tstypes.sql index 30c8c702f0..61e8f49c91 100644 --- a/src/test/regress/sql/tstypes.sql +++ b/src/test/regress/sql/tstypes.sql @@ -17,6 +17,7 @@ SELECT $$'\\as' ab\c ab\\c AB\\\c ab\\\\c$$::tsvector; SELECT tsvectorin(tsvectorout($$'\\as' ab\c ab\\c AB\\\c ab\\\\c$$::tsvector)); SELECT '''w'':4A,3B,2C,1D,5 a:8'; SELECT 'a:3A b:2a'::tsvector || 'ba:1234 a:1B'; +SELECT $$'' '1' '2'$$::tsvector; -- error, empty lexeme is not allowed --Base tsquery test SELECT '1'::tsquery; @@ -239,7 +240,7 @@ SELECT ts_delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3': SELECT ts_delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceship','leya','rebel']); SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel']); SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel','rebel']); -SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel', NULL]); +SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel', '', NULL]); SELECT unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector); SELECT unnest('base hidden rebel spaceship strike'::tsvector); @@ -251,7 +252,9 @@ SELECT tsvector_to_array('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D st SELECT tsvector_to_array('base hidden rebel spaceship strike'::tsvector); SELECT array_to_tsvector(ARRAY['base','hidden','rebel','spaceship','strike']); +-- null and empty string are disallowed, since we mustn't make an empty lexeme SELECT array_to_tsvector(ARRAY['base','hidden','rebel','spaceship', NULL]); +SELECT array_to_tsvector(ARRAY['base','hidden','rebel','spaceship', '']); -- array_to_tsvector must sort and de-dup SELECT array_to_tsvector(ARRAY['foo','bar','baz','bar']); @@ -260,8 +263,7 @@ SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c'); SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', '{a}'); SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', '{a}'); SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', '{a,zxc}'); -SELECT setweight('a asd w:5,6,12B,13A zxc'::tsvector, 'c', '{a,zxc}'); -SELECT setweight('a asd w:5,6,12B,13A zxc'::tsvector, 'c', ARRAY['a', 'zxc', NULL]); +SELECT setweight('a asd w:5,6,12B,13A zxc'::tsvector, 'c', ARRAY['a', 'zxc', '', NULL]); SELECT ts_filter('base:7A empir:17 evil:15 first:11 galact:16 hidden:6A rebel:1A spaceship:2A strike:3A victori:12 won:9'::tsvector,'{a}'); SELECT ts_filter('base hidden rebel spaceship strike'::tsvector, '{a}');
pgsql-hackers by date: