Re: Empty string in lexeme for tsvector - Mailing list pgsql-hackers

From Tom Lane
Subject Re: Empty string in lexeme for tsvector
Date
Msg-id 258839.1635972057@sss.pgh.pa.us
Whole thread Raw
In response to Re: Empty string in lexeme for tsvector  (Jean-Christophe Arnu <jcarnu@gmail.com>)
List pgsql-hackers
Jean-Christophe Arnu <jcarnu@gmail.com> writes:
>> (By the same token, I think there's a good argument for
>> tsvector_delete_arr to just ignore nulls, not throw an error.
>> That's a somewhat orthogonal issue, though.)

> Nulls are now ignored in tsvector_delete_arr().

I think setweight() with an array should handle this the same as
ts_delete() with an array, so the attached v6 does it like that.

> I also wonder if we should not also consider changing COPY FROM  behaviour
> on empty string lexemes.
> Current version is just crashing on empty string lexemes. Should
> we allow them  anyway as COPY FROM input (it seems weird not to be able
> to re-import dumped data) or "skipping them" just like array_to_tsvector()
> does in the patched version (that may lead to database content changes) or
> finally should we not change COPY behaviour ?

No, I don't think so.  tsvector's restriction against empty lexemes was
certainly intentional from the beginning, so I wouldn't be surprised if
we'd run into semantic difficulties if we remove it.  Moreover, we're
going on fourteen years with that restriction and we've had few
complaints, so there's no field demand to loosen it.  It's clearly just
an oversight that array_to_tsvector() failed to enforce the restriction.

I polished the docs and tests a bit more, too.  I think the attached
is committable -- any objections?

            regards, tom lane

diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 4b49dff2ff..24447c0017 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -12920,8 +12920,10 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple
         <returnvalue>tsvector</returnvalue>
        </para>
        <para>
-        Converts an array of lexemes to a <type>tsvector</type>.
-        The given strings are used as-is without further processing.
+        Converts an array of text strings to a <type>tsvector</type>.
+        The given strings are used as lexemes as-is, without further
+        processing.  Array elements must not be empty strings
+        or <literal>NULL</literal>.
        </para>
        <para>
         <literal>array_to_tsvector('{fat,cat,rat}'::text[])</literal>
@@ -13104,6 +13106,9 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple
         Assigns the specified <parameter>weight</parameter> to elements
         of the <parameter>vector</parameter> that are listed
         in <parameter>lexemes</parameter>.
+        The strings in <parameter>lexemes</parameter> are taken as lexemes
+        as-is, without further processing.  Strings that do not match any
+        lexeme in <parameter>vector</parameter> are ignored.
        </para>
        <para>
         <literal>setweight('fat:2,4 cat:3 rat:5,6B'::tsvector, 'A', '{cat,rat}')</literal>
@@ -13265,6 +13270,8 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple
        <para>
         Removes any occurrence of the given <parameter>lexeme</parameter>
         from the <parameter>vector</parameter>.
+        The <parameter>lexeme</parameter> string is treated as a lexeme as-is,
+        without further processing.
        </para>
        <para>
         <literal>ts_delete('fat:2,4 cat:3 rat:5A'::tsvector, 'fat')</literal>
@@ -13281,6 +13288,9 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple
         Removes any occurrences of the lexemes
         in <parameter>lexemes</parameter>
         from the <parameter>vector</parameter>.
+        The strings in <parameter>lexemes</parameter> are taken as lexemes
+        as-is, without further processing.  Strings that do not match any
+        lexeme in <parameter>vector</parameter> are ignored.
        </para>
        <para>
         <literal>ts_delete('fat:2,4 cat:3 rat:5A'::tsvector, ARRAY['fat','rat'])</literal>
diff --git a/src/backend/utils/adt/tsvector_op.c b/src/backend/utils/adt/tsvector_op.c
index 9236ebcc8f..11ccb5297c 100644
--- a/src/backend/utils/adt/tsvector_op.c
+++ b/src/backend/utils/adt/tsvector_op.c
@@ -322,10 +322,9 @@ tsvector_setweight_by_filter(PG_FUNCTION_ARGS)
         int            lex_len,
                     lex_pos;

+        /* Ignore null array elements, they surely don't match */
         if (nulls[i])
-            ereport(ERROR,
-                    (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
-                     errmsg("lexeme array may not contain nulls")));
+            continue;

         lex = VARDATA(dlexemes[i]);
         lex_len = VARSIZE(dlexemes[i]) - VARHDRSZ;
@@ -602,10 +601,9 @@ tsvector_delete_arr(PG_FUNCTION_ARGS)
         int            lex_len,
                     lex_pos;

+        /* Ignore null array elements, they surely don't match */
         if (nulls[i])
-            ereport(ERROR,
-                    (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
-                     errmsg("lexeme array may not contain nulls")));
+            continue;

         lex = VARDATA(dlexemes[i]);
         lex_len = VARSIZE(dlexemes[i]) - VARHDRSZ;
@@ -761,13 +759,21 @@ array_to_tsvector(PG_FUNCTION_ARGS)

     deconstruct_array(v, TEXTOID, -1, false, TYPALIGN_INT, &dlexemes, &nulls, &nitems);

-    /* Reject nulls (maybe we should just ignore them, instead?) */
+    /*
+     * Reject nulls and zero length strings (maybe we should just ignore them,
+     * instead?)
+     */
     for (i = 0; i < nitems; i++)
     {
         if (nulls[i])
             ereport(ERROR,
                     (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
                      errmsg("lexeme array may not contain nulls")));
+
+        if (VARSIZE(dlexemes[i]) - VARHDRSZ == 0)
+            ereport(ERROR,
+                    (errcode(ERRCODE_ZERO_LENGTH_CHARACTER_STRING),
+                     errmsg("lexeme array may not contain empty strings")));
     }

     /* Sort and de-dup, because this is required for a valid tsvector. */
diff --git a/src/test/regress/expected/tstypes.out b/src/test/regress/expected/tstypes.out
index 2601e312df..92c1c6e10b 100644
--- a/src/test/regress/expected/tstypes.out
+++ b/src/test/regress/expected/tstypes.out
@@ -85,6 +85,10 @@ SELECT 'a:3A b:2a'::tsvector || 'ba:1234 a:1B';
  'a':3A,4B 'b':2A 'ba':1237
 (1 row)

+SELECT $$'' '1' '2'$$::tsvector;  -- error, empty lexeme is not allowed
+ERROR:  syntax error in tsvector: "'' '1' '2'"
+LINE 1: SELECT $$'' '1' '2'$$::tsvector;
+               ^
 --Base tsquery test
 SELECT '1'::tsquery;
  tsquery
@@ -1258,8 +1262,12 @@ SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceshi
  'base' 'hidden' 'strike'
 (1 row)

-SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel', NULL]);
-ERROR:  lexeme array may not contain nulls
+SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel', '', NULL]);
+        ts_delete
+--------------------------
+ 'base' 'hidden' 'strike'
+(1 row)
+
 SELECT unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
                    unnest
 ---------------------------------------------
@@ -1328,8 +1336,11 @@ SELECT array_to_tsvector(ARRAY['base','hidden','rebel','spaceship','strike']);
  'base' 'hidden' 'rebel' 'spaceship' 'strike'
 (1 row)

+-- null and empty string are disallowed, since we mustn't make an empty lexeme
 SELECT array_to_tsvector(ARRAY['base','hidden','rebel','spaceship', NULL]);
 ERROR:  lexeme array may not contain nulls
+SELECT array_to_tsvector(ARRAY['base','hidden','rebel','spaceship', '']);
+ERROR:  lexeme array may not contain empty strings
 -- array_to_tsvector must sort and de-dup
 SELECT array_to_tsvector(ARRAY['foo','bar','baz','bar']);
  array_to_tsvector
@@ -1367,14 +1378,12 @@ SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', '
  'a':1C,3C 'asd':1C 'w':5,6,12B,13A 'zxc':81C,222C,567C
 (1 row)

-SELECT setweight('a asd w:5,6,12B,13A zxc'::tsvector, 'c', '{a,zxc}');
+SELECT setweight('a asd w:5,6,12B,13A zxc'::tsvector, 'c', ARRAY['a', 'zxc', '', NULL]);
             setweight
 ---------------------------------
  'a' 'asd' 'w':5,6,12B,13A 'zxc'
 (1 row)

-SELECT setweight('a asd w:5,6,12B,13A zxc'::tsvector, 'c', ARRAY['a', 'zxc', NULL]);
-ERROR:  lexeme array may not contain nulls
 SELECT ts_filter('base:7A empir:17 evil:15 first:11 galact:16 hidden:6A rebel:1A spaceship:2A strike:3A victori:12
won:9'::tsvector,'{a}'); 
                           ts_filter
 -------------------------------------------------------------
diff --git a/src/test/regress/sql/tstypes.sql b/src/test/regress/sql/tstypes.sql
index 30c8c702f0..61e8f49c91 100644
--- a/src/test/regress/sql/tstypes.sql
+++ b/src/test/regress/sql/tstypes.sql
@@ -17,6 +17,7 @@ SELECT $$'\\as' ab\c ab\\c AB\\\c ab\\\\c$$::tsvector;
 SELECT tsvectorin(tsvectorout($$'\\as' ab\c ab\\c AB\\\c ab\\\\c$$::tsvector));
 SELECT '''w'':4A,3B,2C,1D,5 a:8';
 SELECT 'a:3A b:2a'::tsvector || 'ba:1234 a:1B';
+SELECT $$'' '1' '2'$$::tsvector;  -- error, empty lexeme is not allowed

 --Base tsquery test
 SELECT '1'::tsquery;
@@ -239,7 +240,7 @@ SELECT ts_delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3':
 SELECT ts_delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector,
ARRAY['spaceship','leya','rebel']);
 SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel']);
 SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel','rebel']);
-SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel', NULL]);
+SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel', '', NULL]);

 SELECT unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
 SELECT unnest('base hidden rebel spaceship strike'::tsvector);
@@ -251,7 +252,9 @@ SELECT tsvector_to_array('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D st
 SELECT tsvector_to_array('base hidden rebel spaceship strike'::tsvector);

 SELECT array_to_tsvector(ARRAY['base','hidden','rebel','spaceship','strike']);
+-- null and empty string are disallowed, since we mustn't make an empty lexeme
 SELECT array_to_tsvector(ARRAY['base','hidden','rebel','spaceship', NULL]);
+SELECT array_to_tsvector(ARRAY['base','hidden','rebel','spaceship', '']);
 -- array_to_tsvector must sort and de-dup
 SELECT array_to_tsvector(ARRAY['foo','bar','baz','bar']);

@@ -260,8 +263,7 @@ SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c');
 SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', '{a}');
 SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', '{a}');
 SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', '{a,zxc}');
-SELECT setweight('a asd w:5,6,12B,13A zxc'::tsvector, 'c', '{a,zxc}');
-SELECT setweight('a asd w:5,6,12B,13A zxc'::tsvector, 'c', ARRAY['a', 'zxc', NULL]);
+SELECT setweight('a asd w:5,6,12B,13A zxc'::tsvector, 'c', ARRAY['a', 'zxc', '', NULL]);

 SELECT ts_filter('base:7A empir:17 evil:15 first:11 galact:16 hidden:6A rebel:1A spaceship:2A strike:3A victori:12
won:9'::tsvector,'{a}'); 
 SELECT ts_filter('base hidden rebel spaceship strike'::tsvector, '{a}');

pgsql-hackers by date:

Previous
From: Daniel Gustafsson
Date:
Subject: Re: remove internal support in pgcrypto?
Next
From: Daniel Gustafsson
Date:
Subject: Re: On login trigger: take three