Thread: a tsearch2 (8.2.4) dictionary that only filters out stopwords

a tsearch2 (8.2.4) dictionary that only filters out stopwords

From
Jan Urbański
Date:
Hi,

the rationale for this patch is rather complicated, as it's related to
the peculiarities of Polish grammar. Please read on.

I'm using PostgreSQL 8.2.4 and the ispell tsearch2 dictionary. The
problem is as follows. In Polish (and possibly other languages that
don't come to my mind at the moment) a noun can take different forms
depending on the grammatical context. This is called declension. For
example the noun 'oda' (which means 'ode' in English) can take the form
'od' in certain cases. However, the word in Polish 'od' is also a
preposition. The problem with the ispell dictionary is that it first
reduces a lexeme to its stem and then checks whether it is or is not a
stopword.

This means that I either have to agree with the fact that the tsvectors
for my documents will contain large numbers of the noun 'oda' (because
each time a preposition 'od' is used in the text it will be stemmed to
produce 'oda' and then indexed) or I have to include the word 'oda' in
the stopwords file and thus eliminate a perfectly good noun from my
tsvectors.

The solution I came up with was simple: write a dictionary, that does
only one thing: looks up the lexeme in a stopwords file and either
discards it or returns NULL. That way I could use it as the first
dictionary in the dictionary stack for lexeme types I'm interested in
and it would discard every instance of 'od', while passing every
non-stopword (in particular 'oda') to the ispell dictionary.

The attached patch adds a dictionary called stop to the set of standard
dictionaries that one gets after installing tsearch2. The C code may not
be first-class (however it works for me in a real business solution) -
it's quite trivial and I'd be happy if some more experienced Postgres
hackers would implement the idea in a cleaner/safer way. It's been
tested on 8.2.4 and compiles on 8.2.5. I haven't even looked at the code
for 8.3 yet, but maybe the change could somehow make its way into the
integrated full text search?

Regards,
Jan Urbanski
Warsaw University
http://fiok.pl/

--
Jan Urbanski
GPG key ID: E583D7D2

ouden estin
diff -Naur postgresql-8.2.4-orig/contrib/tsearch2/dict_stop.c postgresql-8.2.4/contrib/tsearch2/dict_stop.c
--- postgresql-8.2.4-orig/contrib/tsearch2/dict_stop.c    1970-01-01 01:00:00.000000000 +0100
+++ postgresql-8.2.4/contrib/tsearch2/dict_stop.c    2007-09-14 19:34:12.000000000 +0200
@@ -0,0 +1,69 @@
+/*
+ * A dictionary that only recognizes stopwords (and discards them)
+ * Jan Urbanski <wulczer@students.mimuw.edu.pl>
+ */
+#include "postgres.h"
+#include "common.h"
+#include "dict.h"
+#include "ts_locale.h"
+
+
+PG_FUNCTION_INFO_V1(stop_init);
+Datum        stop_init(PG_FUNCTION_ARGS);
+
+PG_FUNCTION_INFO_V1(stop_lexize);
+Datum        stop_lexize(PG_FUNCTION_ARGS);
+
+Datum
+stop_init(PG_FUNCTION_ARGS)
+{
+    StopList *l = (StopList *) malloc(sizeof(StopList));
+
+    if (!l)
+        ereport(ERROR,
+                (errcode(ERRCODE_OUT_OF_MEMORY),
+                 errmsg("out of memory")));
+    memset(l, 0, sizeof(StopList));
+    l->wordop = lowerstr;
+
+    if (!PG_ARGISNULL(0) && PG_GETARG_POINTER(0) != NULL)
+    {
+        text       *in = PG_GETARG_TEXT_P(0);
+
+        readstoplist(in, l);
+        sortstoplist(l);
+        PG_FREE_IF_COPY(in, 0);
+    }
+
+    PG_RETURN_POINTER(l);
+}
+
+Datum
+stop_lexize(PG_FUNCTION_ARGS)
+{
+    StopList   *l = (StopList *) PG_GETARG_POINTER(0);
+    char       *in = (char *) PG_GETARG_POINTER(1);
+    char       *utxt = pnstrdup(in, PG_GETARG_INT32(2));
+    TSLexeme   *res = palloc(sizeof(TSLexeme) * 2);
+    char       *txt = lowerstr(utxt);
+
+    int       found = 0;
+
+    pfree(utxt);
+    memset(res, 0, sizeof(TSLexeme) * 2);
+    if (*txt == '\0' || searchstoplist(l, txt))
+    {
+        found = 1;
+    }
+    pfree(txt);
+
+    if (found)
+    {
+        PG_RETURN_POINTER(res);
+    }
+    else
+    {
+        pfree(res);
+        PG_RETURN_POINTER(NULL);
+    }
+}
diff -Naur postgresql-8.2.4-orig/contrib/tsearch2/Makefile postgresql-8.2.4/contrib/tsearch2/Makefile
--- postgresql-8.2.4-orig/contrib/tsearch2/Makefile    2007-09-14 19:28:48.000000000 +0200
+++ postgresql-8.2.4/contrib/tsearch2/Makefile    2007-09-14 19:30:52.000000000 +0200
@@ -2,7 +2,7 @@

 MODULE_big = tsearch2
 OBJS = dict_ex.o dict.o snmap.o stopword.o common.o prs_dcfg.o \
-       dict_snowball.o dict_ispell.o dict_syn.o dict_thesaurus.o \
+       dict_snowball.o dict_ispell.o dict_syn.o dict_thesaurus.o dict_stop.o \
        wparser.o wparser_def.o \
        ts_cfg.o tsvector.o query_cleanup.o crc32.o query.o gistidx.o \
        tsvector_op.o rank.o ts_stat.o \
diff -Naur postgresql-8.2.4-orig/contrib/tsearch2/tsearch.sql.in postgresql-8.2.4/contrib/tsearch2/tsearch.sql.in
--- postgresql-8.2.4-orig/contrib/tsearch2/tsearch.sql.in    2007-09-14 19:28:48.000000000 +0200
+++ postgresql-8.2.4/contrib/tsearch2/tsearch.sql.in    2007-09-14 19:30:52.000000000 +0200
@@ -82,6 +82,25 @@
     'English Stemmer. Snowball.'
 ;

+CREATE FUNCTION stop_init(internal)
+    RETURNS internal
+    as 'MODULE_PATHNAME'
+    LANGUAGE C;
+
+CREATE FUNCTION stop_lexize(internal,internal,int4)
+    RETURNS internal
+    as 'MODULE_PATHNAME'
+    LANGUAGE C
+    RETURNS NULL ON NULL INPUT;
+
+insert into pg_ts_dict select
+    'stop',
+    'stop_init(internal)',
+    'contrib/english.stop',
+    'stop_lexize(internal,internal,int4)',
+    'Stopwords sieve. Must have stopwords file.'
+;
+
 CREATE FUNCTION snb_ru_init_koi8(internal)
     RETURNS internal
     as 'MODULE_PATHNAME'

Attachment

Re: a tsearch2 (8.2.4) dictionary that only filters out stopwords

From
Tom Lane
Date:
=?UTF-8?B?SmFuIFVyYmHFhHNraQ==?= <j.urbanski@students.mimuw.edu.pl> writes:
> The solution I came up with was simple: write a dictionary, that does
> only one thing: looks up the lexeme in a stopwords file and either
> discards it or returns NULL.

Doesn't the "simple" dictionary handle this?

            regards, tom lane

Re: a tsearch2 (8.2.4) dictionary that only filters out stopwords

From
Jan Urbański
Date:
>> The solution I came up with was simple: write a dictionary, that does
>> only one thing: looks up the lexeme in a stopwords file and either
>> discards it or returns NULL.
>
> Doesn't the "simple" dictionary handle this?

I don't think so. The 'simple' dictionary discards stopwords, but
accepts any other lexemes. So if I use {'simple', 'pl_ispell'} for my
config, I'll get rid of the stopwords, but I won't get any lexemes
stemmed by ispell. Every lexeme that's not a stopword will produce the
very same lexeme (this is how I think the 'simple' dictionary works).

My dictionary does basically the same thing as the 'simple' dictionary,
but it returns NULL instead of the original lexeme in case the lexeme is
not found in the stopwords file.

Regards,
--
Jan Urbanski
GPG key ID: E583D7D2

ouden estin


Attachment

Re: a tsearch2 (8.2.4) dictionary that only filters out stopwords

From
Heikki Linnakangas
Date:
Jan Urbański wrote:
>>> The solution I came up with was simple: write a dictionary, that does
>>> only one thing: looks up the lexeme in a stopwords file and either
>>> discards it or returns NULL.
>> Doesn't the "simple" dictionary handle this?
>
> I don't think so. The 'simple' dictionary discards stopwords, but
> accepts any other lexemes. So if use {'simple', 'pl_ispell'} for my
> config, I'll get rid of the stopwords, but I won't get any lexemes
> stemmed by ispell. Every lexeme that's not a stopword will produce the
> very same lexeme (this is how I think the 'simple' dictionary works).
>
> My dictionary does basically the same thing as the 'simple' dictionary,
> but it returns NULL instead of the original lexeme in case the lexeme is
> not found in the stopwords file.

In the long term, what we really need is a more flexible way to chain
dictionaries. In this case, you would first check against one stopword
list, eliminating 'od', then check the ispell dictionary, and then check
another stopword list without 'od'.

I suggested that a while ago
(http://archives.postgresql.org/pgsql-hackers/2007-08/msg01036.php).
Hopefully Oleg or someone else gets around to restructuring the
dictionaries in a future release.

I wonder if you could hack the ispell dictionary file to treat oda
specially?

--
   Heikki Linnakangas
   EnterpriseDB   http://www.enterprisedb.com

Re: a tsearch2 (8.2.4) dictionary that only filters out stopwords

From
Jan Urbański
Date:
> dictionaries. In this case, you would first check against one stopword
> list, eliminating 'od', then check the ispell dictionary, and then check
> another stopword list without 'od'.

My problem is basically solved using the patch I sent earlier. I use
'{stop, pl_ispell, simple}' which has the effect of:
a) eliminating words that are stopwords but stemmed produce
non-stopwords (such as  'od', that gets stemmed to 'oda')
b) stemming non-stopwords properly (using an ispell dictionary)
c) indexing words that are not recognized by ispell (for instance
'postgresql' gets indexed as 'postgresql')

> I suggested that a while ago
> (http://archives.postgresql.org/pgsql-hackers/2007-08/msg01036.php).
> Hopefully Oleg or someone else gets around restructuring the
> dictionaries in a future release.

I'm glad to see I'm not the only one who is in need of a more
sophisticated way of dealing with dictionary chaining. I understand
however the problems that arise when one wants to extend the dictionary
API beyond the reject/accept/pass-on schema. For these three we have an
easy way of passing the result from lexize - it returns an empty array,
an array of stemmed lexemes or NULL. If more complex actions were to be
taken, I'm afraid lexize would have to return something more complex
than just text[].

> I wonder if you could hack the ispell dictionary file to treat oda
> specially?

I thought about it, but it turned out that writing a custom dictionary
was easier than figuring out how ispell works internally.

Regards,
--
Jan Urbanski
GPG key ID: E583D7D2

ouden estin


Attachment

Re: a tsearch2 (8.2.4) dictionary that only filters out stopwords

From
Jan Urbański
Date:
> This example still doesn't seem very convincing --- why would you not
> merely attach the stopword list to the pl_ispell dictionary?

Because the ispell-based dictionaries first stem the lexeme and then
search for it in the stopwords file. The situation here is that a
stopword is first stemmed to produce another lexeme (which is not in the
stopwords file, as it's a perfectly valid word) and then gets indexed,
instead of being discarded.
To restate: the word 'od' in Polish is both a preposition and a declined
form of the noun 'oda'. The ispell dictionary when passed the lexeme
'od' first stems it to produce 'oda' and then fails to find it in the
stopwords file. If I'd include the word 'oda' in the stopwords file, I'd
be losing information about the noun 'oda' appearing in documents.

I'm still trying to find an English example, as I'm sure it would be
easier to understand by most readers of this list. Nothing comes to my
mind, however - I guess some languages just have rotten luck with their
grammar.

> If there is a use-case for it, IMHO it'd be better to add a boolean
> accept-or-pass-on parameter to the "simple" dictionary than to add a
> whole new dictionary type.

Ah, I never thought of it. You may be very right - it does look like an
easier solution. However, it would require coding some basic parsing
logic into the dex_init procedure, because right now the 'simple'
dictionary expects dict_initoption to be a path to the stopwords file.
Do you mean something like 'StopFile="/path/to/stopwords",
AcceptUnknown=0'?

Regards,
Jan Urbanski
--
Jan Urbanski
GPG key ID: E583D7D2

ouden estin


Attachment

Re: a tsearch2 (8.2.4) dictionary that only filters out stopwords

From
Tom Lane
Date:
=?UTF-8?B?SmFuIFVyYmHFhHNraQ==?= <j.urbanski@students.mimuw.edu.pl> writes:
>> If there is a use-case for it, IMHO it'd be better to add a boolean
>> accept-or-pass-on parameter to the "simple" dictionary than to add a
>> whole new dictionary type.

> Ah, I never thought of it. You may be very right - it does look like an
> easier solution. However, it would require coding some basic parsing
> logic into the dex_init procedure, because right now the 'simple'
> dictionary expects dict_initoption to be a path to the stopwords file.

That doesn't have a whole lot to do with where we are today:
http://developer.postgresql.org/pgdocs/postgres/textsearch-dictionaries.html#TEXTSEARCH-SIMPLE-DICTIONARY
http://developer.postgresql.org/cvsweb.cgi/pgsql/src/backend/tsearch/dict_simple.c

            regards, tom lane

Re: a tsearch2 (8.2.4) dictionary that only filters out stopwords

From
Jan Urbański
Date:
> That doesn't have a whole lot to do with where we are today:
> http://developer.postgresql.org/pgdocs/postgres/textsearch-dictionaries.html#TEXTSEARCH-SIMPLE-DICTIONARY
> http://developer.postgresql.org/cvsweb.cgi/pgsql/src/backend/tsearch/dict_simple.c

Great, I didn't know the API was that convenient in 8.3. I'll try
posting a working patch for 8.3 during the weekend.

Regards,
--
Jan Urbanski
GPG key ID: E583D7D2

ouden estin


Attachment

Re: a tsearch2 (8.2.4) dictionary that only filters out stopwords

From
Tom Lane
Date:
> +            defstring = defGetString(defel);
> +            if (pg_strcasecmp(defstring, "True") == 0)
> +                d->acceptAll = true;
> +            else if (pg_strcasecmp(defstring, "False") == 0)
> +                d->acceptAll = false;
> +            else
> +                ereport(ERROR,
> +                    (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
> +                     errmsg("invalid value for AcceptAll parameter: \"%s\"",
> +                             defstring)));

This bit should be replaced with defGetBoolean.  Otherwise it looks
reasonably sane.

            regards, tom lane

Re: a tsearch2 (8.2.4) dictionary that only filters out stopwords

From
Jan Urbański
Date:
Jan Urbański wrote:
> Great, I didn't know the API was that convenient in 8.3. I'll try
> posting a working patch for 8.3 during the weekend.

Here's the patch for 8.3beta2. As was suggested I added a configuration
parameter to the 'simple' dictionary called AcceptAll so now it can work
in two modes: either accept everything (the default) or do not
recognize anything (return NULL). Of course stopwords are still being
weeded out.

The patch includes changes to the documentation (which was inconsistent
by the way: it stated that the 'simple' dictionary returns NULL for
stopwords, when in fact it returns an empty array).

Regards,
Jan Urbanski
--
Jan Urbanski
GPG key ID: E583D7D2

ouden estin
diff -Naur postgresql-8.3beta2-orig/doc/src/sgml/textsearch.sgml postgresql-8.3beta2/doc/src/sgml/textsearch.sgml
--- postgresql-8.3beta2-orig/doc/src/sgml/textsearch.sgml    2007-10-27 02:19:45.000000000 +0200
+++ postgresql-8.3beta2/doc/src/sgml/textsearch.sgml    2007-11-14 03:35:48.000000000 +0100
@@ -2090,9 +2090,10 @@
    <para>
     The <literal>simple</> dictionary template operates by converting the
     input token to lower case and checking it against a file of stop words.
-    If it is found in the file then <literal>NULL</> is returned, causing
-    the token to be discarded.  If not, the lower-cased form of the word
-    is returned as the normalized lexeme.
+    If it is found in the file then an empty array is returned. If not, the
+    return value depends on the configuration. The default is to return the
+    lower-cased form of the word, but one might choose to
+    return <literal>NULL</> instead.
    </para>

    <para>
@@ -2135,6 +2136,34 @@
 </programlisting>
    </para>

+   <para>
+     We can also choose to return <literal>NULL</> instead of the lower-cased
+     lexeme if it is not found in the stop words file. This can be useful if
+     we just want to pass the unchanged lexeme to another dictionary instead
+     of reporting it as recognized. We can control this behaviour through
+     the <literal>AcceptAll</> parameter. Correct values for this parameter
+     are <literal>true</> and <literal>false</>, the default
+     is <literal>true</>.
+   </para>
+
+   <para>
+     Using the same configuration as in the previous example:
+
+<programlisting>
+ALTER TEXT SEARCH DICTIONARY public.simple_dict ( AcceptAll = false );
+
+SELECT ts_lexize('public.simple_dict','YeS');
+ ts_lexize
+-----------
+
+
+SELECT ts_lexize('public.simple_dict','The');
+ ts_lexize
+-----------
+ {}
+</programlisting>
+   </para>
+
    <caution>
     <para>
      Most types of dictionaries rely on configuration files, such as files of
diff -Naur postgresql-8.3beta2-orig/src/backend/tsearch/dict_simple.c
postgresql-8.3beta2/src/backend/tsearch/dict_simple.c
--- postgresql-8.3beta2-orig/src/backend/tsearch/dict_simple.c    2007-08-25 02:03:59.000000000 +0200
+++ postgresql-8.3beta2/src/backend/tsearch/dict_simple.c    2007-11-14 03:39:45.000000000 +0100
@@ -23,6 +23,7 @@
 typedef struct
 {
     StopList    stoplist;
+    bool        acceptAll;
 } DictSimple;


@@ -31,8 +32,12 @@
 {
     List       *dictoptions = (List *) PG_GETARG_POINTER(0);
     DictSimple *d = (DictSimple *) palloc0(sizeof(DictSimple));
-    bool        stoploaded = false;
+    bool        stoploaded = false,
+            acceptloaded = false;
     ListCell   *l;
+    const char    *defstring;
+
+    d->acceptAll = true;

     foreach(l, dictoptions)
     {
@@ -47,6 +52,24 @@
             readstoplist(defGetString(defel), &d->stoplist, lowerstr);
             stoploaded = true;
         }
+        else if (pg_strcasecmp("AcceptAll", defel->defname) == 0)
+        {
+            if (acceptloaded)
+                ereport(ERROR,
+                    (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                     errmsg("multiple AcceptAll parameters")));
+            defstring = defGetString(defel);
+            if (pg_strcasecmp(defstring, "True") == 0)
+                d->acceptAll = true;
+            else if (pg_strcasecmp(defstring, "False") == 0)
+                d->acceptAll = false;
+            else
+                ereport(ERROR,
+                    (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                     errmsg("invalid value for AcceptAll parameter: \"%s\"",
+                             defstring)));
+            acceptloaded = true;
+        }
         else
         {
             ereport(ERROR,
@@ -71,9 +94,18 @@
     txt = lowerstr_with_len(in, len);

     if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
+    {
         pfree(txt);
+        PG_RETURN_POINTER(res);
+    }
     else
-        res[0].lexeme = txt;
-
-    PG_RETURN_POINTER(res);
+    {
+        if (d->acceptAll)
+        {
+            res[0].lexeme = txt;
+            PG_RETURN_POINTER(res);
+        }
+        else
+            PG_RETURN_POINTER(NULL);
+    }
 }

Attachment

Re: a tsearch2 (8.2.4) dictionary that only filters out stopwords

From
Jan Urbański
Date:
> This bit should be replaced with defGetBoolean.  Otherwise it looks
> reasonably sane.

Fixed that, thank you.

Regards,
Jan Urbanski
--
Jan Urbanski
GPG key ID: E583D7D2

ouden estin
diff -Naur postgresql-8.3beta2-orig/doc/src/sgml/textsearch.sgml postgresql-8.3beta2/doc/src/sgml/textsearch.sgml
--- postgresql-8.3beta2-orig/doc/src/sgml/textsearch.sgml    2007-10-27 02:19:45.000000000 +0200
+++ postgresql-8.3beta2/doc/src/sgml/textsearch.sgml    2007-11-14 03:35:48.000000000 +0100
@@ -2090,9 +2090,10 @@
    <para>
     The <literal>simple</> dictionary template operates by converting the
     input token to lower case and checking it against a file of stop words.
-    If it is found in the file then <literal>NULL</> is returned, causing
-    the token to be discarded.  If not, the lower-cased form of the word
-    is returned as the normalized lexeme.
+    If it is found in the file then an empty array is returned. If not, the
+    return value depends on the configuration. The default is to return the
+    lower-cased form of the word, but one might choose to
+    return <literal>NULL</> instead.
    </para>

    <para>
@@ -2135,6 +2136,34 @@
 </programlisting>
    </para>

+   <para>
+     We can also choose to return <literal>NULL</> instead of the lower-cased
+     lexeme if it is not found in the stop words file. This can be useful if
+     we just want to pass the unchanged lexeme to another dictionary instead
+     of reporting it as recognized. We can control this behaviour through
+     the <literal>AcceptAll</> parameter. Correct values for this parameter
+     are <literal>true</> and <literal>false</>, the default
+     is <literal>true</>.
+   </para>
+
+   <para>
+     Using the same configuration as in the previous example:
+
+<programlisting>
+ALTER TEXT SEARCH DICTIONARY public.simple_dict ( AcceptAll = false );
+
+SELECT ts_lexize('public.simple_dict','YeS');
+ ts_lexize
+-----------
+
+
+SELECT ts_lexize('public.simple_dict','The');
+ ts_lexize
+-----------
+ {}
+</programlisting>
+   </para>
+
    <caution>
     <para>
      Most types of dictionaries rely on configuration files, such as files of
diff -Naur postgresql-8.3beta2-orig/src/backend/tsearch/dict_simple.c
postgresql-8.3beta2/src/backend/tsearch/dict_simple.c
--- postgresql-8.3beta2-orig/src/backend/tsearch/dict_simple.c    2007-08-25 02:03:59.000000000 +0200
+++ postgresql-8.3beta2/src/backend/tsearch/dict_simple.c    2007-11-14 12:17:05.000000000 +0100
@@ -23,6 +23,7 @@
 typedef struct
 {
     StopList    stoplist;
+    bool        acceptAll;
 } DictSimple;


@@ -31,9 +32,12 @@
 {
     List       *dictoptions = (List *) PG_GETARG_POINTER(0);
     DictSimple *d = (DictSimple *) palloc0(sizeof(DictSimple));
-    bool        stoploaded = false;
+    bool        stoploaded = false,
+            acceptloaded = false;
     ListCell   *l;

+    d->acceptAll = true;
+
     foreach(l, dictoptions)
     {
         DefElem    *defel = (DefElem *) lfirst(l);
@@ -47,6 +51,18 @@
             readstoplist(defGetString(defel), &d->stoplist, lowerstr);
             stoploaded = true;
         }
+        else if (pg_strcasecmp("AcceptAll", defel->defname) == 0)
+        {
+            if (acceptloaded)
+                ereport(ERROR,
+                    (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                     errmsg("multiple AcceptAll parameters")));
+            if (defGetBoolean(defel))
+                d->acceptAll = true;
+            else
+                d->acceptAll = false;
+            acceptloaded = true;
+        }
         else
         {
             ereport(ERROR,
@@ -71,9 +87,18 @@
     txt = lowerstr_with_len(in, len);

     if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
+    {
         pfree(txt);
+        PG_RETURN_POINTER(res);
+    }
     else
-        res[0].lexeme = txt;
-
-    PG_RETURN_POINTER(res);
+    {
+        if (d->acceptAll)
+        {
+            res[0].lexeme = txt;
+            PG_RETURN_POINTER(res);
+        }
+        else
+            PG_RETURN_POINTER(NULL);
+    }
 }

Attachment

Re: a tsearch2 (8.2.4) dictionary that only filters out stopwords

From
Tom Lane
Date:
This patch:
http://archives.postgresql.org/pgsql-patches/2007-11/msg00137.php
seems simple and useful enough that I think we ought to slip it into
8.3, even though we are far past feature freeze.

As the "simple" dictionary type stands in CVS HEAD, it is only useful as
the last dictionary in a stack, since it never passes anything on as
unrecognized.  With the proposed AcceptAll = false option, it could be
used to filter out some stopwords before feeding tokens to another
dictionary.  While most dictionary types have their own stopword support,
some of them match stopwords after their own normalization processing,
and so there's no way to filter on pre-normalized words.  That seems
like a good improvement, even without the specific need-example that
Jan provided at the start of the thread.

Normally we'd never consider adding a new feature so late in the
development cycle, but this seems small enough and useful enough
to make an exception.  Comments?

            regards, tom lane

Re: a tsearch2 (8.2.4) dictionary that only filters out stopwords

From
Bruce Momjian
Date:
Tom Lane wrote:
> This patch:
> http://archives.postgresql.org/pgsql-patches/2007-11/msg00137.php
> seems simple and useful enough that I think we ought to slip it into
> 8.3, even though we are far past feature freeze.
>
> As the "simple" dictionary type stands in CVS HEAD, it is only useful as
> the last dictionary in a stack, since it never passes anything on as
> unrecognized.  With the proposed AcceptAll = false option, it could be
> used to filter out some stopwords before feeding tokens to another
> dictionary.  While most dictionary types have their own stopword support,
> some of them match stopwords after their own normalization processing,
> and so there's no way to filter on pre-normalized words.  That seems
> like a good improvement, even without the specific need-example that
> Jan provided at the start of the thread.
>
> Normally we'd never consider adding a new feature so late in the
> development cycle, but this seems small enough and useful enough
> to make an exception.  Comments?

Agreed.  The logic is that textsearch is getting a major overhaul in 8.3
and it is reasonable to keep adjusting things during beta.

--
  Bruce Momjian  <bruce@momjian.us>        http://momjian.us
  EnterpriseDB                             http://postgres.enterprisedb.com

  + If your life is a hard drive, Christ can be your backup. +

Re: a tsearch2 (8.2.4) dictionary that only filters out stopwords

From
Oleg Bartunov
Date:
In principle the right way is to allow any dictionary to have an option
like 'PassThrough' and an internal function get_dict_options(dict, option)
to check if the PassThrough option is true.
Let's consider one example - removing accents.
In the past I always recommended that people use regex functions before
to_tsvector conversion to remove accents, but recently I noticed that
such a trick doesn't work with headline(). So, the only way is to have a
special dictionary dict_remove_accent before, which works as a filter.

I don't remember why we left this for future releases, though.

Oleg
On Wed, 14 Nov 2007, Tom Lane wrote:

> This patch:
> http://archives.postgresql.org/pgsql-patches/2007-11/msg00137.php
> seems simple and useful enough that I think we ought to slip it into
> 8.3, even though we are far past feature freeze.
>
> As the "simple" dictionary type stands in CVS HEAD, it is only useful as
> the last dictionary in a stack, since it never passes anything on as
> unrecognized.  With the proposed AcceptAll = false option, it could be
> used to filter out some stopwords before feeding tokens to another
> dictionary.  While most dictionary types have their own stopword support,
> some of them match stopwords after their own normalization processing,
> and so there's no way to filter on pre-normalized words.  That seems
> like a good improvement, even without the specific need-example that
> Jan provided at the start of the thread.
>
> Normally we'd never consider adding a new feature so late in the
> development cycle, but this seems small enough and useful enough
> to make an exception.  Comments?
>
>             regards, tom lane
>

     Regards,
         Oleg
_____________________________________________________________
Oleg Bartunov, Research Scientist, Head of AstroNet (www.astronet.ru),
Sternberg Astronomical Institute, Moscow University, Russia
Internet: oleg@sai.msu.su, http://www.sai.msu.su/~megera/
phone: +007(495)939-16-83, +007(495)939-23-83

Re: a tsearch2 (8.2.4) dictionary that only filters out stopwords

From
Tom Lane
Date:
Oleg Bartunov <oleg@sai.msu.su> writes:
> Let's consider one example - removing accents.
> In the past I always recommend people to use regex functions before
> to_tsvector conversion to remove accents, but recently I was noticed that
> such trick doesn't work with headline(). So, the only way is to have
> special dictionary dict_remove_accent before, which  works as a filter.

> I don't remember why do we left this for future releases, though.

That would require a system-to-dictionary API change (to be able to
modify the token under inspection), no?  So it's certainly something
I'd say is too late for 8.3.

One thought that came to mind is that the option name should be just
"Accept" not "AcceptAll".  To me "All" implies that it would accept
*everything* ... including stopwords.

            regards, tom lane

Re: a tsearch2 (8.2.4) dictionary that only filters out stopwords

From
Oleg Bartunov
Date:
On Wed, 14 Nov 2007, Tom Lane wrote:

> Oleg Bartunov <oleg@sai.msu.su> writes:
>> Let's consider one example - removing accents.
>> In the past I always recommend people to use regex functions before
>> to_tsvector conversion to remove accents, but recently I was noticed that
>> such trick doesn't work with headline(). So, the only way is to have
>> special dictionary dict_remove_accent before, which  works as a filter.
>
>> I don't remember why do we left this for future releases, though.
>
> That would require a system-to-dictionary API change (to be able to
> modify the token under inspection), no?  So it's certainly something

It requires one reserved option for dictionaries and  ability to get dictionary
option.  Unless somebody have dictionary with the same option, this change
looks harmless.

> I'd say is too late for 8.3.

yes, probably we get better idea.

>
> One thought that came to mind is that the option name should be just
> "Accept" not "AcceptAll".  To me "All" implies that it would accept
> *everything* ... including stopwords.

Wait, this reminds me of the problem with filters. How will it work with thesaurus?

     Regards,
         Oleg
_____________________________________________________________
Oleg Bartunov, Research Scientist, Head of AstroNet (www.astronet.ru),
Sternberg Astronomical Institute, Moscow University, Russia
Internet: oleg@sai.msu.su, http://www.sai.msu.su/~megera/
phone: +007(495)939-16-83, +007(495)939-23-83

Re: a tsearch2 (8.2.4) dictionary that only filters out stopwords

From
Tom Lane
Date:
Oleg Bartunov <oleg@sai.msu.su> writes:
> On Wed, 14 Nov 2007, Tom Lane wrote:
>> One thought that came to mind is that the option name should be just
>> "Accept" not "AcceptAll".  To me "All" implies that it would accept
>> *everything* ... including stopwords.

> wait, I remind the problem with filters. How it will works with thesaurus ?

Huh?  This is just an option for the "simple" dictionary, it's got
nothing to do with thesaurus AFAICS.

            regards, tom lane

Re: a tsearch2 (8.2.4) dictionary that only filters out stopwords

From
Tom Lane
Date:
=?UTF-8?B?SmFuIFVyYmHFhHNraQ==?= <j.urbanski@students.mimuw.edu.pl> writes:
>> This bit should be replaced with defGetBoolean.  Otherwise it looks
>> reasonably sane.

> Fixed that, thank you.

Applied with minor revisions (changed the parameter name, avoided
probably-insignificant memory leak).

            regards, tom lane

Re: a tsearch2 (8.2.4) dictionary that only filters out stopwords

From
Oleg Bartunov
Date:
On Wed, 14 Nov 2007, Tom Lane wrote:

> Oleg Bartunov <oleg@sai.msu.su> writes:
>> On Wed, 14 Nov 2007, Tom Lane wrote:
>>> One thought that came to mind is that the option name should be just
>>> "Accept" not "AcceptAll".  To me "All" implies that it would accept
>>> *everything* ... including stopwords.
>
>> wait, I remind the problem with filters. How it will works with thesaurus ?
>
> Huh?  This is just an option for the "simple" dictionary, it's got
> nothing to do with thesaurus AFAICS.

I can assign simple dictionary as a normalization dictionary for thesaurus

     Regards,
         Oleg
_____________________________________________________________
Oleg Bartunov, Research Scientist, Head of AstroNet (www.astronet.ru),
Sternberg Astronomical Institute, Moscow University, Russia
Internet: oleg@sai.msu.su, http://www.sai.msu.su/~megera/
phone: +007(495)939-16-83, +007(495)939-23-83

Re: a tsearch2 (8.2.4) dictionary that only filters out stopwords

From
Tom Lane
Date:
Oleg Bartunov <oleg@sai.msu.su> writes:
> On Wed, 14 Nov 2007, Tom Lane wrote:
>> Huh?  This is just an option for the "simple" dictionary, it's got
>> nothing to do with thesaurus AFAICS.

> I can assign the simple dictionary as a normalization dictionary for a thesaurus.

Sure.  So what?  You wouldn't use this option in that case.

            regards, tom lane

Re: a tsearch2 (8.2.4) dictionary that only filters out stopwords

From
Oleg Bartunov
Date:
On Wed, 14 Nov 2007, Tom Lane wrote:

> Oleg Bartunov <oleg@sai.msu.su> writes:
>> On Wed, 14 Nov 2007, Tom Lane wrote:
>>> Huh?  This is just an option for the "simple" dictionary, it's got
>>> nothing to do with thesaurus AFAICS.
>
>> I can assign the simple dictionary as a normalization dictionary for a thesaurus.
>
> Sure.  So what?  You wouldn't use this option in that case.

Right. That should be documented to avoid possible confusion.

     Regards,
         Oleg
_____________________________________________________________
Oleg Bartunov, Research Scientist, Head of AstroNet (www.astronet.ru),
Sternberg Astronomical Institute, Moscow University, Russia
Internet: oleg@sai.msu.su, http://www.sai.msu.su/~megera/
phone: +007(495)939-16-83, +007(495)939-23-83

Re: a tsearch2 (8.2.4) dictionary that only filters out stopwords

From
Bruce Momjian
Date:
Added to TODO:

* Allow text search dictionary to filter out only stop words

  http://archives.postgresql.org/pgsql-patches/2007-11/msg00081.php


---------------------------------------------------------------------------

Tom Lane wrote:
> Oleg Bartunov <oleg@sai.msu.su> writes:
> > Let's consider one example - removing accents.
> > In the past I always recommended that people use regex functions before
> > the to_tsvector conversion to remove accents, but recently I noticed that
> > this trick doesn't work with headline(). So, the only way is to put a
> > special dictionary, dict_remove_accent, first, which works as a filter.
>
> > I don't remember why we left this for future releases, though.
>
> That would require a system-to-dictionary API change (to be able to
> modify the token under inspection), no?  So it's certainly something
> I'd say is too late for 8.3.
>
> One thought that came to mind is that the option name should be just
> "Accept" not "AcceptAll".  To me "All" implies that it would accept
> *everything* ... including stopwords.
>
>             regards, tom lane
>
> ---------------------------(end of broadcast)---------------------------
> TIP 4: Have you searched our list archives?
>
>                http://archives.postgresql.org

--
  Bruce Momjian  <bruce@momjian.us>        http://momjian.us
  EnterpriseDB                             http://postgres.enterprisedb.com

  + If your life is a hard drive, Christ can be your backup. +

Bruce Momjian <bruce@momjian.us> writes:
> Added to TODO:

> * Allow text search dictionary to filter out only stop words

>   http://archives.postgresql.org/pgsql-patches/2007-11/msg00081.php

That's a poor description.  I thought the TODO was something more like
"allow dictionaries to change the token that is passed on to later
dictionaries".

            regards, tom lane

Tom Lane wrote:
> Bruce Momjian <bruce@momjian.us> writes:
> > Added to TODO:
>
> > * Allow text search dictionary to filter out only stop words
>
> >   http://archives.postgresql.org/pgsql-patches/2007-11/msg00081.php
>
> That's a poor description.  I thought the TODO was something more like
> "allow dictionaries to change the token that is passed on to later
> dictionaries".

TODO updated as described.

--
  Bruce Momjian  <bruce@momjian.us>        http://momjian.us
  EnterpriseDB                             http://enterprisedb.com

  + If your life is a hard drive, Christ can be your backup. +