a tsearch2 (8.2.4) dictionary that only filters out stopwords - Mailing list pgsql-patches

From Jan Urbański
Subject a tsearch2 (8.2.4) dictionary that only filters out stopwords
Date
Msg-id 4733B65A.9030707@students.mimuw.edu.pl
Whole thread Raw
Responses Re: a tsearch2 (8.2.4) dictionary that only filters out stopwords
List pgsql-patches
Hi,

the rationale for this patch is rather complicated, as it's related to
the peculiarities of Polish grammar. Please read on.

I'm using PostgreSQL 8.2.4 and the ispell tsearch2 dictionary. The
problem is as follows. In Polish (and possibly other languages that
don't come to my mind at the moment) a noun can take different forms
depending on the grammatical context. This is called declension. For
example, the noun 'oda' (which means 'ode' in English) can take the form
'od' in certain cases. However, the Polish word 'od' is also a
preposition. The problem with the ispell dictionary is that it first
reduces a lexeme to its stem and only then checks whether that stem is a
stopword.

This means that I either have to accept that the tsvectors
for my documents will contain large numbers of the noun 'oda' (because
each time a preposition 'od' is used in the text it will be stemmed to
produce 'oda' and then indexed) or I have to include the word 'oda' in
the stopwords file and thus eliminate a perfectly good noun from my
tsvectors.

The solution I came up with was simple: write a dictionary that does
only one thing: looks up the lexeme in a stopwords file and either
discards it (if it is a stopword) or returns NULL (if it is not). That
way I could use it as the first dictionary in the dictionary stack for
the lexeme types I'm interested in, and it would discard every instance
of 'od', while passing every non-stopword (in particular 'oda') to the
ispell dictionary.

The attached patch adds a dictionary called stop to the set of standard
dictionaries that one gets after installing tsearch2. The C code may not
be first-class (however it works for me in a real business solution) -
it's quite trivial and I'd be happy if some more experienced Postgres
hackers would implement the idea in a cleaner/safer way. It's been
tested on 8.2.4 and compiles on 8.2.5. I haven't even looked at the code
for 8.3 yet, but maybe the change could somehow make its way into the
integrated full text search?

Regards,
Jan Urbanski
Warsaw University
http://fiok.pl/

--
Jan Urbanski
GPG key ID: E583D7D2

ouden estin
diff -Naur postgresql-8.2.4-orig/contrib/tsearch2/dict_stop.c postgresql-8.2.4/contrib/tsearch2/dict_stop.c
--- postgresql-8.2.4-orig/contrib/tsearch2/dict_stop.c    1970-01-01 01:00:00.000000000 +0100
+++ postgresql-8.2.4/contrib/tsearch2/dict_stop.c    2007-09-14 19:34:12.000000000 +0200
@@ -0,0 +1,69 @@
+/*
+ * A dictionary that only recognizes stopwords (and discards them)
+ * Jan Urbanski <wulczer@students.mimuw.edu.pl>
+ */
+#include "postgres.h"
+#include "common.h"
+#include "dict.h"
+#include "ts_locale.h"
+
+
+PG_FUNCTION_INFO_V1(stop_init);
+Datum        stop_init(PG_FUNCTION_ARGS);
+
+PG_FUNCTION_INFO_V1(stop_lexize);
+Datum        stop_lexize(PG_FUNCTION_ARGS);
+
+Datum
+stop_init(PG_FUNCTION_ARGS)
+{
+    StopList *l = (StopList *) malloc(sizeof(StopList));
+
+    if (!l)
+        ereport(ERROR,
+                (errcode(ERRCODE_OUT_OF_MEMORY),
+                 errmsg("out of memory")));
+    memset(l, 0, sizeof(StopList));
+    l->wordop = lowerstr;
+
+    if (!PG_ARGISNULL(0) && PG_GETARG_POINTER(0) != NULL)
+    {
+        text       *in = PG_GETARG_TEXT_P(0);
+
+        readstoplist(in, l);
+        sortstoplist(l);
+        PG_FREE_IF_COPY(in, 0);
+    }
+
+    PG_RETURN_POINTER(l);
+}
+
+Datum
+stop_lexize(PG_FUNCTION_ARGS)
+{
+    StopList   *l = (StopList *) PG_GETARG_POINTER(0);
+    char       *in = (char *) PG_GETARG_POINTER(1);
+    char       *utxt = pnstrdup(in, PG_GETARG_INT32(2));
+    TSLexeme   *res = palloc(sizeof(TSLexeme) * 2);
+    char       *txt = lowerstr(utxt);
+
+    int       found = 0;
+
+    pfree(utxt);
+    memset(res, 0, sizeof(TSLexeme) * 2);
+    if (*txt == '\0' || searchstoplist(l, txt))
+    {
+        found = 1;
+    }
+    pfree(txt);
+
+    if (found)
+    {
+        PG_RETURN_POINTER(res);
+    }
+    else
+    {
+        pfree(res);
+        PG_RETURN_POINTER(NULL);
+    }
+}
diff -Naur postgresql-8.2.4-orig/contrib/tsearch2/Makefile postgresql-8.2.4/contrib/tsearch2/Makefile
--- postgresql-8.2.4-orig/contrib/tsearch2/Makefile    2007-09-14 19:28:48.000000000 +0200
+++ postgresql-8.2.4/contrib/tsearch2/Makefile    2007-09-14 19:30:52.000000000 +0200
@@ -2,7 +2,7 @@

 MODULE_big = tsearch2
 OBJS = dict_ex.o dict.o snmap.o stopword.o common.o prs_dcfg.o \
-       dict_snowball.o dict_ispell.o dict_syn.o dict_thesaurus.o \
+       dict_snowball.o dict_ispell.o dict_syn.o dict_thesaurus.o dict_stop.o \
        wparser.o wparser_def.o \
        ts_cfg.o tsvector.o query_cleanup.o crc32.o query.o gistidx.o \
        tsvector_op.o rank.o ts_stat.o \
diff -Naur postgresql-8.2.4-orig/contrib/tsearch2/tsearch.sql.in postgresql-8.2.4/contrib/tsearch2/tsearch.sql.in
--- postgresql-8.2.4-orig/contrib/tsearch2/tsearch.sql.in    2007-09-14 19:28:48.000000000 +0200
+++ postgresql-8.2.4/contrib/tsearch2/tsearch.sql.in    2007-09-14 19:30:52.000000000 +0200
@@ -82,6 +82,25 @@
     'English Stemmer. Snowball.'
 ;

+CREATE FUNCTION stop_init(internal)
+    RETURNS internal
+    as 'MODULE_PATHNAME'
+    LANGUAGE C;
+
+CREATE FUNCTION stop_lexize(internal,internal,int4)
+    RETURNS internal
+    as 'MODULE_PATHNAME'
+    LANGUAGE C
+    RETURNS NULL ON NULL INPUT;
+
+insert into pg_ts_dict select
+    'stop',
+    'stop_init(internal)',
+    'contrib/english.stop',
+    'stop_lexize(internal,internal,int4)',
+    'Stopwords sieve. Must have stopwords file.'
+;
+
 CREATE FUNCTION snb_ru_init_koi8(internal)
     RETURNS internal
     as 'MODULE_PATHNAME'

Attachment

pgsql-patches by date:

Previous
From: Bruce Momjian
Date:
Subject: Re: [HACKERS] Connection Pools and DISCARD ALL
Next
From: Tom Lane
Date:
Subject: Re: a tsearch2 (8.2.4) dictionary that only filters out stopwords