a tsearch2 (8.2.4) dictionary that only filters out stopwords - Mailing list pgsql-patches
From | Jan Urbański |
---|---|
Subject | a tsearch2 (8.2.4) dictionary that only filters out stopwords |
Date | |
Msg-id | 4733B65A.9030707@students.mimuw.edu.pl Whole thread Raw |
Responses |
Re: a tsearch2 (8.2.4) dictionary that only filters out stopwords
|
List | pgsql-patches |
Hi, the rationale for this patch is rather complicated, as it's related to the peculiarities of Polish grammar. Please read on. I'm using PostgreSQL 8.2.4 and the ispell tsearch2 dictionary. The problem is as follows. In Polish (and possibly other languages that don't come to my mind at the moment) a noun can take different forms depending on the grammatical context. This is called declension. For example, the noun 'oda' (which means 'ode' in English) can take the form 'od' in certain cases. However, the Polish word 'od' is also a preposition. The problem with the ispell dictionary is that it first reduces a lexeme to its stem and then checks whether it is or is not a stopword. This means that I either have to agree with the fact that the tsvectors for my documents will contain large numbers of the noun 'oda' (because each time a preposition 'od' is used in the text it will be stemmed to produce 'oda' and then indexed) or I have to include the word 'oda' in the stopwords file and thus eliminate a perfectly good noun from my tsvectors. The solution I came up with was simple: write a dictionary that does only one thing: looks up the lexeme in a stopwords file and either discards it or returns NULL. That way I could use it as the first dictionary in the dictionary stack for lexeme types I'm interested in and it would discard every instance of 'od', while passing every non-stopword (in particular 'oda') to the ispell dictionary. The attached patch adds a dictionary called stop to the set of standard dictionaries that one gets after installing tsearch2. The C code may not be first-class (however it works for me in a real business solution) - it's quite trivial and I'd be happy if some more experienced Postgres hackers would implement the idea in a cleaner/safer way. It's been tested on 8.2.4 and compiles on 8.2.5. I haven't even looked at the code for 8.3 yet, but maybe the change could somehow make its way into the integrated full text search? 
Regards, Jan Urbanski Warsaw University http://fiok.pl/ -- Jan Urbanski GPG key ID: E583D7D2 ouden estin diff -Naur postgresql-8.2.4-orig/contrib/tsearch2/dict_stop.c postgresql-8.2.4/contrib/tsearch2/dict_stop.c --- postgresql-8.2.4-orig/contrib/tsearch2/dict_stop.c 1970-01-01 01:00:00.000000000 +0100 +++ postgresql-8.2.4/contrib/tsearch2/dict_stop.c 2007-09-14 19:34:12.000000000 +0200 @@ -0,0 +1,69 @@ +/* + * A dictionary that only recognizes stopwords (and discards them) + * Jan Urbanski <wulczer@students.mimuw.edu.pl> + */ +#include "postgres.h" +#include "common.h" +#include "dict.h" +#include "ts_locale.h" + + +PG_FUNCTION_INFO_V1(stop_init); +Datum stop_init(PG_FUNCTION_ARGS); + +PG_FUNCTION_INFO_V1(stop_lexize); +Datum stop_lexize(PG_FUNCTION_ARGS); + +Datum +stop_init(PG_FUNCTION_ARGS) +{ + StopList *l = (StopList *) malloc(sizeof(StopList)); + + if (!l) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + memset(l, 0, sizeof(StopList)); + l->wordop = lowerstr; + + if (!PG_ARGISNULL(0) && PG_GETARG_POINTER(0) != NULL) + { + text *in = PG_GETARG_TEXT_P(0); + + readstoplist(in, l); + sortstoplist(l); + PG_FREE_IF_COPY(in, 0); + } + + PG_RETURN_POINTER(l); +} + +Datum +stop_lexize(PG_FUNCTION_ARGS) +{ + StopList *l = (StopList *) PG_GETARG_POINTER(0); + char *in = (char *) PG_GETARG_POINTER(1); + char *utxt = pnstrdup(in, PG_GETARG_INT32(2)); + TSLexeme *res = palloc(sizeof(TSLexeme) * 2); + char *txt = lowerstr(utxt); + + int found = 0; + + pfree(utxt); + memset(res, 0, sizeof(TSLexeme) * 2); + if (*txt == '\0' || searchstoplist(l, txt)) + { + found = 1; + } + pfree(txt); + + if (found) + { + PG_RETURN_POINTER(res); + } + else + { + pfree(res); + PG_RETURN_POINTER(NULL); + } +} diff -Naur postgresql-8.2.4-orig/contrib/tsearch2/Makefile postgresql-8.2.4/contrib/tsearch2/Makefile --- postgresql-8.2.4-orig/contrib/tsearch2/Makefile 2007-09-14 19:28:48.000000000 +0200 +++ postgresql-8.2.4/contrib/tsearch2/Makefile 2007-09-14 
19:30:52.000000000 +0200 @@ -2,7 +2,7 @@ MODULE_big = tsearch2 OBJS = dict_ex.o dict.o snmap.o stopword.o common.o prs_dcfg.o \ - dict_snowball.o dict_ispell.o dict_syn.o dict_thesaurus.o \ + dict_snowball.o dict_ispell.o dict_syn.o dict_thesaurus.o dict_stop.o \ wparser.o wparser_def.o \ ts_cfg.o tsvector.o query_cleanup.o crc32.o query.o gistidx.o \ tsvector_op.o rank.o ts_stat.o \ diff -Naur postgresql-8.2.4-orig/contrib/tsearch2/tsearch.sql.in postgresql-8.2.4/contrib/tsearch2/tsearch.sql.in --- postgresql-8.2.4-orig/contrib/tsearch2/tsearch.sql.in 2007-09-14 19:28:48.000000000 +0200 +++ postgresql-8.2.4/contrib/tsearch2/tsearch.sql.in 2007-09-14 19:30:52.000000000 +0200 @@ -82,6 +82,25 @@ 'English Stemmer. Snowball.' ; +CREATE FUNCTION stop_init(internal) + RETURNS internal + as 'MODULE_PATHNAME' + LANGUAGE C; + +CREATE FUNCTION stop_lexize(internal,internal,int4) + RETURNS internal + as 'MODULE_PATHNAME' + LANGUAGE C + RETURNS NULL ON NULL INPUT; + +insert into pg_ts_dict select + 'stop', + 'stop_init(internal)', + 'contrib/english.stop', + 'stop_lexize(internal,internal,int4)', + 'Stopwords sieve. Must have stopwords file.' +; + CREATE FUNCTION snb_ru_init_koi8(internal) RETURNS internal as 'MODULE_PATHNAME'
Attachment
pgsql-patches by date: