Re: improvements for dict_xsyn extended synonym dictionary - RRR - Mailing list pgsql-hackers
From | karpov@sao.ru (Sergey V. Karpov) |
---|---|
Subject | Re: improvements for dict_xsyn extended synonym dictionary - RRR |
Date | |
Msg-id | 8763dad34e.fsf@sao.ru Whole thread Raw |
In response to | Re: improvements for dict_xsyn extended synonym dictionary - RRR (Andres Freund <andres@anarazel.de>) |
Responses | Re: improvements for dict_xsyn extended synonym dictionary - RRR |
List | pgsql-hackers |
Andres Freund <andres@anarazel.de> writes: Hi Andres, > Looks nice. The only small gripe I have is that the patch adds trailing > whitespaces at a lot of places... > > Except maybe that I do see no need for changes anymore... My fault. Please check the patch version attached - I've tried to fix all those. Thanks, Sergey Index: contrib/dict_xsyn/dict_xsyn.c =================================================================== RCS file: /projects/cvsroot/pgsql/contrib/dict_xsyn/dict_xsyn.c,v retrieving revision 1.6 diff -u -r1.6 dict_xsyn.c --- contrib/dict_xsyn/dict_xsyn.c 1 Jan 2009 17:23:32 -0000 1.6 +++ contrib/dict_xsyn/dict_xsyn.c 30 Jul 2009 07:37:54 -0000 @@ -26,6 +26,7 @@ char *key; /* Word */ char *value; /* Unparsed list of synonyms, including the * word itself */ + int pos; /* Position of key word in original string */ } Syn; typedef struct @@ -33,7 +34,10 @@ int len; Syn *syn; + bool matchorig; bool keeporig; + bool matchsynonyms; + bool keepsynonyms; } DictSyn; @@ -88,6 +92,7 @@ { char *value; char *key; + char *pos; char *end = NULL; if (*line == '\0') @@ -96,26 +101,39 @@ value = lowerstr(line); pfree(line); - key = find_word(value, &end); - if (!key) - { - pfree(value); - continue; - } + pos = value; - if (cur == d->len) + while((key = find_word(pos, &end)) != NULL) { - d->len = (d->len > 0) ? 2 * d->len : 16; - if (d->syn) - d->syn = (Syn *) repalloc(d->syn, sizeof(Syn) * d->len); - else - d->syn = (Syn *) palloc(sizeof(Syn) * d->len); - } + if (cur == d->len) + { + d->len = (d->len > 0) ? 
2 * d->len : 16; + if (d->syn) + d->syn = (Syn *) repalloc(d->syn, sizeof(Syn) * d->len); + else + d->syn = (Syn *) palloc(sizeof(Syn) * d->len); + } + + /* Read first word only if we will match it */ + if (pos != value || d->matchorig) + { + d->syn[cur].key = pnstrdup(key, end - key); + d->syn[cur].value = pstrdup(value); + d->syn[cur].pos = key - value; - d->syn[cur].key = pnstrdup(key, end - key); - d->syn[cur].value = value; + cur++; + } + + pos = end; + + /* Don't read synonyms if we do not match them */ + if (!d->matchsynonyms) + { + break; + } + } - cur++; + pfree(value); } tsearch_readline_end(&trst); @@ -133,23 +151,40 @@ List *dictoptions = (List *) PG_GETARG_POINTER(0); DictSyn *d; ListCell *l; + char *filename = NULL; d = (DictSyn *) palloc0(sizeof(DictSyn)); d->len = 0; d->syn = NULL; + d->matchorig = true; d->keeporig = true; + d->matchsynonyms = false; + d->keepsynonyms = true; foreach(l, dictoptions) { DefElem *defel = (DefElem *) lfirst(l); - if (pg_strcasecmp(defel->defname, "KEEPORIG") == 0) + if (pg_strcasecmp(defel->defname, "MATCHORIG") == 0) + { + d->matchorig = defGetBoolean(defel); + } + else if (pg_strcasecmp(defel->defname, "KEEPORIG") == 0) { d->keeporig = defGetBoolean(defel); } + else if (pg_strcasecmp(defel->defname, "MATCHSYNONYMS") == 0) + { + d->matchsynonyms = defGetBoolean(defel); + } + else if (pg_strcasecmp(defel->defname, "KEEPSYNONYMS") == 0) + { + d->keepsynonyms = defGetBoolean(defel); + } else if (pg_strcasecmp(defel->defname, "RULES") == 0) { - read_dictionary(d, defGetString(defel)); + /* we can't read the rules before parsing all options! 
*/ + filename = pstrdup(defGetString(defel)); } else { @@ -160,6 +195,12 @@ } } + if(filename) + { + read_dictionary(d, filename); + pfree(filename); + } + PG_RETURN_POINTER(d); } @@ -198,7 +239,6 @@ int value_length = strlen(value); char *pos = value; int nsyns = 0; - bool is_first = true; res = palloc(0); @@ -214,8 +254,8 @@ res = repalloc(res, sizeof(TSLexeme) * (nsyns + 2)); res[nsyns].lexeme = NULL; - /* first word is added to result only if KEEPORIG flag is set */ - if (d->keeporig || !is_first) + /* The first word is added only if keeporig=true */ + if (pos != value || d->keeporig) { res[nsyns].lexeme = pstrdup(syn); res[nsyns + 1].lexeme = NULL; @@ -223,9 +263,12 @@ nsyns++; } - is_first = false; - pos = end + 1; + + if(!d->keepsynonyms) + { + break; + } } pfree(value); Index: contrib/dict_xsyn/expected/dict_xsyn.out =================================================================== RCS file: /projects/cvsroot/pgsql/contrib/dict_xsyn/expected/dict_xsyn.out,v retrieving revision 1.1 diff -u -r1.1 dict_xsyn.out --- contrib/dict_xsyn/expected/dict_xsyn.out 15 Oct 2007 21:36:50 -0000 1.1 +++ contrib/dict_xsyn/expected/dict_xsyn.out 30 Jul 2009 07:37:54 -0000 @@ -5,10 +5,76 @@ SET client_min_messages = warning; \set ECHO none RESET client_min_messages; ---configuration -ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false); +-- default configuration - match first word and return it among with all synonyms +ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=true, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=false); --lexize SELECT ts_lexize('xsyn', 'supernova'); + ts_lexize +-------------------------- + {supernova,sn,sne,1987a} +(1 row) + +SELECT ts_lexize('xsyn', 'sn'); + ts_lexize +----------- + +(1 row) + +SELECT ts_lexize('xsyn', 'grb'); + ts_lexize +----------- + +(1 row) + +-- the same, but return only synonyms +ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=true, KEEPSYNONYMS=true, 
MATCHSYNONYMS=false); +SELECT ts_lexize('xsyn', 'supernova'); + ts_lexize +---------------- + {sn,sne,1987a} +(1 row) + +SELECT ts_lexize('xsyn', 'sn'); + ts_lexize +----------- + +(1 row) + +SELECT ts_lexize('xsyn', 'grb'); + ts_lexize +----------- + +(1 row) + +-- match any word and return all words +ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=true, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=true); +SELECT ts_lexize('xsyn', 'supernova'); + ts_lexize +-------------------------- + {supernova,sn,sne,1987a} +(1 row) + +SELECT ts_lexize('xsyn', 'sn'); + ts_lexize +-------------------------- + {supernova,sn,sne,1987a} +(1 row) + +SELECT ts_lexize('xsyn', 'grb'); + ts_lexize +----------- + +(1 row) + +-- match any word and return all words except first one +ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=true); +SELECT ts_lexize('xsyn', 'supernova'); + ts_lexize +---------------- + {sn,sne,1987a} +(1 row) + +SELECT ts_lexize('xsyn', 'sn'); ts_lexize ---------------- {sn,sne,1987a} @@ -20,3 +86,63 @@ (1 row) +-- match any synonym but not first word, and return first word instead +ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=true, MATCHORIG=false, KEEPSYNONYMS=false, MATCHSYNONYMS=true); +SELECT ts_lexize('xsyn', 'supernova'); + ts_lexize +----------- + +(1 row) + +SELECT ts_lexize('xsyn', 'sn'); + ts_lexize +------------- + {supernova} +(1 row) + +SELECT ts_lexize('xsyn', 'grb'); + ts_lexize +----------- + +(1 row) + +-- do not match or return anything +ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=false, KEEPSYNONYMS=false, MATCHSYNONYMS=false); +SELECT ts_lexize('xsyn', 'supernova'); + ts_lexize +----------- + +(1 row) + +SELECT ts_lexize('xsyn', 'sn'); + ts_lexize +----------- + +(1 row) + +SELECT ts_lexize('xsyn', 'grb'); + ts_lexize +----------- + +(1 row) + +-- match any word but return nothing +ALTER TEXT 
SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=true, KEEPSYNONYMS=false, MATCHSYNONYMS=true); +SELECT ts_lexize('xsyn', 'supernova'); + ts_lexize +----------- + {} +(1 row) + +SELECT ts_lexize('xsyn', 'sn'); + ts_lexize +----------- + {} +(1 row) + +SELECT ts_lexize('xsyn', 'grb'); + ts_lexize +----------- + +(1 row) + Index: contrib/dict_xsyn/sql/dict_xsyn.sql =================================================================== RCS file: /projects/cvsroot/pgsql/contrib/dict_xsyn/sql/dict_xsyn.sql,v retrieving revision 1.1 diff -u -r1.1 dict_xsyn.sql --- contrib/dict_xsyn/sql/dict_xsyn.sql 15 Oct 2007 21:36:50 -0000 1.1 +++ contrib/dict_xsyn/sql/dict_xsyn.sql 30 Jul 2009 07:37:54 -0000 @@ -8,9 +8,47 @@ \set ECHO all RESET client_min_messages; ---configuration -ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false); +-- default configuration - match first word and return it among with all synonyms +ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=true, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=false); --lexize SELECT ts_lexize('xsyn', 'supernova'); +SELECT ts_lexize('xsyn', 'sn'); SELECT ts_lexize('xsyn', 'grb'); + +-- the same, but return only synonyms +ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=false); +SELECT ts_lexize('xsyn', 'supernova'); +SELECT ts_lexize('xsyn', 'sn'); +SELECT ts_lexize('xsyn', 'grb'); + +-- match any word and return all words +ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=true, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=true); +SELECT ts_lexize('xsyn', 'supernova'); +SELECT ts_lexize('xsyn', 'sn'); +SELECT ts_lexize('xsyn', 'grb'); + +-- match any word and return all words except first one +ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=true); +SELECT ts_lexize('xsyn', 'supernova'); +SELECT 
ts_lexize('xsyn', 'sn'); +SELECT ts_lexize('xsyn', 'grb'); + +-- match any synonym but not first word, and return first word instead +ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=true, MATCHORIG=false, KEEPSYNONYMS=false, MATCHSYNONYMS=true); +SELECT ts_lexize('xsyn', 'supernova'); +SELECT ts_lexize('xsyn', 'sn'); +SELECT ts_lexize('xsyn', 'grb'); + +-- do not match or return anything +ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=false, KEEPSYNONYMS=false, MATCHSYNONYMS=false); +SELECT ts_lexize('xsyn', 'supernova'); +SELECT ts_lexize('xsyn', 'sn'); +SELECT ts_lexize('xsyn', 'grb'); + +-- match any word but return nothing +ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=true, KEEPSYNONYMS=false, MATCHSYNONYMS=true); +SELECT ts_lexize('xsyn', 'supernova'); +SELECT ts_lexize('xsyn', 'sn'); +SELECT ts_lexize('xsyn', 'grb'); + Index: doc/src/sgml/dict-xsyn.sgml =================================================================== RCS file: /projects/cvsroot/pgsql/doc/src/sgml/dict-xsyn.sgml,v retrieving revision 1.2 diff -u -r1.2 dict-xsyn.sgml --- doc/src/sgml/dict-xsyn.sgml 6 Dec 2007 04:12:10 -0000 1.2 +++ doc/src/sgml/dict-xsyn.sgml 30 Jul 2009 07:37:56 -0000 @@ -23,13 +23,32 @@ <itemizedlist> <listitem> <para> + <literal>matchorig</> controls whether the original word is accepted by + the dictionary. Default is <literal>true</>. + </para> + </listitem> + <listitem> + <para> <literal>keeporig</> controls whether the original word is included (if - <literal>true</>), or only its synonyms (if <literal>false</>). Default - is <literal>true</>. + <literal>true</>) in results, or only its synonyms (if + <literal>false</>). Default is <literal>true</>. + </para> + </listitem> + <listitem> + <para> + <literal>matchsynonyms</> controls whether any of the synonyms is accepted + by the dictionary (if <literal>true</>). Default is <literal>false</>. 
</para> </listitem> <listitem> <para> + <literal>keepsynonyms</> controls whether synonyms are returned by the + dictionary (if <literal>true</>). Default is <literal>true</>. + </para> + </listitem> + + <listitem> + <para> <literal>rules</> is the base name of the file containing the list of synonyms. This file must be stored in <filename>$SHAREDIR/tsearch_data/</> (where <literal>$SHAREDIR</> means @@ -90,7 +109,31 @@ mydb=# SELECT ts_lexize('xsyn', 'word'); ts_lexize ----------------------- + {syn1,syn2,syn3} + +mydb# ALTER TEXT SEARCH DICTIONARY xsyn (RULES='my_rules', KEEPORIG=true); +ALTER TEXT SEARCH DICTIONARY + +mydb=# SELECT ts_lexize('xsyn', 'word'); + ts_lexize +----------------------- {word,syn1,syn2,syn3} + +mydb# ALTER TEXT SEARCH DICTIONARY xsyn (RULES='my_rules', KEEPORIG=false, MATCHSYNONYMS=true); +ALTER TEXT SEARCH DICTIONARY + +mydb=# SELECT ts_lexize('xsyn', 'syn1'); + ts_lexize +----------------------- + {syn1,syn2,syn3} + +mydb# ALTER TEXT SEARCH DICTIONARY xsyn (RULES='my_rules', KEEPORIG=true, MATCHORIG=false, KEEPSYNONYMS=false); +ALTER TEXT SEARCH DICTIONARY + +mydb=# SELECT ts_lexize('xsyn', 'syn1'); + ts_lexize +----------------------- + {word} </programlisting> but real-world usage will involve including it in a text search
pgsql-hackers by date: