Re: improvements for dict_xsyn extended synonym dictionary - RRR - Mailing list pgsql-hackers

From karpov@sao.ru (Sergey V. Karpov)
Subject Re: improvements for dict_xsyn extended synonym dictionary - RRR
Date
Msg-id 8763dad34e.fsf@sao.ru
Whole thread Raw
In response to Re: improvements for dict_xsyn extended synonym dictionary - RRR  (Andres Freund <andres@anarazel.de>)
Responses Re: improvements for dict_xsyn extended synonym dictionary - RRR
List pgsql-hackers
Andres Freund <andres@anarazel.de> writes:

Hi Andres,

> Looks nice. The only small gripe I have is that the patch adds trailing
> whitespaces at a lot of places...
>
> Except maybe that I do see no need for changes anymore...

My fault. Please check the attached version of the patch - I've tried to fix
all of those.

Thanks,
Sergey

Index: contrib/dict_xsyn/dict_xsyn.c
===================================================================
RCS file: /projects/cvsroot/pgsql/contrib/dict_xsyn/dict_xsyn.c,v
retrieving revision 1.6
diff -u -r1.6 dict_xsyn.c
--- contrib/dict_xsyn/dict_xsyn.c    1 Jan 2009 17:23:32 -0000    1.6
+++ contrib/dict_xsyn/dict_xsyn.c    30 Jul 2009 07:37:54 -0000
@@ -26,6 +26,7 @@
     char       *key;            /* Word */
     char       *value;            /* Unparsed list of synonyms, including the
                                  * word itself */
+    int         pos;            /* Position of key word in original string */
 } Syn;

 typedef struct
@@ -33,7 +34,10 @@
     int            len;
     Syn           *syn;

+    bool        matchorig;
     bool        keeporig;
+    bool        matchsynonyms;
+    bool        keepsynonyms;
 } DictSyn;


@@ -88,6 +92,7 @@
     {
         char       *value;
         char       *key;
+        char       *pos;
         char       *end = NULL;

         if (*line == '\0')
@@ -96,26 +101,39 @@
         value = lowerstr(line);
         pfree(line);

-        key = find_word(value, &end);
-        if (!key)
-        {
-            pfree(value);
-            continue;
-        }
+        pos = value;

-        if (cur == d->len)
+        while((key = find_word(pos, &end)) != NULL)
         {
-            d->len = (d->len > 0) ? 2 * d->len : 16;
-            if (d->syn)
-                d->syn = (Syn *) repalloc(d->syn, sizeof(Syn) * d->len);
-            else
-                d->syn = (Syn *) palloc(sizeof(Syn) * d->len);
-        }
+            if (cur == d->len)
+            {
+                d->len = (d->len > 0) ? 2 * d->len : 16;
+                if (d->syn)
+                    d->syn = (Syn *) repalloc(d->syn, sizeof(Syn) * d->len);
+                else
+                    d->syn = (Syn *) palloc(sizeof(Syn) * d->len);
+            }
+
+            /* Read first word only if we will match it */
+            if (pos != value || d->matchorig)
+            {
+                d->syn[cur].key = pnstrdup(key, end - key);
+                d->syn[cur].value = pstrdup(value);
+                d->syn[cur].pos = key - value;

-        d->syn[cur].key = pnstrdup(key, end - key);
-        d->syn[cur].value = value;
+                cur++;
+            }
+
+            pos = end;
+
+            /* Don't read synonyms if we do not match them */
+            if (!d->matchsynonyms)
+            {
+                break;
+            }
+        }

-        cur++;
+        pfree(value);
     }

     tsearch_readline_end(&trst);
@@ -133,23 +151,40 @@
     List       *dictoptions = (List *) PG_GETARG_POINTER(0);
     DictSyn    *d;
     ListCell   *l;
+    char       *filename = NULL;

     d = (DictSyn *) palloc0(sizeof(DictSyn));
     d->len = 0;
     d->syn = NULL;
+    d->matchorig = true;
     d->keeporig = true;
+    d->matchsynonyms = false;
+    d->keepsynonyms = true;

     foreach(l, dictoptions)
     {
         DefElem    *defel = (DefElem *) lfirst(l);

-        if (pg_strcasecmp(defel->defname, "KEEPORIG") == 0)
+        if (pg_strcasecmp(defel->defname, "MATCHORIG") == 0)
+        {
+            d->matchorig = defGetBoolean(defel);
+        }
+        else if (pg_strcasecmp(defel->defname, "KEEPORIG") == 0)
         {
             d->keeporig = defGetBoolean(defel);
         }
+        else if (pg_strcasecmp(defel->defname, "MATCHSYNONYMS") == 0)
+        {
+            d->matchsynonyms = defGetBoolean(defel);
+        }
+        else if (pg_strcasecmp(defel->defname, "KEEPSYNONYMS") == 0)
+        {
+            d->keepsynonyms = defGetBoolean(defel);
+        }
         else if (pg_strcasecmp(defel->defname, "RULES") == 0)
         {
-            read_dictionary(d, defGetString(defel));
+            /* we can't read the rules before parsing all options! */
+            filename = pstrdup(defGetString(defel));
         }
         else
         {
@@ -160,6 +195,12 @@
         }
     }

+    if(filename)
+    {
+        read_dictionary(d, filename);
+        pfree(filename);
+    }
+
     PG_RETURN_POINTER(d);
 }

@@ -198,7 +239,6 @@
         int            value_length = strlen(value);
         char       *pos = value;
         int            nsyns = 0;
-        bool        is_first = true;

         res = palloc(0);

@@ -214,8 +254,8 @@
             res = repalloc(res, sizeof(TSLexeme) * (nsyns + 2));
             res[nsyns].lexeme = NULL;

-            /* first word is added to result only if KEEPORIG flag is set */
-            if (d->keeporig || !is_first)
+            /* The first word is added only if keeporig=true */
+            if (pos != value || d->keeporig)
             {
                 res[nsyns].lexeme = pstrdup(syn);
                 res[nsyns + 1].lexeme = NULL;
@@ -223,9 +263,12 @@
                 nsyns++;
             }

-            is_first = false;
-
             pos = end + 1;
+
+            if(!d->keepsynonyms)
+            {
+                break;
+            }
         }

         pfree(value);
Index: contrib/dict_xsyn/expected/dict_xsyn.out
===================================================================
RCS file: /projects/cvsroot/pgsql/contrib/dict_xsyn/expected/dict_xsyn.out,v
retrieving revision 1.1
diff -u -r1.1 dict_xsyn.out
--- contrib/dict_xsyn/expected/dict_xsyn.out    15 Oct 2007 21:36:50 -0000    1.1
+++ contrib/dict_xsyn/expected/dict_xsyn.out    30 Jul 2009 07:37:54 -0000
@@ -5,10 +5,76 @@
 SET client_min_messages = warning;
 \set ECHO none
 RESET client_min_messages;
---configuration
-ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false);
+-- default configuration - match first word and return it among with all synonyms
+ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=true, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=false);
 --lexize
 SELECT ts_lexize('xsyn', 'supernova');
+        ts_lexize
+--------------------------
+ {supernova,sn,sne,1987a}
+(1 row)
+
+SELECT ts_lexize('xsyn', 'sn');
+ ts_lexize
+-----------
+
+(1 row)
+
+SELECT ts_lexize('xsyn', 'grb');
+ ts_lexize
+-----------
+
+(1 row)
+
+-- the same, but return only synonyms
+ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=false);
+SELECT ts_lexize('xsyn', 'supernova');
+   ts_lexize
+----------------
+ {sn,sne,1987a}
+(1 row)
+
+SELECT ts_lexize('xsyn', 'sn');
+ ts_lexize
+-----------
+
+(1 row)
+
+SELECT ts_lexize('xsyn', 'grb');
+ ts_lexize
+-----------
+
+(1 row)
+
+-- match any word and return all words
+ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=true, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=true);
+SELECT ts_lexize('xsyn', 'supernova');
+        ts_lexize
+--------------------------
+ {supernova,sn,sne,1987a}
+(1 row)
+
+SELECT ts_lexize('xsyn', 'sn');
+        ts_lexize
+--------------------------
+ {supernova,sn,sne,1987a}
+(1 row)
+
+SELECT ts_lexize('xsyn', 'grb');
+ ts_lexize
+-----------
+
+(1 row)
+
+-- match any word and return all words except first one
+ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=true);
+SELECT ts_lexize('xsyn', 'supernova');
+   ts_lexize
+----------------
+ {sn,sne,1987a}
+(1 row)
+
+SELECT ts_lexize('xsyn', 'sn');
    ts_lexize
 ----------------
  {sn,sne,1987a}
@@ -20,3 +86,63 @@

 (1 row)

+-- match any synonym but not first word, and return first word instead
+ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=true, MATCHORIG=false, KEEPSYNONYMS=false, MATCHSYNONYMS=true);
+SELECT ts_lexize('xsyn', 'supernova');
+ ts_lexize
+-----------
+
+(1 row)
+
+SELECT ts_lexize('xsyn', 'sn');
+  ts_lexize
+-------------
+ {supernova}
+(1 row)
+
+SELECT ts_lexize('xsyn', 'grb');
+ ts_lexize
+-----------
+
+(1 row)
+
+-- do not match or return anything
+ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=false, KEEPSYNONYMS=false, MATCHSYNONYMS=false);
+SELECT ts_lexize('xsyn', 'supernova');
+ ts_lexize
+-----------
+
+(1 row)
+
+SELECT ts_lexize('xsyn', 'sn');
+ ts_lexize
+-----------
+
+(1 row)
+
+SELECT ts_lexize('xsyn', 'grb');
+ ts_lexize
+-----------
+
+(1 row)
+
+-- match any word but return nothing
+ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=true, KEEPSYNONYMS=false, MATCHSYNONYMS=true);
+SELECT ts_lexize('xsyn', 'supernova');
+ ts_lexize
+-----------
+ {}
+(1 row)
+
+SELECT ts_lexize('xsyn', 'sn');
+ ts_lexize
+-----------
+ {}
+(1 row)
+
+SELECT ts_lexize('xsyn', 'grb');
+ ts_lexize
+-----------
+
+(1 row)
+
Index: contrib/dict_xsyn/sql/dict_xsyn.sql
===================================================================
RCS file: /projects/cvsroot/pgsql/contrib/dict_xsyn/sql/dict_xsyn.sql,v
retrieving revision 1.1
diff -u -r1.1 dict_xsyn.sql
--- contrib/dict_xsyn/sql/dict_xsyn.sql    15 Oct 2007 21:36:50 -0000    1.1
+++ contrib/dict_xsyn/sql/dict_xsyn.sql    30 Jul 2009 07:37:54 -0000
@@ -8,9 +8,47 @@
 \set ECHO all
 RESET client_min_messages;

---configuration
-ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false);
+-- default configuration - match first word and return it among with all synonyms
+ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=true, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=false);

 --lexize
 SELECT ts_lexize('xsyn', 'supernova');
+SELECT ts_lexize('xsyn', 'sn');
 SELECT ts_lexize('xsyn', 'grb');
+
+-- the same, but return only synonyms
+ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=false);
+SELECT ts_lexize('xsyn', 'supernova');
+SELECT ts_lexize('xsyn', 'sn');
+SELECT ts_lexize('xsyn', 'grb');
+
+-- match any word and return all words
+ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=true, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=true);
+SELECT ts_lexize('xsyn', 'supernova');
+SELECT ts_lexize('xsyn', 'sn');
+SELECT ts_lexize('xsyn', 'grb');
+
+-- match any word and return all words except first one
+ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=true);
+SELECT ts_lexize('xsyn', 'supernova');
+SELECT ts_lexize('xsyn', 'sn');
+SELECT ts_lexize('xsyn', 'grb');
+
+-- match any synonym but not first word, and return first word instead
+ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=true, MATCHORIG=false, KEEPSYNONYMS=false, MATCHSYNONYMS=true);
+SELECT ts_lexize('xsyn', 'supernova');
+SELECT ts_lexize('xsyn', 'sn');
+SELECT ts_lexize('xsyn', 'grb');
+
+-- do not match or return anything
+ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=false, KEEPSYNONYMS=false, MATCHSYNONYMS=false);
+SELECT ts_lexize('xsyn', 'supernova');
+SELECT ts_lexize('xsyn', 'sn');
+SELECT ts_lexize('xsyn', 'grb');
+
+-- match any word but return nothing
+ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=true, KEEPSYNONYMS=false, MATCHSYNONYMS=true);
+SELECT ts_lexize('xsyn', 'supernova');
+SELECT ts_lexize('xsyn', 'sn');
+SELECT ts_lexize('xsyn', 'grb');
+
Index: doc/src/sgml/dict-xsyn.sgml
===================================================================
RCS file: /projects/cvsroot/pgsql/doc/src/sgml/dict-xsyn.sgml,v
retrieving revision 1.2
diff -u -r1.2 dict-xsyn.sgml
--- doc/src/sgml/dict-xsyn.sgml    6 Dec 2007 04:12:10 -0000    1.2
+++ doc/src/sgml/dict-xsyn.sgml    30 Jul 2009 07:37:56 -0000
@@ -23,13 +23,32 @@
   <itemizedlist>
    <listitem>
     <para>
+     <literal>matchorig</> controls whether the original word is accepted by
+     the dictionary. Default is <literal>true</>.
+    </para>
+   </listitem>
+   <listitem>
+    <para>
      <literal>keeporig</> controls whether the original word is included (if
-     <literal>true</>), or only its synonyms (if <literal>false</>). Default
-     is <literal>true</>.
+     <literal>true</>) in results, or only its synonyms (if
+     <literal>false</>). Default is <literal>true</>.
+    </para>
+   </listitem>
+   <listitem>
+    <para>
+     <literal>matchsynonyms</> controls whether any of the synonyms is accepted
+     by the dictionary (if <literal>true</>). Default is <literal>false</>.
     </para>
    </listitem>
    <listitem>
     <para>
+     <literal>keepsynonyms</> controls whether synonyms are returned by the
+     dictionary (if <literal>true</>). Default is <literal>true</>.
+    </para>
+   </listitem>
+
+   <listitem>
+    <para>
      <literal>rules</> is the base name of the file containing the list of
      synonyms.  This file must be stored in
      <filename>$SHAREDIR/tsearch_data/</> (where <literal>$SHAREDIR</> means
@@ -90,7 +109,31 @@
 mydb=# SELECT ts_lexize('xsyn', 'word');
       ts_lexize
 -----------------------
+ {syn1,syn2,syn3}
+
+mydb=# ALTER TEXT SEARCH DICTIONARY xsyn (RULES='my_rules', KEEPORIG=true);
+ALTER TEXT SEARCH DICTIONARY
+
+mydb=# SELECT ts_lexize('xsyn', 'word');
+      ts_lexize
+-----------------------
  {word,syn1,syn2,syn3}
+
+mydb=# ALTER TEXT SEARCH DICTIONARY xsyn (RULES='my_rules', KEEPORIG=false, MATCHSYNONYMS=true);
+ALTER TEXT SEARCH DICTIONARY
+
+mydb=# SELECT ts_lexize('xsyn', 'syn1');
+      ts_lexize
+-----------------------
+ {syn1,syn2,syn3}
+
+mydb=# ALTER TEXT SEARCH DICTIONARY xsyn (RULES='my_rules', KEEPORIG=true, MATCHORIG=false, KEEPSYNONYMS=false);
+ALTER TEXT SEARCH DICTIONARY
+
+mydb=# SELECT ts_lexize('xsyn', 'syn1');
+      ts_lexize
+-----------------------
+ {word}
 </programlisting>

    but real-world usage will involve including it in a text search

pgsql-hackers by date:

Previous
From: Dean Rasheed
Date:
Subject: Re: WIP: Deferrable unique constraints
Next
From: Peter Eisentraut
Date:
Subject: Re: [RFC] new digest datatypes, or generic fixed-len hex types?