Thread: Bunch of tsearch fixes and cleanup

Bunch of tsearch fixes and cleanup

From

"Heikki Linnakangas"

Date:

23 August 2007, 14:18:40

Fixes the following bugs:
- ispell initialization crashed on empty dictionary file
- ispell initialization crashed on affix file with prefixes but no suffixes
- stop words file was run through pg_verifymbstr, which checks against the
database encoding, but it's later interpreted as being UTF-8. Now verifies
that it's UTF-8, regardless of database encoding.

Other changes:

- readstoplist now sorts the stop words after loading them. Removed the
separate sortstoplist function.

- readstoplist calls recode_and_lowerstr directly, instead of using the
 "wordop" function pointer in StopList struct. All callers used
recode_and_lowerstr anyway, so this simplifies the code a little bit. Are
there any external dictionary implementations that would require
different behavior?

- bunch of comments added, typos fixed, and other cleanup

The code still needs lots of love, but it's a start...

--
  Heikki Linnakangas
  EnterpriseDB   http://www.enterprisedb.com
Index: src/backend/snowball/dict_snowball.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/snowball/dict_snowball.c,v
retrieving revision 1.2
diff -c -r1.2 dict_snowball.c
*** src/backend/snowball/dict_snowball.c    22 Aug 2007 01:39:44 -0000    1.2
--- src/backend/snowball/dict_snowball.c    23 Aug 2007 10:55:53 -0000
***************
*** 192,198 ****
      ListCell   *l;

      d = (DictSnowball *) palloc0(sizeof(DictSnowball));
-     d->stoplist.wordop = recode_and_lowerstr;

      foreach(l, dictoptions)
      {
--- 192,197 ----
***************
*** 205,211 ****
                          (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                           errmsg("multiple StopWords parameters")));
              readstoplist(defGetString(defel), &d->stoplist);
-             sortstoplist(&d->stoplist);
              stoploaded = true;
          }
          else if (pg_strcasecmp("Language", defel->defname) == 0)
--- 204,209 ----
Index: src/backend/tsearch/dict_ispell.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/tsearch/dict_ispell.c,v
retrieving revision 1.2
diff -c -r1.2 dict_ispell.c
*** src/backend/tsearch/dict_ispell.c    22 Aug 2007 01:39:44 -0000    1.2
--- src/backend/tsearch/dict_ispell.c    23 Aug 2007 10:57:12 -0000
***************
*** 39,45 ****
      ListCell   *l;

      d = (DictISpell *) palloc0(sizeof(DictISpell));
-     d->stoplist.wordop = recode_and_lowerstr;

      foreach(l, dictoptions)
      {
--- 39,44 ----
***************
*** 74,80 ****
                          (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                           errmsg("multiple StopWords parameters")));
              readstoplist(defGetString(defel), &(d->stoplist));
-             sortstoplist(&(d->stoplist));
              stoploaded = true;
          }
          else
--- 73,78 ----
Index: src/backend/tsearch/dict_simple.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/tsearch/dict_simple.c,v
retrieving revision 1.2
diff -c -r1.2 dict_simple.c
*** src/backend/tsearch/dict_simple.c    22 Aug 2007 01:39:44 -0000    1.2
--- src/backend/tsearch/dict_simple.c    23 Aug 2007 11:15:55 -0000
***************
*** 23,41 ****
  typedef struct
  {
      StopList    stoplist;
! } DictExample;


  Datum
  dsimple_init(PG_FUNCTION_ARGS)
  {
      List       *dictoptions = (List *) PG_GETARG_POINTER(0);
!     DictExample *d = (DictExample *) palloc0(sizeof(DictExample));
      bool        stoploaded = false;
      ListCell   *l;

-     d->stoplist.wordop = recode_and_lowerstr;
-
      foreach(l, dictoptions)
      {
          DefElem    *defel = (DefElem *) lfirst(l);
--- 23,39 ----
  typedef struct
  {
      StopList    stoplist;
! } DictSimple;


  Datum
  dsimple_init(PG_FUNCTION_ARGS)
  {
      List       *dictoptions = (List *) PG_GETARG_POINTER(0);
!     DictSimple *d = (DictSimple *) palloc0(sizeof(DictSimple));
      bool        stoploaded = false;
      ListCell   *l;

      foreach(l, dictoptions)
      {
          DefElem    *defel = (DefElem *) lfirst(l);
***************
*** 47,53 ****
                          (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                           errmsg("multiple StopWords parameters")));
              readstoplist(defGetString(defel), &d->stoplist);
-             sortstoplist(&d->stoplist);
              stoploaded = true;
          }
          else
--- 45,50 ----
***************
*** 65,80 ****
  Datum
  dsimple_lexize(PG_FUNCTION_ARGS)
  {
!     DictExample *d = (DictExample *) PG_GETARG_POINTER(0);
      char       *in = (char *) PG_GETARG_POINTER(1);
      int32       len = PG_GETARG_INT32(2);
!     char       *txt = lowerstr_with_len(in, len);
      TSLexeme   *res = palloc0(sizeof(TSLexeme) * 2);

      if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
-     {
          pfree(txt);
-     }
      else
          res[0].lexeme = txt;

--- 62,77 ----
  Datum
  dsimple_lexize(PG_FUNCTION_ARGS)
  {
!     DictSimple *d = (DictSimple *) PG_GETARG_POINTER(0);
      char       *in = (char *) PG_GETARG_POINTER(1);
      int32       len = PG_GETARG_INT32(2);
!     char       *txt;
      TSLexeme   *res = palloc0(sizeof(TSLexeme) * 2);

+     txt = lowerstr_with_len(in, len);
+
      if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
          pfree(txt);
      else
          res[0].lexeme = txt;

Index: src/backend/tsearch/dict_synonym.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/tsearch/dict_synonym.c,v
retrieving revision 1.2
diff -c -r1.2 dict_synonym.c
*** src/backend/tsearch/dict_synonym.c    22 Aug 2007 04:13:15 -0000    1.2
--- src/backend/tsearch/dict_synonym.c    23 Aug 2007 13:09:47 -0000
***************
*** 31,40 ****

  typedef struct
  {
!     int            len;
      Syn           *syn;
  } DictSyn;

  static char *
  findwrd(char *in, char **end)
  {
--- 31,45 ----

  typedef struct
  {
!     int            len;    /* length of syn array */
      Syn           *syn;
  } DictSyn;

+ /*
+  * Finds the next whitespace-delimited word within the 'in' string.
+  * Returns a pointer to the first character of the word, and a pointer
+  * to the next byte after the last character in the word in *end.
+  */
  static char *
  findwrd(char *in, char **end)
  {
***************
*** 137,149 ****

          d->syn[cur].in = recode_and_lowerstr(starti);
          d->syn[cur].out = recode_and_lowerstr(starto);
-         if (!(d->syn[cur].in && d->syn[cur].out))
-         {
-             FreeFile(fin);
-             ereport(ERROR,
-                     (errcode(ERRCODE_OUT_OF_MEMORY),
-                      errmsg("out of memory")));
-         }

          cur++;
      }
--- 142,147 ----
***************
*** 151,158 ****
      FreeFile(fin);

      d->len = cur;
!     if (cur > 1)
!         qsort(d->syn, d->len, sizeof(Syn), compareSyn);

      PG_RETURN_POINTER(d);
  }
--- 149,155 ----
      FreeFile(fin);

      d->len = cur;
!     qsort(d->syn, d->len, sizeof(Syn), compareSyn);

      PG_RETURN_POINTER(d);
  }
***************
*** 179,186 ****
      if (!found)
          PG_RETURN_POINTER(NULL);

!     res = palloc(sizeof(TSLexeme) * 2);
!     memset(res, 0, sizeof(TSLexeme) * 2);
      res[0].lexeme = pstrdup(found->out);

      PG_RETURN_POINTER(res);
--- 176,182 ----
      if (!found)
          PG_RETURN_POINTER(NULL);

!     res = palloc0(sizeof(TSLexeme) * 2);
      res[0].lexeme = pstrdup(found->out);

      PG_RETURN_POINTER(res);
Index: src/backend/tsearch/spell.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/tsearch/spell.c,v
retrieving revision 1.1
diff -c -r1.1 spell.c
*** src/backend/tsearch/spell.c    21 Aug 2007 01:11:18 -0000    1.1
--- src/backend/tsearch/spell.c    23 Aug 2007 14:04:45 -0000
***************
*** 21,28 ****


  /*
!  * during initialization dictionary requires a lot
!  * of memory, so it will use temporary context
   */
  static MemoryContext tmpCtx = NULL;

--- 21,31 ----


  /*
!  * Initialization requires a lot of memory that's not needed
!  * after the initialization is done.  In init function,
!  * CurrentMemoryContext is a long lived memory context associated
!  * with the dictionary cache entry, so we use a temporary context
!  * for the short-lived stuff.
   */
  static MemoryContext tmpCtx = NULL;

***************
*** 32,37 ****
--- 35,43 ----
  static void
  checkTmpCtx(void)
  {
+     /* XXX: This assumes that CurrentMemoryContext doesn't have
+      * any children other than the one we create here.
+      */
      if (CurrentMemoryContext->firstchild == NULL)
      {
          tmpCtx = AllocSetContextCreate(CurrentMemoryContext,
***************
*** 78,93 ****
  }

  static char *
- strnduplicate(char *s, int len)
- {
-     char       *d = (char *) palloc(len + 1);
-
-     memcpy(d, s, len);
-     d[len] = '\0';
-     return d;
- }
-
- static char *
  findchar(char *str, int c)
  {
      while (*str)
--- 84,89 ----
***************
*** 185,191 ****
      }
      Conf->Spell[Conf->nspell] = (SPELL *) tmpalloc(SPELLHDRSZ + strlen(word) + 1);
      strcpy(Conf->Spell[Conf->nspell]->word, word);
!     strncpy(Conf->Spell[Conf->nspell]->p.flag, flag, 16);
      Conf->nspell++;
  }

--- 181,187 ----
      }
      Conf->Spell[Conf->nspell] = (SPELL *) tmpalloc(SPELLHDRSZ + strlen(word) + 1);
      strcpy(Conf->Spell[Conf->nspell]->word, word);
!     strncpy(Conf->Spell[Conf->nspell]->p.flag, flag, MAXFLAGLEN);
      Conf->nspell++;
  }

***************
*** 733,745 ****
      char        find[BUFSIZ];
      char        repl[BUFSIZ];
      char       *s;
!     int            suffixes = 0;
!     int            prefixes = 0;
      int            flag = 0;
      char        flagflags = 0;
      FILE       *affix;
      int            line = 0;
!     int            oldformat = 0;

      checkTmpCtx();

--- 729,741 ----
      char        find[BUFSIZ];
      char        repl[BUFSIZ];
      char       *s;
!     bool        suffixes = false;
!     bool        prefixes = false;
      int            flag = 0;
      char        flagflags = 0;
      FILE       *affix;
      int            line = 0;
!     bool        oldformat = false;

      checkTmpCtx();

***************
*** 777,798 ****
                      Conf->flagval[(unsigned int) *s] = FF_COMPOUNDFLAG;
                      Conf->usecompound = true;
                  }
!                 oldformat++;
                  continue;
              }
          }
          if (STRNCMP(pstr, "suffixes") == 0)
          {
!             suffixes = 1;
!             prefixes = 0;
!             oldformat++;
              continue;
          }
          if (STRNCMP(pstr, "prefixes") == 0)
          {
!             suffixes = 0;
!             prefixes = 1;
!             oldformat++;
              continue;
          }
          if (STRNCMP(pstr, "flag") == 0)
--- 773,794 ----
                      Conf->flagval[(unsigned int) *s] = FF_COMPOUNDFLAG;
                      Conf->usecompound = true;
                  }
!                 oldformat = true;
                  continue;
              }
          }
          if (STRNCMP(pstr, "suffixes") == 0)
          {
!             suffixes = true;
!             prefixes = false;
!             oldformat = true;
              continue;
          }
          if (STRNCMP(pstr, "prefixes") == 0)
          {
!             suffixes = false;
!             prefixes = true;
!             oldformat = true;
              continue;
          }
          if (STRNCMP(pstr, "flag") == 0)
***************
*** 802,808 ****

              while (*s && t_isspace(s))
                  s++;
!             oldformat++;

              /* allow only single-encoded flags */
              if (pg_mblen(s) != 1)
--- 798,804 ----

              while (*s && t_isspace(s))
                  s++;
!             oldformat = true;

              /* allow only single-encoded flags */
              if (pg_mblen(s) != 1)
***************
*** 978,1012 ****
  void
  NISortDictionary(IspellDict * Conf)
  {
!     size_t        i;
!     int            naffix = 3;

      checkTmpCtx();

      /* compress affixes */
      qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *), cmpspellaffix);
!     for (i = 1; i < Conf->nspell; i++)
!         if (strcmp(Conf->Spell[i]->p.flag, Conf->Spell[i - 1]->p.flag))
              naffix++;

      Conf->AffixData = (char **) palloc0(naffix * sizeof(char *));
!     naffix = 1;
!     Conf->AffixData[0] = pstrdup("");
!     Conf->AffixData[1] = pstrdup(Conf->Spell[0]->p.flag);
!     Conf->Spell[0]->p.d.affix = 1;
!     Conf->Spell[0]->p.d.len = strlen(Conf->Spell[0]->word);
!     for (i = 1; i < Conf->nspell; i++)
      {
!         if (strcmp(Conf->Spell[i]->p.flag, Conf->AffixData[naffix]))
          {
!             naffix++;
!             Conf->AffixData[naffix] = pstrdup(Conf->Spell[i]->p.flag);
          }
!         Conf->Spell[i]->p.d.affix = naffix;
          Conf->Spell[i]->p.d.len = strlen(Conf->Spell[i]->word);
      }

      Conf->lenAffixData = Conf->nAffixData = naffix;
      qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *), cmpspell);
      Conf->Dictionary = mkSPNode(Conf, 0, Conf->nspell, 0);

--- 974,1021 ----
  void
  NISortDictionary(IspellDict * Conf)
  {
!     int    i;
!     int    naffix = 0;
!     int    curaffix;

      checkTmpCtx();

      /* compress affixes */
+
+     /* Count the number of different flags used in the dictionary */
+
      qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *), cmpspellaffix);
!
!     naffix = 0;
!     for (i = 0; i < Conf->nspell; i++)
!     {
!         if (i == 0 || strncmp(Conf->Spell[i]->p.flag, Conf->Spell[i - 1]->p.flag, MAXFLAGLEN))
              naffix++;
+     }

+     /*
+      * Fill in Conf->AffixData with the affixes that were used
+      * in the dictionary. Replace textual flag-field of Conf->Spell
+      * entries with indexes into Conf->AffixData array.
+      */
      Conf->AffixData = (char **) palloc0(naffix * sizeof(char *));
!
!     curaffix = -1;
!     for (i = 0; i < Conf->nspell; i++)
      {
!         if (i == 0 || strncmp(Conf->Spell[i]->p.flag, Conf->AffixData[curaffix], MAXFLAGLEN))
          {
!             curaffix++;
!             Assert(curaffix < naffix);
!             Conf->AffixData[curaffix] = pstrdup(Conf->Spell[i]->p.flag);
          }
!
!         Conf->Spell[i]->p.d.affix = curaffix;
          Conf->Spell[i]->p.d.len = strlen(Conf->Spell[i]->word);
      }

      Conf->lenAffixData = Conf->nAffixData = naffix;
+
      qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *), cmpspell);
      Conf->Dictionary = mkSPNode(Conf, 0, Conf->nspell, 0);

***************
*** 1085,1091 ****
  }

  static void
! mkVoidAffix(IspellDict * Conf, int issuffix, int startsuffix)
  {
      int            i,
                  cnt = 0;
--- 1094,1100 ----
  }

  static void
! mkVoidAffix(IspellDict * Conf, bool issuffix, int startsuffix)
  {
      int            i,
                  cnt = 0;
***************
*** 1145,1151 ****
      AFFIX       *Affix;
      size_t        i;
      CMPDAffix  *ptr;
!     int            firstsuffix = -1;

      checkTmpCtx();

--- 1154,1160 ----
      AFFIX       *Affix;
      size_t        i;
      CMPDAffix  *ptr;
!     int            firstsuffix = Conf->naffixes;

      checkTmpCtx();

***************
*** 1160,1166 ****
      for (i = 0; i < Conf->naffixes; i++)
      {
          Affix = &(((AFFIX *) Conf->Affix)[i]);
!         if (Affix->type == FF_SUFFIX && firstsuffix < 0)
              firstsuffix = i;

          if ((Affix->flagflags & FF_COMPOUNDFLAG) && Affix->replen > 0 &&
--- 1169,1175 ----
      for (i = 0; i < Conf->naffixes; i++)
      {
          Affix = &(((AFFIX *) Conf->Affix)[i]);
!         if (Affix->type == FF_SUFFIX && i < firstsuffix)
              firstsuffix = i;

          if ((Affix->flagflags & FF_COMPOUNDFLAG) && Affix->replen > 0 &&
***************
*** 1185,1196 ****

      Conf->Prefix = mkANode(Conf, 0, firstsuffix, 0, FF_PREFIX);
      Conf->Suffix = mkANode(Conf, firstsuffix, Conf->naffixes, 0, FF_SUFFIX);
!     mkVoidAffix(Conf, 1, firstsuffix);
!     mkVoidAffix(Conf, 0, firstsuffix);
  }

  static AffixNodeData *
! FinfAffixes(AffixNode * node, const char *word, int wrdlen, int *level, int type)
  {
      AffixNodeData *StopLow,
                 *StopHigh,
--- 1194,1205 ----

      Conf->Prefix = mkANode(Conf, 0, firstsuffix, 0, FF_PREFIX);
      Conf->Suffix = mkANode(Conf, firstsuffix, Conf->naffixes, 0, FF_SUFFIX);
!     mkVoidAffix(Conf, true, firstsuffix);
!     mkVoidAffix(Conf, false, firstsuffix);
  }

  static AffixNodeData *
! FindAffixes(AffixNode * node, const char *word, int wrdlen, int *level, int type)
  {
      AffixNodeData *StopLow,
                 *StopHigh,
***************
*** 1374,1380 ****
      plevel = 0;
      while (pnode)
      {
!         prefix = FinfAffixes(pnode, word, wrdlen, &plevel, FF_PREFIX);
          if (!prefix)
              break;
          for (j = 0; j < prefix->naff; j++)
--- 1383,1389 ----
      plevel = 0;
      while (pnode)
      {
!         prefix = FindAffixes(pnode, word, wrdlen, &plevel, FF_PREFIX);
          if (!prefix)
              break;
          for (j = 0; j < prefix->naff; j++)
***************
*** 1398,1404 ****
          int            baselen = 0;

          /* find possible suffix */
!         suffix = FinfAffixes(snode, word, wrdlen, &slevel, FF_SUFFIX);
          if (!suffix)
              break;
          /* foreach suffix check affix */
--- 1407,1413 ----
          int            baselen = 0;

          /* find possible suffix */
!         suffix = FindAffixes(snode, word, wrdlen, &slevel, FF_SUFFIX);
          if (!suffix)
              break;
          /* foreach suffix check affix */
***************
*** 1416,1422 ****
                  swrdlen = strlen(newword);
                  while (pnode)
                  {
!                     prefix = FinfAffixes(pnode, newword, swrdlen, &plevel, FF_PREFIX);
                      if (!prefix)
                          break;
                      for (j = 0; j < prefix->naff; j++)
--- 1425,1431 ----
                  swrdlen = strlen(newword);
                  while (pnode)
                  {
!                     prefix = FindAffixes(pnode, newword, swrdlen, &plevel, FF_PREFIX);
                      if (!prefix)
                          break;
                      for (j = 0; j < prefix->naff; j++)
***************
*** 1626,1632 ****
                      if (wordlen == level + 1)
                      {
                          /* well, it was last word */
!                         var->stem[var->nstem] = strnduplicate(word + startpos, wordlen - startpos);
                          var->nstem++;
                          pfree(notprobed);
                          return var;
--- 1635,1641 ----
                      if (wordlen == level + 1)
                      {
                          /* well, it was last word */
!                         var->stem[var->nstem] = pnstrdup(word + startpos, wordlen - startpos);
                          var->nstem++;
                          pfree(notprobed);
                          return var;
***************
*** 1641,1647 ****
                          ptr->next = SplitToVariants(Conf, node, var, word, wordlen, startpos, level);
                          /* we can find next word */
                          level++;
!                         var->stem[var->nstem] = strnduplicate(word + startpos, level - startpos);
                          var->nstem++;
                          node = Conf->Dictionary;
                          startpos = level;
--- 1650,1656 ----
                          ptr->next = SplitToVariants(Conf, node, var, word, wordlen, startpos, level);
                          /* we can find next word */
                          level++;
!                         var->stem[var->nstem] = pnstrdup(word + startpos, level - startpos);
                          var->nstem++;
                          node = Conf->Dictionary;
                          startpos = level;
***************
*** 1656,1662 ****
          level++;
      }

!     var->stem[var->nstem] = strnduplicate(word + startpos, wordlen - startpos);
      var->nstem++;
      pfree(notprobed);
      return var;
--- 1665,1671 ----
          level++;
      }

!     var->stem[var->nstem] = pnstrdup(word + startpos, wordlen - startpos);
      var->nstem++;
      pfree(notprobed);
      return var;
Index: src/backend/tsearch/ts_parse.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/tsearch/ts_parse.c,v
retrieving revision 1.1
diff -c -r1.1 ts_parse.c
*** src/backend/tsearch/ts_parse.c    21 Aug 2007 01:11:18 -0000    1.1
--- src/backend/tsearch/ts_parse.c    23 Aug 2007 12:29:51 -0000
***************
*** 308,314 ****
              {
                  /*
                   * Dictionary normalizes lexemes, so we remove from stack all
!                  * used lexemes , return to basic mode and redo end of stack
                   * (if it exists)
                   */
                  if (res)
--- 308,314 ----
              {
                  /*
                   * Dictionary normalizes lexemes, so we remove from stack all
!                  * used lexemes, return to basic mode and redo end of stack
                   * (if it exists)
                   */
                  if (res)
***************
*** 571,577 ****
  }

  text *
! generatHeadline(HeadlineText * prs)
  {
      text       *out;
      int            len = 128;
--- 571,577 ----
  }

  text *
! generateHeadline(HeadlineText * prs)
  {
      text       *out;
      int            len = 128;
Index: src/backend/tsearch/ts_utils.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/tsearch/ts_utils.c,v
retrieving revision 1.2
diff -c -r1.2 ts_utils.c
*** src/backend/tsearch/ts_utils.c    22 Aug 2007 01:39:44 -0000    1.2
--- src/backend/tsearch/ts_utils.c    23 Aug 2007 12:51:06 -0000
***************
*** 63,70 ****
--- 63,82 ----
      return result;
  }

+ static int
+ comparestr(const void *a, const void *b)
+ {
+     return strcmp(*(char **) a, *(char **) b);
+ }
+
  #define STOPBUFLEN    4096

+ /*
+  * Reads a stopword file.
+  *
+  * The file must be in UTF-8 encoding, it will be converted to database
+  * encoding.
+  */
  void
  readstoplist(char *in, StopList * s)
  {
***************
*** 97,108 ****
              if (*buf == '\0')
                  continue;

!             if (!pg_verifymbstr(buf, strlen(buf), true))
              {
                  FreeFile(hin);
                  ereport(ERROR,
                          (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
!                          errmsg("invalid multibyte encoding at line %d in file \"%s\"",
                                  line, filename)));
              }

--- 109,120 ----
              if (*buf == '\0')
                  continue;

!             if (!pg_verify_mbstr(PG_UTF8, buf, strlen(buf), true))
              {
                  FreeFile(hin);
                  ereport(ERROR,
                          (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
!                          errmsg("invalid UTF-8 encoding at line %d in file \"%s\"",
                                  line, filename)));
              }

***************
*** 120,130 ****
                  }
              }

!
!             if (s->wordop)
!                 stop[s->len] = s->wordop(buf);
!             else
!                 stop[s->len] = pstrdup(buf);

              (s->len)++;
          }
--- 132,138 ----
                  }
              }

!             stop[s->len] = recode_and_lowerstr(buf);

              (s->len)++;
          }
***************
*** 133,149 ****
      }

      s->stop = stop;
- }

! static int
! comparestr(const void *a, const void *b)
! {
!     return strcmp(*(char **) a, *(char **) b);
! }
!
! void
! sortstoplist(StopList * s)
! {
      if (s->stop && s->len > 0)
          qsort(s->stop, s->len, sizeof(char *), comparestr);
  }
--- 141,148 ----
      }

      s->stop = stop;

!     /* Sort to allow binary searching */
      if (s->stop && s->len > 0)
          qsort(s->stop, s->len, sizeof(char *), comparestr);
  }
Index: src/backend/tsearch/wparser.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/tsearch/wparser.c,v
retrieving revision 1.2
diff -c -r1.2 wparser.c
*** src/backend/tsearch/wparser.c    22 Aug 2007 01:39:45 -0000    1.2
--- src/backend/tsearch/wparser.c    23 Aug 2007 12:29:59 -0000
***************
*** 325,331 ****
                    PointerGetDatum(prsoptions),
                    PointerGetDatum(query));

!     out = generatHeadline(&prs);

      PG_FREE_IF_COPY(in, 1);
      PG_FREE_IF_COPY(query, 2);
--- 325,331 ----
                    PointerGetDatum(prsoptions),
                    PointerGetDatum(query));

!     out = generateHeadline(&prs);

      PG_FREE_IF_COPY(in, 1);
      PG_FREE_IF_COPY(query, 2);
Index: src/include/tsearch/ts_public.h
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/tsearch/ts_public.h,v
retrieving revision 1.2
diff -c -r1.2 ts_public.h
*** src/include/tsearch/ts_public.h    22 Aug 2007 01:39:46 -0000    1.2
--- src/include/tsearch/ts_public.h    23 Aug 2007 10:54:50 -0000
***************
*** 71,80 ****
  {
      int            len;
      char      **stop;
-     char       *(*wordop) (char *);
  } StopList;

- extern void sortstoplist(StopList * s);
  extern void readstoplist(char *in, StopList * s);
  extern bool searchstoplist(StopList * s, char *key);

--- 71,78 ----
Index: src/include/tsearch/ts_utils.h
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/tsearch/ts_utils.h,v
retrieving revision 1.1
diff -c -r1.1 ts_utils.h
*** src/include/tsearch/ts_utils.h    21 Aug 2007 01:11:29 -0000    1.1
--- src/include/tsearch/ts_utils.h    23 Aug 2007 12:30:32 -0000
***************
*** 102,108 ****
   * headline framework, flow in common to generate:
   *    1 parse text with hlparsetext
   *    2 parser-specific function to find part
!  *    3 generatHeadline to generate result text
   */

  typedef struct
--- 102,108 ----
   * headline framework, flow in common to generate:
   *    1 parse text with hlparsetext
   *    2 parser-specific function to find part
!  *    3 generateHeadline to generate result text
   */

  typedef struct
***************
*** 131,137 ****

  extern void hlparsetext(Oid cfgId, HeadlineText * prs, TSQuery query,
              char *buf, int4 buflen);
! extern text *generatHeadline(HeadlineText * prs);

  /*
   * token/node types for parsing
--- 131,137 ----

  extern void hlparsetext(Oid cfgId, HeadlineText * prs, TSQuery query,
              char *buf, int4 buflen);
! extern text *generateHeadline(HeadlineText * prs);

  /*
   * token/node types for parsing
Index: src/include/tsearch/dicts/spell.h
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/tsearch/dicts/spell.h,v
retrieving revision 1.1
diff -c -r1.1 spell.h
*** src/include/tsearch/dicts/spell.h    21 Aug 2007 01:11:29 -0000    1.1
--- src/include/tsearch/dicts/spell.h    23 Aug 2007 14:02:52 -0000
***************
*** 18,23 ****
--- 18,29 ----
  #include "tsearch/dicts/regis.h"
  #include "tsearch/ts_public.h"

+ /*
+  * Max length of a flag name. Names longer than this will be truncated
+  * to the maximum.
+  */
+ #define MAXFLAGLEN 16
+
  struct SPNode;

  typedef struct
***************
*** 54,67 ****
  {
      union
      {
!         char        flag[16];
          struct
          {
              int            affix;
              int            len;
          }            d;
      }            p;
!     char        word[1];
  } SPELL;

  #define SPELLHDRSZ    (offsetof(SPELL, word))
--- 60,76 ----
  {
      union
      {
!         /* flag is filled in by NIImportDictionary, and after NISortDictionary,
!          * d is used and flag is invalid.
!          */
!         char        flag[MAXFLAGLEN];
          struct
          {
              int            affix;
              int            len;
          }            d;
      }            p;
!     char        word[1]; /* variable length, null-terminated */
  } SPELL;

  #define SPELLHDRSZ    (offsetof(SPELL, word))
***************
*** 90,95 ****
--- 99,109 ----
  #define FF_COMPOUNDPERMITFLAG    0x10
  #define FF_COMPOUNDFORBIDFLAG    0x20
  #define FF_CROSSPRODUCT            0x40
+
+ /*
+  * don't change the ordering of these because it's
+  * taken advantage of in initialization
+  */
  #define FF_SUFFIX                1
  #define FF_PREFIX                0

***************
*** 126,134 ****
      int            naffixes;
      AFFIX       *Affix;

!     int            nspell;
!     int            mspell;
      SPELL      **Spell;

      AffixNode  *Suffix;
      AffixNode  *Prefix;
--- 140,150 ----
      int            naffixes;
      AFFIX       *Affix;

!     /* Temporary array of all words in the dict file. Only used during
!      * initialization */
      SPELL      **Spell;
+     int            nspell; /* number of entries in Spell-array */
+     int            mspell; /* allocated length of Spell-array */

      AffixNode  *Suffix;
      AffixNode  *Prefix;

Re: Bunch of tsearch fixes and cleanup

From

Tom Lane

Date:

23 August 2007, 14:49:34

"Heikki Linnakangas" <heikki@enterprisedb.com> writes:
> - readstopwords calls recode_and_lowerstr directly, instead of using the
>  "wordop" function pointer in StopList struct. All callers used
> recode_and_lowerstr anyway, so this simplifies the code a little bit. Is
> there any external dictionary implementations that would require
> different behavior?

I don't think eliminating wordop altogether is such a hot idea; some
dictionary could possibly want to do different processing than that.

Something that was annoying me yesterday was that it was not clear
whether we had fixed every single place that uses a tsearch config file
to assume that the file is in UTF8 and should be converted to database
encoding.  So I was thinking of hardwiring the "recode" part into
readstopwords, and using wordop just for the "lowercase" part, which
seemed to me like a saner division of labor.  That is, UTF8 is a policy
that we want to enforce globally, but lowercasing maybe not, and this
still leaves the door open for more processing besides lowercasing.

Oleg, Teodor, what do you think about this?

            regards, tom lane

Re: Bunch of tsearch fixes and cleanup

From

"Heikki Linnakangas"

Date:

23 August 2007, 14:57:36

Tom Lane wrote:
> "Heikki Linnakangas" <heikki@enterprisedb.com> writes:
>> - readstopwords calls recode_and_lowerstr directly, instead of using the
>>  "wordop" function pointer in StopList struct. All callers used
>> recode_and_lowerstr anyway, so this simplifies the code a little bit. Is
>> there any external dictionary implementations that would require
>> different behavior?
>
> I don't think eliminating wordop altogether is such a hot idea; some
> dictionary could possibly want to do different processing than that.

Ok.

> Something that was annoying me yesterday was that it was not clear
> whether we had fixed every single place that uses a tsearch config file
> to assume that the file is in UTF8 and should be converted to database
> encoding.

I'm afraid there are still a lot of inconsistencies in that. I'm just
looking at dict_synonym, and it looks like it has the same problem I
patched in readstopwords; it's using pg_verifymbstr, with database
encoding, to verify the input file. It also seems to be calling
pg_mblen, which depends on database encoding, against UTF-8 encoded
strings. I'll look at those more closely..

--
  Heikki Linnakangas
  EnterpriseDB   http://www.enterprisedb.com

Re: Bunch of tsearch fixes and cleanup

From

Oleg Bartunov

Date:

23 August 2007, 17:54:55

On Thu, 23 Aug 2007, Tom Lane wrote:

> "Heikki Linnakangas" <heikki@enterprisedb.com> writes:
>> - readstopwords calls recode_and_lowerstr directly, instead of using the
>>  "wordop" function pointer in StopList struct. All callers used
>> recode_and_lowerstr anyway, so this simplifies the code a little bit. Is
>> there any external dictionary implementations that would require
>> different behavior?
>
> I don't think eliminating wordop altogether is such a hot idea; some
> dictionary could possibly want to do different processing than that.
>
> Something that was annoying me yesterday was that it was not clear
> whether we had fixed every single place that uses a tsearch config file
> to assume that the file is in UTF8 and should be converted to database
> encoding.  So I was thinking of hardwiring the "recode" part into
> readstopwords, and using wordop just for the "lowercase" part, which
> seemed to me like a saner division of labor.  That is, UTF8 is a policy
> that we want to enforce globally, but lowercasing maybe not, and this
> still leaves the door open for more processing besides lowercasing.
>
> Oleg, Teodor, what do you think about this?
>

I agree with utf-8 recoding and please, don't lowercase. Dictionaries
are very different.

>             regards, tom lane
>

     Regards,
         Oleg
_____________________________________________________________
Oleg Bartunov, Research Scientist, Head of AstroNet (www.astronet.ru),
Sternberg Astronomical Institute, Moscow University, Russia
Internet: oleg@sai.msu.su, http://www.sai.msu.su/~megera/
phone: +007(495)939-16-83, +007(495)939-23-83

Re: Bunch of tsearch fixes and cleanup

From

"Heikki Linnakangas"

Date:

23 August 2007, 20:30:42

Tom Lane wrote:
> Something that was annoying me yesterday was that it was not clear
> whether we had fixed every single place that uses a tsearch config file
> to assume that the file is in UTF8 and should be converted to database
> encoding.  So I was thinking of hardwiring the "recode" part into
> readstopwords, and using wordop just for the "lowercase" part, which
> seemed to me like a saner division of labor.  That is, UTF8 is a policy
> that we want to enforce globally, but lowercasing maybe not, and this
> still leaves the door open for more processing besides lowercasing.

I think we also want to always run input files through pg_verify_mbstr.
We do it for stopwords, and synonym files (though incorrectly), but not
for thesaurus files or ispell files. It's probably best to do that
within the recode-function as well.

--
  Heikki Linnakangas
  EnterpriseDB   http://www.enterprisedb.com

Re: Bunch of tsearch fixes and cleanup

From

"Heikki Linnakangas"

Date:

24 August 2007, 11:40:29

Heikki Linnakangas wrote:
> Tom Lane wrote:
>> Something that was annoying me yesterday was that it was not clear
>> whether we had fixed every single place that uses a tsearch config file
>> to assume that the file is in UTF8 and should be converted to database
>> encoding.  So I was thinking of hardwiring the "recode" part into
>> readstopwords, and using wordop just for the "lowercase" part, which
>> seemed to me like a saner division of labor.  That is, UTF8 is a policy
>> that we want to enforce globally, but lowercasing maybe not, and this
>> still leaves the door open for more processing besides lowercasing.
>
> I think we also want to always run input files through pg_verify_mbstr.
> We do it for stopwords, and synonym files (though incorrectly), but not
> for thesaurus files or ispell files. It's probably best to do that
> within the recode-function as well.

Ok, here's an updated version of the patch.

- ispell initialization crashed on empty dictionary file
- ispell initialization crashed on affix file with prefixes but no suffixes
- stop words file was run through pg_verify_mbstr, with database
encoding, but it's later interpreted as being UTF-8. Now verifies that
it's UTF-8, regardless of database encoding.


- introduces new t_readline function that reads a line from a file,
verifies that it's valid UTF-8, and converts it to database encoding.
Modified all places that read tsearch config files to use this function
instead of fgets directly.

- readstopwords now sorts the stop words after loading them. Removed the
separate sortstopwords function.

- moved the wordop-input parameter from StopList struct to a direct
argument to readstopwords. Seems cleaner to me that way, the struct is
now purely an output of readstopwords, not mixed input/output.
readstopwords now recodes the input implicitly using t_readline.

- bunch of comments added, typos fixed, and other cleanup

PS. It's a bank holiday here in the UK on Monday, so I won't be around
until Tuesday if something comes up.

--
  Heikki Linnakangas
  EnterpriseDB   http://www.enterprisedb.com

Re: Bunch of tsearch fixes and cleanup

From

"Heikki Linnakangas"

Date:

24 August 2007, 11:41:34

And here's the attachment I forgot.

Heikki Linnakangas wrote:
> Heikki Linnakangas wrote:
>> Tom Lane wrote:
>>> Something that was annoying me yesterday was that it was not clear
>>> whether we had fixed every single place that uses a tsearch config file
>>> to assume that the file is in UTF8 and should be converted to database
>>> encoding.  So I was thinking of hardwiring the "recode" part into
>>> readstopwords, and using wordop just for the "lowercase" part, which
>>> seemed to me like a saner division of labor.  That is, UTF8 is a policy
>>> that we want to enforce globally, but lowercasing maybe not, and this
>>> still leaves the door open for more processing besides lowercasing.
>> I think we also want to always run input files through pg_verify_mbstr.
>> We do it for stopwords, and synonym files (though incorrectly), but not
>> for thesaurus files or ispell files. It's probably best to do that
>> within the recode-function as well.
>
> Ok, here's an updated version of the patch.
>
> - ispell initialization crashed on empty dictionary file
> - ispell initialization crashed on affix file with prefixes but no suffixes
> - stop words file was run through pg_verify_mbstr, with database
> encoding, but it's later interpreted as being UTF-8. Now verifies that
> it's UTF-8, regardless of database encoding.
>
>
> - introduces new t_readline function that reads a line from a file,
> verifies that it's valid UTF-8, and converts it to database encoding.
> Modified all places that read tsearch config files to use this function
> instead of fgets directly.
>
> - readstopwords now sorts the stop words after loading them. Removed the
> separate sortstopwords function.
>
> - moved the wordop-input parameter from StopList struct to a direct
> argument to readstopwords. Seems cleaner to me that way, the struct is
> now purely an output of readstopwords, not mixed input/output.
> readstopwords now recodes the input implicitly using t_readline.
>
> - bunch of comments added, typos fixed, and other cleanup
>
> PS. It's a bank holiday here in the UK on Monday, so I won't be around
> until Tuesday if something comes up.
>


--
  Heikki Linnakangas
  EnterpriseDB   http://www.enterprisedb.com
Index: src/backend/snowball/dict_snowball.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/snowball/dict_snowball.c,v
retrieving revision 1.2
diff -c -r1.2 dict_snowball.c
*** src/backend/snowball/dict_snowball.c    22 Aug 2007 01:39:44 -0000    1.2
--- src/backend/snowball/dict_snowball.c    24 Aug 2007 09:37:50 -0000
***************
*** 192,198 ****
      ListCell   *l;

      d = (DictSnowball *) palloc0(sizeof(DictSnowball));
-     d->stoplist.wordop = recode_and_lowerstr;

      foreach(l, dictoptions)
      {
--- 192,197 ----
***************
*** 204,211 ****
                  ereport(ERROR,
                          (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                           errmsg("multiple StopWords parameters")));
!             readstoplist(defGetString(defel), &d->stoplist);
!             sortstoplist(&d->stoplist);
              stoploaded = true;
          }
          else if (pg_strcasecmp("Language", defel->defname) == 0)
--- 203,209 ----
                  ereport(ERROR,
                          (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                           errmsg("multiple StopWords parameters")));
!             readstoplist(defGetString(defel), &d->stoplist, lowerstr);
              stoploaded = true;
          }
          else if (pg_strcasecmp("Language", defel->defname) == 0)
Index: src/backend/tsearch/dict_ispell.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/tsearch/dict_ispell.c,v
retrieving revision 1.2
diff -c -r1.2 dict_ispell.c
*** src/backend/tsearch/dict_ispell.c    22 Aug 2007 01:39:44 -0000    1.2
--- src/backend/tsearch/dict_ispell.c    23 Aug 2007 21:12:33 -0000
***************
*** 39,45 ****
      ListCell   *l;

      d = (DictISpell *) palloc0(sizeof(DictISpell));
-     d->stoplist.wordop = recode_and_lowerstr;

      foreach(l, dictoptions)
      {
--- 39,44 ----
***************
*** 73,80 ****
                  ereport(ERROR,
                          (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                           errmsg("multiple StopWords parameters")));
!             readstoplist(defGetString(defel), &(d->stoplist));
!             sortstoplist(&(d->stoplist));
              stoploaded = true;
          }
          else
--- 72,78 ----
                  ereport(ERROR,
                          (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                           errmsg("multiple StopWords parameters")));
!             readstoplist(defGetString(defel), &(d->stoplist), lowerstr);
              stoploaded = true;
          }
          else
Index: src/backend/tsearch/dict_simple.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/tsearch/dict_simple.c,v
retrieving revision 1.2
diff -c -r1.2 dict_simple.c
*** src/backend/tsearch/dict_simple.c    22 Aug 2007 01:39:44 -0000    1.2
--- src/backend/tsearch/dict_simple.c    23 Aug 2007 21:12:24 -0000
***************
*** 23,41 ****
  typedef struct
  {
      StopList    stoplist;
! } DictExample;


  Datum
  dsimple_init(PG_FUNCTION_ARGS)
  {
      List       *dictoptions = (List *) PG_GETARG_POINTER(0);
!     DictExample *d = (DictExample *) palloc0(sizeof(DictExample));
      bool        stoploaded = false;
      ListCell   *l;

-     d->stoplist.wordop = recode_and_lowerstr;
-
      foreach(l, dictoptions)
      {
          DefElem    *defel = (DefElem *) lfirst(l);
--- 23,39 ----
  typedef struct
  {
      StopList    stoplist;
! } DictSimple;


  Datum
  dsimple_init(PG_FUNCTION_ARGS)
  {
      List       *dictoptions = (List *) PG_GETARG_POINTER(0);
!     DictSimple *d = (DictSimple *) palloc0(sizeof(DictSimple));
      bool        stoploaded = false;
      ListCell   *l;

      foreach(l, dictoptions)
      {
          DefElem    *defel = (DefElem *) lfirst(l);
***************
*** 46,53 ****
                  ereport(ERROR,
                          (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                           errmsg("multiple StopWords parameters")));
!             readstoplist(defGetString(defel), &d->stoplist);
!             sortstoplist(&d->stoplist);
              stoploaded = true;
          }
          else
--- 44,50 ----
                  ereport(ERROR,
                          (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                           errmsg("multiple StopWords parameters")));
!             readstoplist(defGetString(defel), &d->stoplist, lowerstr);
              stoploaded = true;
          }
          else
***************
*** 65,80 ****
  Datum
  dsimple_lexize(PG_FUNCTION_ARGS)
  {
!     DictExample *d = (DictExample *) PG_GETARG_POINTER(0);
      char       *in = (char *) PG_GETARG_POINTER(1);
      int32       len = PG_GETARG_INT32(2);
!     char       *txt = lowerstr_with_len(in, len);
      TSLexeme   *res = palloc0(sizeof(TSLexeme) * 2);

      if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
-     {
          pfree(txt);
-     }
      else
          res[0].lexeme = txt;

--- 62,77 ----
  Datum
  dsimple_lexize(PG_FUNCTION_ARGS)
  {
!     DictSimple *d = (DictSimple *) PG_GETARG_POINTER(0);
      char       *in = (char *) PG_GETARG_POINTER(1);
      int32       len = PG_GETARG_INT32(2);
!     char       *txt;
      TSLexeme   *res = palloc0(sizeof(TSLexeme) * 2);

+     txt = lowerstr_with_len(in, len);
+
      if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
          pfree(txt);
      else
          res[0].lexeme = txt;

Index: src/backend/tsearch/dict_synonym.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/tsearch/dict_synonym.c,v
retrieving revision 1.2
diff -c -r1.2 dict_synonym.c
*** src/backend/tsearch/dict_synonym.c    22 Aug 2007 04:13:15 -0000    1.2
--- src/backend/tsearch/dict_synonym.c    24 Aug 2007 10:00:05 -0000
***************
*** 20,28 ****
  #include "tsearch/ts_utils.h"
  #include "utils/builtins.h"

-
- #define SYNBUFLEN    4096
-
  typedef struct
  {
      char       *in;
--- 20,25 ----
***************
*** 31,53 ****

  typedef struct
  {
!     int            len;
      Syn           *syn;
  } DictSyn;

  static char *
  findwrd(char *in, char **end)
  {
      char       *start;

!     *end = NULL;
      while (*in && t_isspace(in))
          in += pg_mblen(in);

      if (*in == '\0')
          return NULL;
      start = in;

      while (*in && !t_isspace(in))
          in += pg_mblen(in);

--- 28,61 ----

  typedef struct
  {
!     int            len;    /* length of syn array */
      Syn           *syn;
  } DictSyn;

+ /*
+  * Finds the next whitespace-delimited word within the 'in' string.
+  * Returns a pointer to the first character of the word, and a pointer
+  * to the next byte after the last character in the word (in *end).
+  */
  static char *
  findwrd(char *in, char **end)
  {
      char       *start;

!     /* Skip leading spaces */
      while (*in && t_isspace(in))
          in += pg_mblen(in);

+     /* Return NULL on empty lines */
      if (*in == '\0')
+     {
+         *end = NULL;
          return NULL;
+     }
+
      start = in;

+     /* Find end of word */
      while (*in && !t_isspace(in))
          in += pg_mblen(in);

***************
*** 70,81 ****
      ListCell   *l;
      char       *filename = NULL;
      FILE       *fin;
-     char        buf[SYNBUFLEN];
      char       *starti,
                 *starto,
                 *end = NULL;
      int            cur = 0;
!     int            slen;

      foreach(l, dictoptions)
      {
--- 78,88 ----
      ListCell   *l;
      char       *filename = NULL;
      FILE       *fin;
      char       *starti,
                 *starto,
                 *end = NULL;
      int            cur = 0;
!     char       *line = NULL;

      foreach(l, dictoptions)
      {
***************
*** 105,114 ****

      d = (DictSyn *) palloc0(sizeof(DictSyn));

!     while (fgets(buf, SYNBUFLEN, fin))
      {
!         slen = strlen(buf);
!         pg_verifymbstr(buf, slen, false);
          if (cur == d->len)
          {
              if (d->len == 0)
--- 112,144 ----

      d = (DictSyn *) palloc0(sizeof(DictSyn));

!     while ((line = t_readline(fin)) != NULL)
      {
!         starti = findwrd(line, &end);
!         if (!starti)
!         {
!             /* Empty line */
!             goto skipline;
!         }
!         *end = '\0';
!         if (end >= line + strlen(line))
!         {
!             /* A line with only one word. Ignore silently. */
!             goto skipline;
!         }
!
!         starto = findwrd(end + 1, &end);
!         if (!starto)
!         {
!             /* A line with only one word. Ignore silently. */
!             goto skipline;
!         }
!         *end = '\0';
!
!         /* starti now points to the first word, and starto to the second
!          * word on the line, with a \0 terminator at the end of both words.
!          */
!
          if (cur == d->len)
          {
              if (d->len == 0)
***************
*** 123,158 ****
              }
          }

!         starti = findwrd(buf, &end);
!         if (!starti)
!             continue;
!         *end = '\0';
!         if (end >= buf + slen)
!             continue;
!
!         starto = findwrd(end + 1, &end);
!         if (!starto)
!             continue;
!         *end = '\0';
!
!         d->syn[cur].in = recode_and_lowerstr(starti);
!         d->syn[cur].out = recode_and_lowerstr(starto);
!         if (!(d->syn[cur].in && d->syn[cur].out))
!         {
!             FreeFile(fin);
!             ereport(ERROR,
!                     (errcode(ERRCODE_OUT_OF_MEMORY),
!                      errmsg("out of memory")));
!         }

          cur++;
      }

      FreeFile(fin);

      d->len = cur;
!     if (cur > 1)
!         qsort(d->syn, d->len, sizeof(Syn), compareSyn);

      PG_RETURN_POINTER(d);
  }
--- 153,171 ----
              }
          }

!         d->syn[cur].in = lowerstr(starti);
!         d->syn[cur].out = lowerstr(starto);

          cur++;
+
+     skipline:
+         pfree(line);
      }

      FreeFile(fin);

      d->len = cur;
!     qsort(d->syn, d->len, sizeof(Syn), compareSyn);

      PG_RETURN_POINTER(d);
  }
***************
*** 179,186 ****
      if (!found)
          PG_RETURN_POINTER(NULL);

!     res = palloc(sizeof(TSLexeme) * 2);
!     memset(res, 0, sizeof(TSLexeme) * 2);
      res[0].lexeme = pstrdup(found->out);

      PG_RETURN_POINTER(res);
--- 192,198 ----
      if (!found)
          PG_RETURN_POINTER(NULL);

!     res = palloc0(sizeof(TSLexeme) * 2);
      res[0].lexeme = pstrdup(found->out);

      PG_RETURN_POINTER(res);
Index: src/backend/tsearch/dict_thesaurus.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/tsearch/dict_thesaurus.c,v
retrieving revision 1.2
diff -c -r1.2 dict_thesaurus.c
*** src/backend/tsearch/dict_thesaurus.c    22 Aug 2007 01:39:44 -0000    1.2
--- src/backend/tsearch/dict_thesaurus.c    24 Aug 2007 10:02:16 -0000
***************
*** 170,179 ****
  thesaurusRead(char *filename, DictThesaurus * d)
  {
      FILE       *fh;
-     char        str[BUFSIZ];
      int            lineno = 0;
      uint16        idsubst = 0;
      bool        useasis = false;

      filename = get_tsearch_config_filename(filename, "ths");
      fh = AllocateFile(filename, "r");
--- 170,179 ----
  thesaurusRead(char *filename, DictThesaurus * d)
  {
      FILE       *fh;
      int            lineno = 0;
      uint16        idsubst = 0;
      bool        useasis = false;
+     char       *line;

      filename = get_tsearch_config_filename(filename, "ths");
      fh = AllocateFile(filename, "r");
***************
*** 183,209 ****
                   errmsg("could not open thesaurus file \"%s\": %m",
                          filename)));

!     while (fgets(str, sizeof(str), fh))
      {
!         char       *ptr,
!                    *recoded;
          int            state = TR_WAITLEX;
          char       *beginwrd = NULL;
          uint16        posinsubst = 0;
          uint16        nwrd = 0;

-         ptr = recoded = (char *) pg_do_encoding_conversion((unsigned char *) str, strlen(str),
-                                              GetDatabaseEncoding(), PG_UTF8);
-         if (recoded == NULL)
-             elog(ERROR, "encoding conversion failed");
-
          lineno++;

!         /* is it comment ? */
!         while (t_isspace(ptr))
              ptr += pg_mblen(ptr);
!         if (t_iseq(recoded, '#') || *recoded == '\0' || t_iseq(recoded, '\n') || t_iseq(recoded, '\r'))
              continue;

          while (*ptr)
          {
--- 183,210 ----
                   errmsg("could not open thesaurus file \"%s\": %m",
                          filename)));

!     while ((line = t_readline(fh)) != NULL)
      {
!         char       *ptr;
          int            state = TR_WAITLEX;
          char       *beginwrd = NULL;
          uint16        posinsubst = 0;
          uint16        nwrd = 0;

          lineno++;

!         ptr = line;
!
!         /* is it a comment? */
!         while (*ptr && t_isspace(ptr))
              ptr += pg_mblen(ptr);
!
!         if (t_iseq(ptr, '#') || *ptr == '\0' ||
!             t_iseq(ptr, '\n') || t_iseq(ptr, '\r'))
!         {
!             pfree(line);
              continue;
+         }

          while (*ptr)
          {
***************
*** 301,308 ****
                              lineno, filename)));
          }

!         if (recoded != str)
!             pfree(recoded);
      }

      d->nsubst = idsubst;
--- 302,308 ----
                              lineno, filename)));
          }

!         pfree(line);
      }

      d->nsubst = idsubst;
Index: src/backend/tsearch/spell.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/tsearch/spell.c,v
retrieving revision 1.1
diff -c -r1.1 spell.c
*** src/backend/tsearch/spell.c    21 Aug 2007 01:11:18 -0000    1.1
--- src/backend/tsearch/spell.c    24 Aug 2007 10:41:12 -0000
***************
*** 21,28 ****


  /*
!  * during initialization dictionary requires a lot
!  * of memory, so it will use temporary context
   */
  static MemoryContext tmpCtx = NULL;

--- 21,31 ----


  /*
!  * Initialization requires a lot of memory that's not needed
!  * after the initialization is done.  In init function,
!  * CurrentMemoryContext is a long lived memory context associated
!  * with the dictionary cache entry, so we use a temporary context
!  * for the short-lived stuff.
   */
  static MemoryContext tmpCtx = NULL;

***************
*** 32,37 ****
--- 35,43 ----
  static void
  checkTmpCtx(void)
  {
+     /* XXX: This assumes that CurrentMemoryContext doesn't have
+      * any children other than the one we create here.
+      */
      if (CurrentMemoryContext->firstchild == NULL)
      {
          tmpCtx = AllocSetContextCreate(CurrentMemoryContext,
***************
*** 74,90 ****
  static int
  cmpspellaffix(const void *s1, const void *s2)
  {
!     return (strcmp((*(const SPELL **) s1)->p.flag, (*(const SPELL **) s2)->p.flag));
! }
!
! static char *
! strnduplicate(char *s, int len)
! {
!     char       *d = (char *) palloc(len + 1);
!
!     memcpy(d, s, len);
!     d[len] = '\0';
!     return d;
  }

  static char *
--- 80,86 ----
  static int
  cmpspellaffix(const void *s1, const void *s2)
  {
!     return (strncmp((*(const SPELL **) s1)->p.flag, (*(const SPELL **) s2)->p.flag, MAXFLAGLEN));
  }

  static char *
***************
*** 185,191 ****
      }
      Conf->Spell[Conf->nspell] = (SPELL *) tmpalloc(SPELLHDRSZ + strlen(word) + 1);
      strcpy(Conf->Spell[Conf->nspell]->word, word);
!     strncpy(Conf->Spell[Conf->nspell]->p.flag, flag, 16);
      Conf->nspell++;
  }

--- 181,187 ----
      }
      Conf->Spell[Conf->nspell] = (SPELL *) tmpalloc(SPELLHDRSZ + strlen(word) + 1);
      strcpy(Conf->Spell[Conf->nspell]->word, word);
!     strncpy(Conf->Spell[Conf->nspell]->p.flag, flag, MAXFLAGLEN);
      Conf->nspell++;
  }

***************
*** 197,205 ****
  void
  NIImportDictionary(IspellDict * Conf, const char *filename)
  {
-     char        str[BUFSIZ],
-                *pstr;
      FILE       *dict;

      checkTmpCtx();

--- 193,200 ----
  void
  NIImportDictionary(IspellDict * Conf, const char *filename)
  {
      FILE       *dict;
+     char       *line;

      checkTmpCtx();

***************
*** 209,227 ****
                   errmsg("could not open dictionary file \"%s\": %m",
                          filename)));

!     while (fgets(str, sizeof(str), dict))
      {
!         char       *s,
!                    *recoded;
          const char *flag;

!         recoded = (char *) pg_do_encoding_conversion((unsigned char *) str, strlen(str),
!                                              PG_UTF8, GetDatabaseEncoding());
!         if (recoded == NULL)
!             elog(ERROR, "encoding conversion failed");
!
          flag = NULL;
!         if ((s = findchar(recoded, '/')))
          {
              *s++ = '\0';
              flag = s;
--- 204,217 ----
                   errmsg("could not open dictionary file \"%s\": %m",
                          filename)));

!     while ((line = t_readline(dict)) != NULL)
      {
!         char       *s, *pstr;
          const char *flag;

!         /* Extract flag from the line */
          flag = NULL;
!         if ((s = findchar(line, '/')))
          {
              *s++ = '\0';
              flag = s;
***************
*** 240,247 ****
          else
              flag = "";

!
!         s = recoded;
          while (*s)
          {
              if (t_isspace(s))
--- 230,237 ----
          else
              flag = "";

!         /* Remove trailing spaces */
!         s = line;
          while (*s)
          {
              if (t_isspace(s))
***************
*** 251,263 ****
              }
              s += pg_mblen(s);
          }
!         pstr = lowerstr_ctx(recoded);

          NIAddSpell(Conf, pstr, flag);
          pfree(pstr);

!         if (recoded != str)
!             pfree(recoded);
      }
      FreeFile(dict);
  }
--- 241,252 ----
              }
              s += pg_mblen(s);
          }
!         pstr = lowerstr_ctx(line);

          NIAddSpell(Conf, pstr, flag);
          pfree(pstr);

!         pfree(line);
      }
      FreeFile(dict);
  }
***************
*** 402,408 ****

  static bool
  parse_affentry(char *str, char *mask, char *find, char *repl,
!                const char *filename, int line)
  {
      int            state = PAE_WAIT_MASK;
      char       *pmask = mask,
--- 391,397 ----

  static bool
  parse_affentry(char *str, char *mask, char *find, char *repl,
!                const char *filename, int lineno)
  {
      int            state = PAE_WAIT_MASK;
      char       *pmask = mask,
***************
*** 453,459 ****
                  ereport(ERROR,
                          (errcode(ERRCODE_CONFIG_FILE_ERROR),
                           errmsg("syntax error at line %d of affix file \"%s\"",
!                                 line, filename)));
          }
          else if (state == PAE_INFIND)
          {
--- 442,448 ----
                  ereport(ERROR,
                          (errcode(ERRCODE_CONFIG_FILE_ERROR),
                           errmsg("syntax error at line %d of affix file \"%s\"",
!                                 lineno, filename)));
          }
          else if (state == PAE_INFIND)
          {
***************
*** 471,477 ****
                  ereport(ERROR,
                          (errcode(ERRCODE_CONFIG_FILE_ERROR),
                           errmsg("syntax error at line %d of affix file \"%s\"",
!                                 line, filename)));
          }
          else if (state == PAE_WAIT_REPL)
          {
--- 460,466 ----
                  ereport(ERROR,
                          (errcode(ERRCODE_CONFIG_FILE_ERROR),
                           errmsg("syntax error at line %d of affix file \"%s\"",
!                                 lineno, filename)));
          }
          else if (state == PAE_WAIT_REPL)
          {
***************
*** 489,495 ****
                  ereport(ERROR,
                          (errcode(ERRCODE_CONFIG_FILE_ERROR),
                           errmsg("syntax error at line %d of affix file \"%s\"",
!                                 line, filename)));
          }
          else if (state == PAE_INREPL)
          {
--- 478,484 ----
                  ereport(ERROR,
                          (errcode(ERRCODE_CONFIG_FILE_ERROR),
                           errmsg("syntax error at line %d of affix file \"%s\"",
!                                 lineno, filename)));
          }
          else if (state == PAE_INREPL)
          {
***************
*** 507,513 ****
                  ereport(ERROR,
                          (errcode(ERRCODE_CONFIG_FILE_ERROR),
                           errmsg("syntax error at line %d of affix file \"%s\"",
!                                 line, filename)));
          }
          else
              elog(ERROR, "unknown state in parse_affentry: %d", state);
--- 496,502 ----
                  ereport(ERROR,
                          (errcode(ERRCODE_CONFIG_FILE_ERROR),
                           errmsg("syntax error at line %d of affix file \"%s\"",
!                                 lineno, filename)));
          }
          else
              elog(ERROR, "unknown state in parse_affentry: %d", state);
***************
*** 522,528 ****

  static void
  addFlagValue(IspellDict * Conf, char *s, uint32 val,
!              const char *filename, int line)
  {
      while (*s && t_isspace(s))
          s++;
--- 511,517 ----

  static void
  addFlagValue(IspellDict * Conf, char *s, uint32 val,
!              const char *filename, int lineno)
  {
      while (*s && t_isspace(s))
          s++;
***************
*** 531,543 ****
          ereport(ERROR,
                  (errcode(ERRCODE_CONFIG_FILE_ERROR),
                   errmsg("syntax error at line %d of affix file \"%s\"",
!                         line, filename)));

      if (pg_mblen(s) != 1)
          ereport(ERROR,
                  (errcode(ERRCODE_CONFIG_FILE_ERROR),
                   errmsg("multibyte flag character is not allowed at line %d of affix file \"%s\"",
!                         line, filename)));

      Conf->flagval[(unsigned int) *s] = (unsigned char) val;
      Conf->usecompound = true;
--- 520,532 ----
          ereport(ERROR,
                  (errcode(ERRCODE_CONFIG_FILE_ERROR),
                   errmsg("syntax error at line %d of affix file \"%s\"",
!                         lineno, filename)));

      if (pg_mblen(s) != 1)
          ereport(ERROR,
                  (errcode(ERRCODE_CONFIG_FILE_ERROR),
                   errmsg("multibyte flag character is not allowed at line %d of affix file \"%s\"",
!                         lineno, filename)));

      Conf->flagval[(unsigned int) *s] = (unsigned char) val;
      Conf->usecompound = true;
***************
*** 546,552 ****
  static void
  NIImportOOAffixes(IspellDict * Conf, const char *filename)
  {
-     char        str[BUFSIZ];
      char        type[BUFSIZ],
                 *ptype = NULL;
      char        sflag[BUFSIZ];
--- 535,540 ----
***************
*** 560,568 ****
      int            flag = 0;
      char        flagflags = 0;
      FILE       *affix;
!     int            line = 0;
      int            scanread = 0;
      char        scanbuf[BUFSIZ];

      checkTmpCtx();

--- 548,557 ----
      int            flag = 0;
      char        flagflags = 0;
      FILE       *affix;
!     int            lineno = 0;
      int            scanread = 0;
      char        scanbuf[BUFSIZ];
+     char       *recoded;

      checkTmpCtx();

***************
*** 576,620 ****
                   errmsg("could not open affix file \"%s\": %m",
                          filename)));

!     while (fgets(str, sizeof(str), affix))
      {
!         char       *recoded;
!
!         recoded = (char *) pg_do_encoding_conversion((unsigned char *) str, strlen(str),
!                                              PG_UTF8, GetDatabaseEncoding());
!         if (recoded == NULL)
!             elog(ERROR, "encoding conversion failed");
!
!         line++;

          if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#'))
              continue;

          if (STRNCMP(recoded, "COMPOUNDFLAG") == 0)
              addFlagValue(Conf, recoded + strlen("COMPOUNDFLAG"),
!                          FF_COMPOUNDFLAG, filename, line);
          else if (STRNCMP(recoded, "COMPOUNDBEGIN") == 0)
              addFlagValue(Conf, recoded + strlen("COMPOUNDBEGIN"),
!                          FF_COMPOUNDBEGIN, filename, line);
          else if (STRNCMP(recoded, "COMPOUNDLAST") == 0)
              addFlagValue(Conf, recoded + strlen("COMPOUNDLAST"),
!                          FF_COMPOUNDLAST, filename, line);
          /* COMPOUNDLAST and COMPOUNDEND are synonyms */
          else if (STRNCMP(recoded, "COMPOUNDEND") == 0)
              addFlagValue(Conf, recoded + strlen("COMPOUNDEND"),
!                          FF_COMPOUNDLAST, filename, line);
          else if (STRNCMP(recoded, "COMPOUNDMIDDLE") == 0)
              addFlagValue(Conf, recoded + strlen("COMPOUNDMIDDLE"),
!                          FF_COMPOUNDMIDDLE, filename, line);
          else if (STRNCMP(recoded, "ONLYINCOMPOUND") == 0)
              addFlagValue(Conf, recoded + strlen("ONLYINCOMPOUND"),
!                          FF_COMPOUNDONLY, filename, line);
          else if (STRNCMP(recoded, "COMPOUNDPERMITFLAG") == 0)
              addFlagValue(Conf, recoded + strlen("COMPOUNDPERMITFLAG"),
!                          FF_COMPOUNDPERMITFLAG, filename, line);
          else if (STRNCMP(recoded, "COMPOUNDFORBIDFLAG") == 0)
              addFlagValue(Conf, recoded + strlen("COMPOUNDFORBIDFLAG"),
!                          FF_COMPOUNDFORBIDFLAG, filename, line);
          else if (STRNCMP(recoded, "FLAG") == 0)
          {
              char       *s = recoded + strlen("FLAG");
--- 565,605 ----
                   errmsg("could not open affix file \"%s\": %m",
                          filename)));

!     while ((recoded = t_readline(affix)) != NULL)
      {
!         lineno++;

          if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#'))
+         {
+             pfree(recoded);
              continue;
+         }

          if (STRNCMP(recoded, "COMPOUNDFLAG") == 0)
              addFlagValue(Conf, recoded + strlen("COMPOUNDFLAG"),
!                          FF_COMPOUNDFLAG, filename, lineno);
          else if (STRNCMP(recoded, "COMPOUNDBEGIN") == 0)
              addFlagValue(Conf, recoded + strlen("COMPOUNDBEGIN"),
!                          FF_COMPOUNDBEGIN, filename, lineno);
          else if (STRNCMP(recoded, "COMPOUNDLAST") == 0)
              addFlagValue(Conf, recoded + strlen("COMPOUNDLAST"),
!                          FF_COMPOUNDLAST, filename, lineno);
          /* COMPOUNDLAST and COMPOUNDEND are synonyms */
          else if (STRNCMP(recoded, "COMPOUNDEND") == 0)
              addFlagValue(Conf, recoded + strlen("COMPOUNDEND"),
!                          FF_COMPOUNDLAST, filename, lineno);
          else if (STRNCMP(recoded, "COMPOUNDMIDDLE") == 0)
              addFlagValue(Conf, recoded + strlen("COMPOUNDMIDDLE"),
!                          FF_COMPOUNDMIDDLE, filename, lineno);
          else if (STRNCMP(recoded, "ONLYINCOMPOUND") == 0)
              addFlagValue(Conf, recoded + strlen("ONLYINCOMPOUND"),
!                          FF_COMPOUNDONLY, filename, lineno);
          else if (STRNCMP(recoded, "COMPOUNDPERMITFLAG") == 0)
              addFlagValue(Conf, recoded + strlen("COMPOUNDPERMITFLAG"),
!                          FF_COMPOUNDPERMITFLAG, filename, lineno);
          else if (STRNCMP(recoded, "COMPOUNDFORBIDFLAG") == 0)
              addFlagValue(Conf, recoded + strlen("COMPOUNDFORBIDFLAG"),
!                          FF_COMPOUNDFORBIDFLAG, filename, lineno);
          else if (STRNCMP(recoded, "FLAG") == 0)
          {
              char       *s = recoded + strlen("FLAG");
***************
*** 626,639 ****
                  ereport(ERROR,
                          (errcode(ERRCODE_CONFIG_FILE_ERROR),
                           errmsg("Ispell dictionary supports only default flag value at line %d of affix file \"%s\"",
!                                 line, filename)));
          }

!         if (recoded != str)
!             pfree(recoded);
      }
      FreeFile(affix);
!     line = 0;

      sprintf(scanbuf, "%%6s %%%ds %%%ds %%%ds %%%ds", BUFSIZ / 5, BUFSIZ / 5, BUFSIZ / 5, BUFSIZ / 5);

--- 611,623 ----
                  ereport(ERROR,
                          (errcode(ERRCODE_CONFIG_FILE_ERROR),
                           errmsg("Ispell dictionary supports only default flag value at line %d of affix file \"%s\"",
!                                 lineno, filename)));
          }

!         pfree(recoded);
      }
      FreeFile(affix);
!     lineno = 0;

      sprintf(scanbuf, "%%6s %%%ds %%%ds %%%ds %%%ds", BUFSIZ / 5, BUFSIZ / 5, BUFSIZ / 5, BUFSIZ / 5);

***************
*** 643,660 ****
                   errmsg("could not open affix file \"%s\": %m",
                          filename)));

!     while (fgets(str, sizeof(str), affix))
      {
!         char       *recoded;
!
!         recoded = (char *) pg_do_encoding_conversion((unsigned char *) str, strlen(str),
!                                              PG_UTF8, GetDatabaseEncoding());
!         if (recoded == NULL)
!             elog(ERROR, "encoding conversion failed");
!
!         line++;
          if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#'))
!             continue;

          scanread = sscanf(recoded, scanbuf, type, sflag, find, repl, mask);

--- 627,637 ----
                   errmsg("could not open affix file \"%s\": %m",
                          filename)));

!     while ((recoded = t_readline(affix)) != NULL)
      {
!         lineno++;
          if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#'))
!             goto nextline;

          scanread = sscanf(recoded, scanbuf, type, sflag, find, repl, mask);

***************
*** 662,673 ****
              pfree(ptype);
          ptype = lowerstr_ctx(type);
          if (scanread < 4 || (STRNCMP(ptype, "sfx") && STRNCMP(ptype, "pfx")))
!             continue;

          if (scanread == 4)
          {
              if (strlen(sflag) != 1)
!                 continue;
              flag = *sflag;
              isSuffix = (STRNCMP(ptype, "sfx") == 0) ? true : false;
              pfind = lowerstr_ctx(find);
--- 639,650 ----
              pfree(ptype);
          ptype = lowerstr_ctx(type);
          if (scanread < 4 || (STRNCMP(ptype, "sfx") && STRNCMP(ptype, "pfx")))
!             goto nextline;

          if (scanread == 4)
          {
              if (strlen(sflag) != 1)
!                 goto nextline;
              flag = *sflag;
              isSuffix = (STRNCMP(ptype, "sfx") == 0) ? true : false;
              pfind = lowerstr_ctx(find);
***************
*** 683,689 ****
              int            aflg = 0;

              if (strlen(sflag) != 1 || flag != *sflag || flag == 0)
!                 continue;
              prepl = lowerstr_ctx(repl);
              /* affix flag */
              if ((ptr = strchr(prepl, '/')) != NULL)
--- 660,666 ----
              int            aflg = 0;

              if (strlen(sflag) != 1 || flag != *sflag || flag == 0)
!                 goto nextline;
              prepl = lowerstr_ctx(repl);
              /* affix flag */
              if ((ptr = strchr(prepl, '/')) != NULL)
***************
*** 710,717 ****
              pfree(pmask);
          }

!         if (recoded != str)
!             pfree(recoded);
      }

      if (ptype)
--- 687,694 ----
              pfree(pmask);
          }

!     nextline:
!         pfree(recoded);
      }

      if (ptype)
***************
*** 733,745 ****
      char        find[BUFSIZ];
      char        repl[BUFSIZ];
      char       *s;
!     int            suffixes = 0;
!     int            prefixes = 0;
      int            flag = 0;
      char        flagflags = 0;
      FILE       *affix;
!     int            line = 0;
!     int            oldformat = 0;

      checkTmpCtx();

--- 710,723 ----
      char        find[BUFSIZ];
      char        repl[BUFSIZ];
      char       *s;
!     bool        suffixes = false;
!     bool        prefixes = false;
      int            flag = 0;
      char        flagflags = 0;
      FILE       *affix;
!     int            lineno = 0;
!     bool        oldformat = false;
!     char       *recoded = NULL;

      checkTmpCtx();

***************
*** 752,767 ****
      memset(Conf->flagval, 0, sizeof(Conf->flagval));
      Conf->usecompound = false;

!     while (fgets(str, sizeof(str), affix))
      {
!         if (pstr)
!             pfree(pstr);

!         pstr = recode_and_lowerstr(str);

!         line++;
          if (*pstr == '#' || *pstr == '\n')
!             continue;

          if (STRNCMP(pstr, "compoundwords") == 0)
          {
--- 730,745 ----
      memset(Conf->flagval, 0, sizeof(Conf->flagval));
      Conf->usecompound = false;

!     while ((recoded = t_readline(affix)) != NULL)
      {
!         pstr = lowerstr(recoded);
!         pfree(recoded);

!         lineno++;

!         /* Skip comments and empty lines */
          if (*pstr == '#' || *pstr == '\n')
!             goto nextline;

          if (STRNCMP(pstr, "compoundwords") == 0)
          {
***************
*** 777,799 ****
                      Conf->flagval[(unsigned int) *s] = FF_COMPOUNDFLAG;
                      Conf->usecompound = true;
                  }
!                 oldformat++;
!                 continue;
              }
          }
          if (STRNCMP(pstr, "suffixes") == 0)
          {
!             suffixes = 1;
!             prefixes = 0;
!             oldformat++;
!             continue;
          }
          if (STRNCMP(pstr, "prefixes") == 0)
          {
!             suffixes = 0;
!             prefixes = 1;
!             oldformat++;
!             continue;
          }
          if (STRNCMP(pstr, "flag") == 0)
          {
--- 755,777 ----
                      Conf->flagval[(unsigned int) *s] = FF_COMPOUNDFLAG;
                      Conf->usecompound = true;
                  }
!                 oldformat = true;
!                 goto nextline;
              }
          }
          if (STRNCMP(pstr, "suffixes") == 0)
          {
!             suffixes = true;
!             prefixes = false;
!             oldformat = true;
!             goto nextline;
          }
          if (STRNCMP(pstr, "prefixes") == 0)
          {
!             suffixes = false;
!             prefixes = true;
!             oldformat = true;
!             goto nextline;
          }
          if (STRNCMP(pstr, "flag") == 0)
          {
***************
*** 802,815 ****

              while (*s && t_isspace(s))
                  s++;
!             oldformat++;

              /* allow only single-encoded flags */
              if (pg_mblen(s) != 1)
                  ereport(ERROR,
                          (errcode(ERRCODE_CONFIG_FILE_ERROR),
                           errmsg("multibyte flag character is not allowed at line %d of affix file \"%s\"",
!                                 line, filename)));

              if (*s == '*')
              {
--- 780,793 ----

              while (*s && t_isspace(s))
                  s++;
!             oldformat = true;

              /* allow only single-encoded flags */
              if (pg_mblen(s) != 1)
                  ereport(ERROR,
                          (errcode(ERRCODE_CONFIG_FILE_ERROR),
                           errmsg("multibyte flag character is not allowed at line %d of affix file \"%s\"",
!                                 lineno, filename)));

              if (*s == '*')
              {
***************
*** 830,839 ****
                  ereport(ERROR,
                          (errcode(ERRCODE_CONFIG_FILE_ERROR),
                           errmsg("multibyte flag character is not allowed at line %d of affix file \"%s\"",
!                                 line, filename)));

              flag = (unsigned char) *s;
!             continue;
          }
          if (STRNCMP(str, "COMPOUNDFLAG") == 0 || STRNCMP(str, "COMPOUNDMIN") == 0 ||
              STRNCMP(str, "PFX") == 0 || STRNCMP(str, "SFX") == 0)
--- 808,817 ----
                  ereport(ERROR,
                          (errcode(ERRCODE_CONFIG_FILE_ERROR),
                           errmsg("multibyte flag character is not allowed at line %d of affix file \"%s\"",
!                                 lineno, filename)));

              flag = (unsigned char) *s;
!             goto nextline;
          }
          if (STRNCMP(str, "COMPOUNDFLAG") == 0 || STRNCMP(str, "COMPOUNDMIN") == 0 ||
              STRNCMP(str, "PFX") == 0 || STRNCMP(str, "SFX") == 0)
***************
*** 842,864 ****
                  ereport(ERROR,
                          (errcode(ERRCODE_CONFIG_FILE_ERROR),
                           errmsg("wrong affix file format for flag at line %d of affix file \"%s\"",
!                                 line, filename)));
              FreeFile(affix);
              NIImportOOAffixes(Conf, filename);
              return;
          }
          if ((!suffixes) && (!prefixes))
!             continue;

!         if (!parse_affentry(pstr, mask, find, repl, filename, line))
!             continue;

          NIAddAffix(Conf, flag, flagflags, mask, find, repl, suffixes ? FF_SUFFIX : FF_PREFIX);
-     }
-     FreeFile(affix);

!     if (pstr)
          pfree(pstr);
  }

  static int
--- 820,842 ----
                  ereport(ERROR,
                          (errcode(ERRCODE_CONFIG_FILE_ERROR),
                           errmsg("wrong affix file format for flag at line %d of affix file \"%s\"",
!                                 lineno, filename)));
              FreeFile(affix);
              NIImportOOAffixes(Conf, filename);
              return;
          }
          if ((!suffixes) && (!prefixes))
!             goto nextline;

!         if (!parse_affentry(pstr, mask, find, repl, filename, lineno))
!             goto nextline;

          NIAddAffix(Conf, flag, flagflags, mask, find, repl, suffixes ? FF_SUFFIX : FF_PREFIX);

!     nextline:
          pfree(pstr);
+     }
+     FreeFile(affix);
  }

  static int
***************
*** 975,1012 ****
      return rs;
  }

  void
  NISortDictionary(IspellDict * Conf)
  {
!     size_t        i;
!     int            naffix = 3;

      checkTmpCtx();

      /* compress affixes */
      qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *), cmpspellaffix);
!     for (i = 1; i < Conf->nspell; i++)
!         if (strcmp(Conf->Spell[i]->p.flag, Conf->Spell[i - 1]->p.flag))
              naffix++;

      Conf->AffixData = (char **) palloc0(naffix * sizeof(char *));
!     naffix = 1;
!     Conf->AffixData[0] = pstrdup("");
!     Conf->AffixData[1] = pstrdup(Conf->Spell[0]->p.flag);
!     Conf->Spell[0]->p.d.affix = 1;
!     Conf->Spell[0]->p.d.len = strlen(Conf->Spell[0]->word);
!     for (i = 1; i < Conf->nspell; i++)
      {
!         if (strcmp(Conf->Spell[i]->p.flag, Conf->AffixData[naffix]))
          {
!             naffix++;
!             Conf->AffixData[naffix] = pstrdup(Conf->Spell[i]->p.flag);
          }
!         Conf->Spell[i]->p.d.affix = naffix;
          Conf->Spell[i]->p.d.len = strlen(Conf->Spell[i]->word);
      }

      Conf->lenAffixData = Conf->nAffixData = naffix;
      qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *), cmpspell);
      Conf->Dictionary = mkSPNode(Conf, 0, Conf->nspell, 0);

--- 953,1007 ----
      return rs;
  }

+ /*
+  * Builds the Conf->Dictionary tree and AffixData from the imported dictionary
+  * and affixes.
+  */
  void
  NISortDictionary(IspellDict * Conf)
  {
!     int    i;
!     int    naffix = 0;
!     int    curaffix;

      checkTmpCtx();

      /* compress affixes */
+
+     /* Count the number of different flags used in the dictionary */
+
      qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *), cmpspellaffix);
!
!     naffix = 0;
!     for (i = 0; i < Conf->nspell; i++)
!     {
!         if (i == 0 || strncmp(Conf->Spell[i]->p.flag, Conf->Spell[i - 1]->p.flag, MAXFLAGLEN))
              naffix++;
+     }

+     /*
+      * Fill in Conf->AffixData with the affixes that were used
+      * in the dictionary. Replace textual flag-field of Conf->Spell
+      * entries with indexes into Conf->AffixData array.
+      */
      Conf->AffixData = (char **) palloc0(naffix * sizeof(char *));
!
!     curaffix = -1;
!     for (i = 0; i < Conf->nspell; i++)
      {
!         if (i == 0 || strncmp(Conf->Spell[i]->p.flag, Conf->AffixData[curaffix], MAXFLAGLEN))
          {
!             curaffix++;
!             Assert(curaffix < naffix);
!             Conf->AffixData[curaffix] = pstrdup(Conf->Spell[i]->p.flag);
          }
!
!         Conf->Spell[i]->p.d.affix = curaffix;
          Conf->Spell[i]->p.d.len = strlen(Conf->Spell[i]->word);
      }

      Conf->lenAffixData = Conf->nAffixData = naffix;
+
      qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *), cmpspell);
      Conf->Dictionary = mkSPNode(Conf, 0, Conf->nspell, 0);

***************
*** 1085,1091 ****
  }

  static void
! mkVoidAffix(IspellDict * Conf, int issuffix, int startsuffix)
  {
      int            i,
                  cnt = 0;
--- 1080,1086 ----
  }

  static void
! mkVoidAffix(IspellDict * Conf, bool issuffix, int startsuffix)
  {
      int            i,
                  cnt = 0;
***************
*** 1145,1151 ****
      AFFIX       *Affix;
      size_t        i;
      CMPDAffix  *ptr;
!     int            firstsuffix = -1;

      checkTmpCtx();

--- 1140,1146 ----
      AFFIX       *Affix;
      size_t        i;
      CMPDAffix  *ptr;
!     int            firstsuffix = Conf->naffixes;

      checkTmpCtx();

***************
*** 1160,1166 ****
      for (i = 0; i < Conf->naffixes; i++)
      {
          Affix = &(((AFFIX *) Conf->Affix)[i]);
!         if (Affix->type == FF_SUFFIX && firstsuffix < 0)
              firstsuffix = i;

          if ((Affix->flagflags & FF_COMPOUNDFLAG) && Affix->replen > 0 &&
--- 1155,1161 ----
      for (i = 0; i < Conf->naffixes; i++)
      {
          Affix = &(((AFFIX *) Conf->Affix)[i]);
!         if (Affix->type == FF_SUFFIX && i < firstsuffix)
              firstsuffix = i;

          if ((Affix->flagflags & FF_COMPOUNDFLAG) && Affix->replen > 0 &&
***************
*** 1185,1196 ****

      Conf->Prefix = mkANode(Conf, 0, firstsuffix, 0, FF_PREFIX);
      Conf->Suffix = mkANode(Conf, firstsuffix, Conf->naffixes, 0, FF_SUFFIX);
!     mkVoidAffix(Conf, 1, firstsuffix);
!     mkVoidAffix(Conf, 0, firstsuffix);
  }

  static AffixNodeData *
! FinfAffixes(AffixNode * node, const char *word, int wrdlen, int *level, int type)
  {
      AffixNodeData *StopLow,
                 *StopHigh,
--- 1180,1191 ----

      Conf->Prefix = mkANode(Conf, 0, firstsuffix, 0, FF_PREFIX);
      Conf->Suffix = mkANode(Conf, firstsuffix, Conf->naffixes, 0, FF_SUFFIX);
!     mkVoidAffix(Conf, true, firstsuffix);
!     mkVoidAffix(Conf, false, firstsuffix);
  }

  static AffixNodeData *
! FindAffixes(AffixNode * node, const char *word, int wrdlen, int *level, int type)
  {
      AffixNodeData *StopLow,
                 *StopHigh,
***************
*** 1374,1380 ****
      plevel = 0;
      while (pnode)
      {
!         prefix = FinfAffixes(pnode, word, wrdlen, &plevel, FF_PREFIX);
          if (!prefix)
              break;
          for (j = 0; j < prefix->naff; j++)
--- 1369,1375 ----
      plevel = 0;
      while (pnode)
      {
!         prefix = FindAffixes(pnode, word, wrdlen, &plevel, FF_PREFIX);
          if (!prefix)
              break;
          for (j = 0; j < prefix->naff; j++)
***************
*** 1398,1404 ****
          int            baselen = 0;

          /* find possible suffix */
!         suffix = FinfAffixes(snode, word, wrdlen, &slevel, FF_SUFFIX);
          if (!suffix)
              break;
          /* foreach suffix check affix */
--- 1393,1399 ----
          int            baselen = 0;

          /* find possible suffix */
!         suffix = FindAffixes(snode, word, wrdlen, &slevel, FF_SUFFIX);
          if (!suffix)
              break;
          /* foreach suffix check affix */
***************
*** 1416,1422 ****
                  swrdlen = strlen(newword);
                  while (pnode)
                  {
!                     prefix = FinfAffixes(pnode, newword, swrdlen, &plevel, FF_PREFIX);
                      if (!prefix)
                          break;
                      for (j = 0; j < prefix->naff; j++)
--- 1411,1417 ----
                  swrdlen = strlen(newword);
                  while (pnode)
                  {
!                     prefix = FindAffixes(pnode, newword, swrdlen, &plevel, FF_PREFIX);
                      if (!prefix)
                          break;
                      for (j = 0; j < prefix->naff; j++)
***************
*** 1626,1632 ****
                      if (wordlen == level + 1)
                      {
                          /* well, it was last word */
!                         var->stem[var->nstem] = strnduplicate(word + startpos, wordlen - startpos);
                          var->nstem++;
                          pfree(notprobed);
                          return var;
--- 1621,1627 ----
                      if (wordlen == level + 1)
                      {
                          /* well, it was last word */
!                         var->stem[var->nstem] = pnstrdup(word + startpos, wordlen - startpos);
                          var->nstem++;
                          pfree(notprobed);
                          return var;
***************
*** 1641,1647 ****
                          ptr->next = SplitToVariants(Conf, node, var, word, wordlen, startpos, level);
                          /* we can find next word */
                          level++;
!                         var->stem[var->nstem] = strnduplicate(word + startpos, level - startpos);
                          var->nstem++;
                          node = Conf->Dictionary;
                          startpos = level;
--- 1636,1642 ----
                          ptr->next = SplitToVariants(Conf, node, var, word, wordlen, startpos, level);
                          /* we can find next word */
                          level++;
!                         var->stem[var->nstem] = pnstrdup(word + startpos, level - startpos);
                          var->nstem++;
                          node = Conf->Dictionary;
                          startpos = level;
***************
*** 1656,1662 ****
          level++;
      }

!     var->stem[var->nstem] = strnduplicate(word + startpos, wordlen - startpos);
      var->nstem++;
      pfree(notprobed);
      return var;
--- 1651,1657 ----
          level++;
      }

!     var->stem[var->nstem] = pnstrdup(word + startpos, wordlen - startpos);
      var->nstem++;
      pfree(notprobed);
      return var;
Index: src/backend/tsearch/ts_locale.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/tsearch/ts_locale.c,v
retrieving revision 1.1
diff -c -r1.1 ts_locale.c
*** src/backend/tsearch/ts_locale.c    21 Aug 2007 01:11:18 -0000    1.1
--- src/backend/tsearch/ts_locale.c    24 Aug 2007 09:47:44 -0000
***************
*** 125,152 ****
  }
  #endif   /* TS_USE_WIDE */

  /*
!  * Convert C-string from UTF8 to server encoding and
!  * lower it
   */
  char *
! recode_and_lowerstr(char *str)
  {
!     char       *recoded;
!     char       *ret;
!
!     recoded = (char *) pg_do_encoding_conversion((unsigned char *) str, strlen(str),
!                                              PG_UTF8, GetDatabaseEncoding());

      if (recoded == NULL)
          elog(ERROR, "encoding conversion failed");

!     ret = lowerstr(recoded);
!
!     if (recoded != str)
!         pfree(recoded);

!     return ret;
  }

  char *
--- 125,169 ----
  }
  #endif   /* TS_USE_WIDE */

+
  /*
!  * Utility function to read a line from a tsearch data file,
!  * and recode it to database encoding. The returned string
!  * is palloc'd.
   */
  char *
! t_readline(FILE *fp)
  {
!     int len;
!     static char *recoded = NULL;
!     static char buf[4096];
!
!     if(fgets(buf, sizeof(buf), fp) == NULL)
!         return NULL;
!
!     len = strnlen(buf, sizeof(buf));
!
!     /* Make sure the input is valid UTF-8 */
!     (void) pg_verify_mbstr(PG_UTF8, buf, len, false);
!
!     recoded = (char *) pg_do_encoding_conversion(
!         (unsigned char *) buf,
!         len,
!         PG_UTF8,
!         GetDatabaseEncoding());

      if (recoded == NULL)
          elog(ERROR, "encoding conversion failed");

!     if (recoded == buf)
!     {
!         /* we can use the length of the original string, because
!          * no conversion was done
!          */
!         recoded = pnstrdup(recoded, len);
!     }

!     return recoded;
  }

  char *
***************
*** 155,160 ****
--- 172,180 ----
      return lowerstr_with_len(str, strlen(str));
  }

+ /*
+  * Returned string is palloc'd
+  */
  char *
  lowerstr_with_len(char *str, int len)
  {
Index: src/backend/tsearch/ts_parse.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/tsearch/ts_parse.c,v
retrieving revision 1.1
diff -c -r1.1 ts_parse.c
*** src/backend/tsearch/ts_parse.c    21 Aug 2007 01:11:18 -0000    1.1
--- src/backend/tsearch/ts_parse.c    23 Aug 2007 12:29:51 -0000
***************
*** 308,314 ****
              {
                  /*
                   * Dictionary normalizes lexemes, so we remove from stack all
!                  * used lexemes , return to basic mode and redo end of stack
                   * (if it exists)
                   */
                  if (res)
--- 308,314 ----
              {
                  /*
                   * Dictionary normalizes lexemes, so we remove from stack all
!                  * used lexemes, return to basic mode and redo end of stack
                   * (if it exists)
                   */
                  if (res)
***************
*** 571,577 ****
  }

  text *
! generatHeadline(HeadlineText * prs)
  {
      text       *out;
      int            len = 128;
--- 571,577 ----
  }

  text *
! generateHeadline(HeadlineText * prs)
  {
      text       *out;
      int            len = 128;
Index: src/backend/tsearch/ts_utils.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/tsearch/ts_utils.c,v
retrieving revision 1.2
diff -c -r1.2 ts_utils.c
*** src/backend/tsearch/ts_utils.c    22 Aug 2007 01:39:44 -0000    1.2
--- src/backend/tsearch/ts_utils.c    24 Aug 2007 10:57:58 -0000
***************
*** 63,83 ****
      return result;
  }

! #define STOPBUFLEN    4096

  void
! readstoplist(char *in, StopList * s)
  {
      char      **stop = NULL;

      s->len = 0;
      if (in && *in)
      {
          char       *filename = get_tsearch_config_filename(in, "stop");
          FILE       *hin;
-         char        buf[STOPBUFLEN];
          int            reallen = 0;
-         int            line = 0;

          if ((hin = AllocateFile(filename, "r")) == NULL)
              ereport(ERROR,
--- 63,90 ----
      return result;
  }

! static int
! comparestr(const void *a, const void *b)
! {
!     return strcmp(*(char **) a, *(char **) b);
! }

+ /*
+  * Reads a stopword file. Each word is run through 'wordop'
+  * function, if given.
+  */
  void
! readstoplist(char *in, StopList * s, char *(*wordop) (char *))
  {
      char      **stop = NULL;
+     char       *line;

      s->len = 0;
      if (in && *in)
      {
          char       *filename = get_tsearch_config_filename(in, "stop");
          FILE       *hin;
          int            reallen = 0;

          if ((hin = AllocateFile(filename, "r")) == NULL)
              ereport(ERROR,
***************
*** 85,109 ****
                       errmsg("could not open stopword file \"%s\": %m",
                              filename)));

!         while (fgets(buf, STOPBUFLEN, hin))
          {
!             char       *pbuf = buf;

!             line++;
!             while (*pbuf && !isspace(*pbuf))
                  pbuf++;
              *pbuf = '\0';

!             if (*buf == '\0')
!                 continue;
!
!             if (!pg_verifymbstr(buf, strlen(buf), true))
              {
!                 FreeFile(hin);
!                 ereport(ERROR,
!                         (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
!                          errmsg("invalid multibyte encoding at line %d in file \"%s\"",
!                                 line, filename)));
              }

              if (s->len >= reallen)
--- 92,111 ----
                       errmsg("could not open stopword file \"%s\": %m",
                              filename)));

!         while ((line = t_readline(hin)) != NULL)
          {
!             char *pbuf = line;

!             /* Truncate the line at the first whitespace character */
!             while (*pbuf && !t_isspace(pbuf))
                  pbuf++;
              *pbuf = '\0';

!             /* Skip empty lines */
!             if (*line == '\0')
              {
!                 pfree(line);
!                 continue;
              }

              if (s->len >= reallen)
***************
*** 120,130 ****
                  }
              }

!
!             if (s->wordop)
!                 stop[s->len] = s->wordop(buf);
              else
!                 stop[s->len] = pstrdup(buf);

              (s->len)++;
          }
--- 122,135 ----
                  }
              }

!             if (wordop)
!             {
!                 stop[s->len] = wordop(line);
!                 if (stop[s->len] != line)
!                     pfree(line);
!             }
              else
!                 stop[s->len] = line;

              (s->len)++;
          }
***************
*** 133,149 ****
      }

      s->stop = stop;
- }
-
- static int
- comparestr(const void *a, const void *b)
- {
-     return strcmp(*(char **) a, *(char **) b);
- }

! void
! sortstoplist(StopList * s)
! {
      if (s->stop && s->len > 0)
          qsort(s->stop, s->len, sizeof(char *), comparestr);
  }
--- 138,145 ----
      }

      s->stop = stop;

!     /* Sort to allow binary searching */
      if (s->stop && s->len > 0)
          qsort(s->stop, s->len, sizeof(char *), comparestr);
  }
Index: src/backend/tsearch/wparser.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/tsearch/wparser.c,v
retrieving revision 1.2
diff -c -r1.2 wparser.c
*** src/backend/tsearch/wparser.c    22 Aug 2007 01:39:45 -0000    1.2
--- src/backend/tsearch/wparser.c    23 Aug 2007 12:29:59 -0000
***************
*** 325,331 ****
                    PointerGetDatum(prsoptions),
                    PointerGetDatum(query));

!     out = generatHeadline(&prs);

      PG_FREE_IF_COPY(in, 1);
      PG_FREE_IF_COPY(query, 2);
--- 325,331 ----
                    PointerGetDatum(prsoptions),
                    PointerGetDatum(query));

!     out = generateHeadline(&prs);

      PG_FREE_IF_COPY(in, 1);
      PG_FREE_IF_COPY(query, 2);
Index: src/include/tsearch/ts_locale.h
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/tsearch/ts_locale.h,v
retrieving revision 1.1
diff -c -r1.1 ts_locale.h
*** src/include/tsearch/ts_locale.h    21 Aug 2007 01:11:29 -0000    1.1
--- src/include/tsearch/ts_locale.h    24 Aug 2007 09:48:14 -0000
***************
*** 83,88 ****

  char       *lowerstr(char *str);
  char       *lowerstr_with_len(char *str, int len);
! char       *recode_and_lowerstr(char *str);

  #endif   /* __TSLOCALE_H__ */
--- 83,88 ----

  char       *lowerstr(char *str);
  char       *lowerstr_with_len(char *str, int len);
! char       *t_readline(FILE *fp);

  #endif   /* __TSLOCALE_H__ */
Index: src/include/tsearch/ts_public.h
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/tsearch/ts_public.h,v
retrieving revision 1.2
diff -c -r1.2 ts_public.h
*** src/include/tsearch/ts_public.h    22 Aug 2007 01:39:46 -0000    1.2
--- src/include/tsearch/ts_public.h    23 Aug 2007 19:55:25 -0000
***************
*** 71,81 ****
  {
      int            len;
      char      **stop;
-     char       *(*wordop) (char *);
  } StopList;

! extern void sortstoplist(StopList * s);
! extern void readstoplist(char *in, StopList * s);
  extern bool searchstoplist(StopList * s, char *key);

  /*
--- 71,79 ----
  {
      int            len;
      char      **stop;
  } StopList;

! extern void readstoplist(char *in, StopList * s, char *(*wordop) (char *));
  extern bool searchstoplist(StopList * s, char *key);

  /*
Index: src/include/tsearch/ts_utils.h
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/tsearch/ts_utils.h,v
retrieving revision 1.1
diff -c -r1.1 ts_utils.h
*** src/include/tsearch/ts_utils.h    21 Aug 2007 01:11:29 -0000    1.1
--- src/include/tsearch/ts_utils.h    23 Aug 2007 12:30:32 -0000
***************
*** 102,108 ****
   * headline framework, flow in common to generate:
   *    1 parse text with hlparsetext
   *    2 parser-specific function to find part
!  *    3 generatHeadline to generate result text
   */

  typedef struct
--- 102,108 ----
   * headline framework, flow in common to generate:
   *    1 parse text with hlparsetext
   *    2 parser-specific function to find part
!  *    3 generateHeadline to generate result text
   */

  typedef struct
***************
*** 131,137 ****

  extern void hlparsetext(Oid cfgId, HeadlineText * prs, TSQuery query,
              char *buf, int4 buflen);
! extern text *generatHeadline(HeadlineText * prs);

  /*
   * token/node types for parsing
--- 131,137 ----

  extern void hlparsetext(Oid cfgId, HeadlineText * prs, TSQuery query,
              char *buf, int4 buflen);
! extern text *generateHeadline(HeadlineText * prs);

  /*
   * token/node types for parsing
Index: src/include/tsearch/dicts/spell.h
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/tsearch/dicts/spell.h,v
retrieving revision 1.1
diff -c -r1.1 spell.h
*** src/include/tsearch/dicts/spell.h    21 Aug 2007 01:11:29 -0000    1.1
--- src/include/tsearch/dicts/spell.h    24 Aug 2007 10:59:49 -0000
***************
*** 18,23 ****
--- 18,29 ----
  #include "tsearch/dicts/regis.h"
  #include "tsearch/ts_public.h"

+ /*
+  * Max length of a flag name. Names longer than this will be truncated
+  * to the maximum.
+  */
+ #define MAXFLAGLEN 16
+
  struct SPNode;

  typedef struct
***************
*** 54,67 ****
  {
      union
      {
!         char        flag[16];
          struct
          {
              int            affix;
              int            len;
          }            d;
      }            p;
!     char        word[1];
  } SPELL;

  #define SPELLHDRSZ    (offsetof(SPELL, word))
--- 60,76 ----
  {
      union
      {
!         /* flag is filled in by NIImportDictionary. After NISortDictionary,
!          * d is valid and flag is invalid.
!          */
!         char        flag[MAXFLAGLEN];
          struct
          {
              int            affix;
              int            len;
          }            d;
      }            p;
!     char        word[1]; /* variable length, null-terminated */
  } SPELL;

  #define SPELLHDRSZ    (offsetof(SPELL, word))
***************
*** 90,95 ****
--- 99,110 ----
  #define FF_COMPOUNDPERMITFLAG    0x10
  #define FF_COMPOUNDFORBIDFLAG    0x20
  #define FF_CROSSPRODUCT            0x40
+
+ /*
+  * Don't change the order of these. Initialization
+  * sorts by these, and expects prefixes to
+  * come first after sorting.
+  */
  #define FF_SUFFIX                1
  #define FF_PREFIX                0

***************
*** 126,134 ****
      int            naffixes;
      AFFIX       *Affix;

!     int            nspell;
!     int            mspell;
      SPELL      **Spell;

      AffixNode  *Suffix;
      AffixNode  *Prefix;
--- 141,151 ----
      int            naffixes;
      AFFIX       *Affix;

!     /* Temporary array of all words in the dict file. Only used during
!      * initialization */
      SPELL      **Spell;
+     int            nspell; /* number of entries in Spell-array */
+     int            mspell; /* allocated length of Spell-array */

      AffixNode  *Suffix;
      AffixNode  *Prefix;

Re: Bunch of tsearch fixes and cleanup

From

Tom Lane

Date:

24 August 2007, 15:36:35

"Heikki Linnakangas" <heikki@enterprisedb.com> writes:
> Ok, here's an updated version of the patch.

I haven't actually read this patch yet, but the description all sounds
like the Right Thing now.  Will review and commit today.

Also, I believe there's consensus to rename the standard Snowball
dictionaries to "english_stem" etc, so I'll make that happen too.

            regards, tom lane

Re: Bunch of tsearch fixes and cleanup

From

Tom Lane

Date:

25 August 2007, 00:05:53

"Heikki Linnakangas" <heikki@enterprisedb.com> writes:
> Ok, here's an updated version of the patch.

Applied, with a few trivial additional cleanups I noticed while reading
the patch.  I included your HeadlineText de-duplication too.

            regards, tom lane