Bunch of tsearch fixes and cleanup - Mailing list pgsql-patches

From Heikki Linnakangas
Subject Bunch of tsearch fixes and cleanup
Date
Msg-id 46CD9713.6080107@enterprisedb.com
Whole thread Raw
Responses Re: Bunch of tsearch fixes and cleanup  (Tom Lane <tgl@sss.pgh.pa.us>)
List pgsql-patches
Fixes the following bugs:
- ispell initialization crashed on empty dictionary file
- ispell initialization crashed on affix file with prefixes but no suffixes
- stop words file was run through pg_verifymbstr, with database
encoding, but it's later interpreted as being UTF-8. Now verifies that
it's UTF-8, regardless of database encoding.

Other changes:

- readstoplist now sorts the stop words after loading them. Removed the
separate sortstoplist function.

- readstoplist calls recode_and_lowerstr directly, instead of using the
 "wordop" function pointer in StopList struct. All callers used
recode_and_lowerstr anyway, so this simplifies the code a little bit. Are
there any external dictionary implementations that would require
different behavior?

- bunch of comments added, typos fixed, and other cleanup

The code still needs lots of love, but it's a start...

--
  Heikki Linnakangas
  EnterpriseDB   http://www.enterprisedb.com
Index: src/backend/snowball/dict_snowball.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/snowball/dict_snowball.c,v
retrieving revision 1.2
diff -c -r1.2 dict_snowball.c
*** src/backend/snowball/dict_snowball.c    22 Aug 2007 01:39:44 -0000    1.2
--- src/backend/snowball/dict_snowball.c    23 Aug 2007 10:55:53 -0000
***************
*** 192,198 ****
      ListCell   *l;

      d = (DictSnowball *) palloc0(sizeof(DictSnowball));
-     d->stoplist.wordop = recode_and_lowerstr;

      foreach(l, dictoptions)
      {
--- 192,197 ----
***************
*** 205,211 ****
                          (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                           errmsg("multiple StopWords parameters")));
              readstoplist(defGetString(defel), &d->stoplist);
-             sortstoplist(&d->stoplist);
              stoploaded = true;
          }
          else if (pg_strcasecmp("Language", defel->defname) == 0)
--- 204,209 ----
Index: src/backend/tsearch/dict_ispell.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/tsearch/dict_ispell.c,v
retrieving revision 1.2
diff -c -r1.2 dict_ispell.c
*** src/backend/tsearch/dict_ispell.c    22 Aug 2007 01:39:44 -0000    1.2
--- src/backend/tsearch/dict_ispell.c    23 Aug 2007 10:57:12 -0000
***************
*** 39,45 ****
      ListCell   *l;

      d = (DictISpell *) palloc0(sizeof(DictISpell));
-     d->stoplist.wordop = recode_and_lowerstr;

      foreach(l, dictoptions)
      {
--- 39,44 ----
***************
*** 74,80 ****
                          (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                           errmsg("multiple StopWords parameters")));
              readstoplist(defGetString(defel), &(d->stoplist));
-             sortstoplist(&(d->stoplist));
              stoploaded = true;
          }
          else
--- 73,78 ----
Index: src/backend/tsearch/dict_simple.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/tsearch/dict_simple.c,v
retrieving revision 1.2
diff -c -r1.2 dict_simple.c
*** src/backend/tsearch/dict_simple.c    22 Aug 2007 01:39:44 -0000    1.2
--- src/backend/tsearch/dict_simple.c    23 Aug 2007 11:15:55 -0000
***************
*** 23,41 ****
  typedef struct
  {
      StopList    stoplist;
! } DictExample;


  Datum
  dsimple_init(PG_FUNCTION_ARGS)
  {
      List       *dictoptions = (List *) PG_GETARG_POINTER(0);
!     DictExample *d = (DictExample *) palloc0(sizeof(DictExample));
      bool        stoploaded = false;
      ListCell   *l;

-     d->stoplist.wordop = recode_and_lowerstr;
-
      foreach(l, dictoptions)
      {
          DefElem    *defel = (DefElem *) lfirst(l);
--- 23,39 ----
  typedef struct
  {
      StopList    stoplist;
! } DictSimple;


  Datum
  dsimple_init(PG_FUNCTION_ARGS)
  {
      List       *dictoptions = (List *) PG_GETARG_POINTER(0);
!     DictSimple *d = (DictSimple *) palloc0(sizeof(DictSimple));
      bool        stoploaded = false;
      ListCell   *l;

      foreach(l, dictoptions)
      {
          DefElem    *defel = (DefElem *) lfirst(l);
***************
*** 47,53 ****
                          (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                           errmsg("multiple StopWords parameters")));
              readstoplist(defGetString(defel), &d->stoplist);
-             sortstoplist(&d->stoplist);
              stoploaded = true;
          }
          else
--- 45,50 ----
***************
*** 65,80 ****
  Datum
  dsimple_lexize(PG_FUNCTION_ARGS)
  {
!     DictExample *d = (DictExample *) PG_GETARG_POINTER(0);
      char       *in = (char *) PG_GETARG_POINTER(1);
      int32       len = PG_GETARG_INT32(2);
!     char       *txt = lowerstr_with_len(in, len);
      TSLexeme   *res = palloc0(sizeof(TSLexeme) * 2);

      if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
-     {
          pfree(txt);
-     }
      else
          res[0].lexeme = txt;

--- 62,77 ----
  Datum
  dsimple_lexize(PG_FUNCTION_ARGS)
  {
!     DictSimple *d = (DictSimple *) PG_GETARG_POINTER(0);
      char       *in = (char *) PG_GETARG_POINTER(1);
      int32       len = PG_GETARG_INT32(2);
!     char       *txt;
      TSLexeme   *res = palloc0(sizeof(TSLexeme) * 2);

+     txt = lowerstr_with_len(in, len);
+
      if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
          pfree(txt);
      else
          res[0].lexeme = txt;

Index: src/backend/tsearch/dict_synonym.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/tsearch/dict_synonym.c,v
retrieving revision 1.2
diff -c -r1.2 dict_synonym.c
*** src/backend/tsearch/dict_synonym.c    22 Aug 2007 04:13:15 -0000    1.2
--- src/backend/tsearch/dict_synonym.c    23 Aug 2007 13:09:47 -0000
***************
*** 31,40 ****

  typedef struct
  {
!     int            len;
      Syn           *syn;
  } DictSyn;

  static char *
  findwrd(char *in, char **end)
  {
--- 31,45 ----

  typedef struct
  {
!     int            len;    /* length of syn array */
      Syn           *syn;
  } DictSyn;

+ /*
+  * Finds the next whitespace-delimited word within the 'in' string.
+  * Returns a pointer to the first character of the word, and a pointer
+  * to the next byte after the last character in the word in *end.
+  */
  static char *
  findwrd(char *in, char **end)
  {
***************
*** 137,149 ****

          d->syn[cur].in = recode_and_lowerstr(starti);
          d->syn[cur].out = recode_and_lowerstr(starto);
-         if (!(d->syn[cur].in && d->syn[cur].out))
-         {
-             FreeFile(fin);
-             ereport(ERROR,
-                     (errcode(ERRCODE_OUT_OF_MEMORY),
-                      errmsg("out of memory")));
-         }

          cur++;
      }
--- 142,147 ----
***************
*** 151,158 ****
      FreeFile(fin);

      d->len = cur;
!     if (cur > 1)
!         qsort(d->syn, d->len, sizeof(Syn), compareSyn);

      PG_RETURN_POINTER(d);
  }
--- 149,155 ----
      FreeFile(fin);

      d->len = cur;
!     qsort(d->syn, d->len, sizeof(Syn), compareSyn);

      PG_RETURN_POINTER(d);
  }
***************
*** 179,186 ****
      if (!found)
          PG_RETURN_POINTER(NULL);

!     res = palloc(sizeof(TSLexeme) * 2);
!     memset(res, 0, sizeof(TSLexeme) * 2);
      res[0].lexeme = pstrdup(found->out);

      PG_RETURN_POINTER(res);
--- 176,182 ----
      if (!found)
          PG_RETURN_POINTER(NULL);

!     res = palloc0(sizeof(TSLexeme) * 2);
      res[0].lexeme = pstrdup(found->out);

      PG_RETURN_POINTER(res);
Index: src/backend/tsearch/spell.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/tsearch/spell.c,v
retrieving revision 1.1
diff -c -r1.1 spell.c
*** src/backend/tsearch/spell.c    21 Aug 2007 01:11:18 -0000    1.1
--- src/backend/tsearch/spell.c    23 Aug 2007 14:04:45 -0000
***************
*** 21,28 ****


  /*
!  * during initialization dictionary requires a lot
!  * of memory, so it will use temporary context
   */
  static MemoryContext tmpCtx = NULL;

--- 21,31 ----


  /*
!  * Initialization requires a lot of memory that's not needed
!  * after the initialization is done.  In init function,
!  * CurrentMemoryContext is a long lived memory context associated
!  * with the dictionary cache entry, so we use a temporary context
!  * for the short-lived stuff.
   */
  static MemoryContext tmpCtx = NULL;

***************
*** 32,37 ****
--- 35,43 ----
  static void
  checkTmpCtx(void)
  {
+     /* XXX: This assumes that CurrentMemoryContext doesn't have
+      * any children other than the one we create here.
+      */
      if (CurrentMemoryContext->firstchild == NULL)
      {
          tmpCtx = AllocSetContextCreate(CurrentMemoryContext,
***************
*** 78,93 ****
  }

  static char *
- strnduplicate(char *s, int len)
- {
-     char       *d = (char *) palloc(len + 1);
-
-     memcpy(d, s, len);
-     d[len] = '\0';
-     return d;
- }
-
- static char *
  findchar(char *str, int c)
  {
      while (*str)
--- 84,89 ----
***************
*** 185,191 ****
      }
      Conf->Spell[Conf->nspell] = (SPELL *) tmpalloc(SPELLHDRSZ + strlen(word) + 1);
      strcpy(Conf->Spell[Conf->nspell]->word, word);
!     strncpy(Conf->Spell[Conf->nspell]->p.flag, flag, 16);
      Conf->nspell++;
  }

--- 181,187 ----
      }
      Conf->Spell[Conf->nspell] = (SPELL *) tmpalloc(SPELLHDRSZ + strlen(word) + 1);
      strcpy(Conf->Spell[Conf->nspell]->word, word);
!     strncpy(Conf->Spell[Conf->nspell]->p.flag, flag, MAXFLAGLEN);
      Conf->nspell++;
  }

***************
*** 733,745 ****
      char        find[BUFSIZ];
      char        repl[BUFSIZ];
      char       *s;
!     int            suffixes = 0;
!     int            prefixes = 0;
      int            flag = 0;
      char        flagflags = 0;
      FILE       *affix;
      int            line = 0;
!     int            oldformat = 0;

      checkTmpCtx();

--- 729,741 ----
      char        find[BUFSIZ];
      char        repl[BUFSIZ];
      char       *s;
!     bool        suffixes = false;
!     bool        prefixes = false;
      int            flag = 0;
      char        flagflags = 0;
      FILE       *affix;
      int            line = 0;
!     bool        oldformat = false;

      checkTmpCtx();

***************
*** 777,798 ****
                      Conf->flagval[(unsigned int) *s] = FF_COMPOUNDFLAG;
                      Conf->usecompound = true;
                  }
!                 oldformat++;
                  continue;
              }
          }
          if (STRNCMP(pstr, "suffixes") == 0)
          {
!             suffixes = 1;
!             prefixes = 0;
!             oldformat++;
              continue;
          }
          if (STRNCMP(pstr, "prefixes") == 0)
          {
!             suffixes = 0;
!             prefixes = 1;
!             oldformat++;
              continue;
          }
          if (STRNCMP(pstr, "flag") == 0)
--- 773,794 ----
                      Conf->flagval[(unsigned int) *s] = FF_COMPOUNDFLAG;
                      Conf->usecompound = true;
                  }
!                 oldformat = true;
                  continue;
              }
          }
          if (STRNCMP(pstr, "suffixes") == 0)
          {
!             suffixes = true;
!             prefixes = false;
!             oldformat = true;
              continue;
          }
          if (STRNCMP(pstr, "prefixes") == 0)
          {
!             suffixes = false;
!             prefixes = true;
!             oldformat = true;
              continue;
          }
          if (STRNCMP(pstr, "flag") == 0)
***************
*** 802,808 ****

              while (*s && t_isspace(s))
                  s++;
!             oldformat++;

              /* allow only single-encoded flags */
              if (pg_mblen(s) != 1)
--- 798,804 ----

              while (*s && t_isspace(s))
                  s++;
!             oldformat = true;

              /* allow only single-encoded flags */
              if (pg_mblen(s) != 1)
***************
*** 978,1012 ****
  void
  NISortDictionary(IspellDict * Conf)
  {
!     size_t        i;
!     int            naffix = 3;

      checkTmpCtx();

      /* compress affixes */
      qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *), cmpspellaffix);
!     for (i = 1; i < Conf->nspell; i++)
!         if (strcmp(Conf->Spell[i]->p.flag, Conf->Spell[i - 1]->p.flag))
              naffix++;

      Conf->AffixData = (char **) palloc0(naffix * sizeof(char *));
!     naffix = 1;
!     Conf->AffixData[0] = pstrdup("");
!     Conf->AffixData[1] = pstrdup(Conf->Spell[0]->p.flag);
!     Conf->Spell[0]->p.d.affix = 1;
!     Conf->Spell[0]->p.d.len = strlen(Conf->Spell[0]->word);
!     for (i = 1; i < Conf->nspell; i++)
      {
!         if (strcmp(Conf->Spell[i]->p.flag, Conf->AffixData[naffix]))
          {
!             naffix++;
!             Conf->AffixData[naffix] = pstrdup(Conf->Spell[i]->p.flag);
          }
!         Conf->Spell[i]->p.d.affix = naffix;
          Conf->Spell[i]->p.d.len = strlen(Conf->Spell[i]->word);
      }

      Conf->lenAffixData = Conf->nAffixData = naffix;
      qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *), cmpspell);
      Conf->Dictionary = mkSPNode(Conf, 0, Conf->nspell, 0);

--- 974,1021 ----
  void
  NISortDictionary(IspellDict * Conf)
  {
!     int    i;
!     int    naffix = 0;
!     int    curaffix;

      checkTmpCtx();

      /* compress affixes */
+
+     /* Count the number of different flags used in the dictionary */
+
      qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *), cmpspellaffix);
!
!     naffix = 0;
!     for (i = 0; i < Conf->nspell; i++)
!     {
!         if (i == 0 || strncmp(Conf->Spell[i]->p.flag, Conf->Spell[i - 1]->p.flag, MAXFLAGLEN))
              naffix++;
+     }

+     /*
+      * Fill in Conf->AffixData with the affixes that were used
+      * in the dictionary. Replace textual flag-field of Conf->Spell
+      * entries with indexes into Conf->AffixData array.
+      */
      Conf->AffixData = (char **) palloc0(naffix * sizeof(char *));
!
!     curaffix = -1;
!     for (i = 0; i < Conf->nspell; i++)
      {
!         if (i == 0 || strncmp(Conf->Spell[i]->p.flag, Conf->AffixData[curaffix], MAXFLAGLEN))
          {
!             curaffix++;
!             Assert(curaffix < naffix);
!             Conf->AffixData[curaffix] = pstrdup(Conf->Spell[i]->p.flag);
          }
!
!         Conf->Spell[i]->p.d.affix = curaffix;
          Conf->Spell[i]->p.d.len = strlen(Conf->Spell[i]->word);
      }

      Conf->lenAffixData = Conf->nAffixData = naffix;
+
      qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *), cmpspell);
      Conf->Dictionary = mkSPNode(Conf, 0, Conf->nspell, 0);

***************
*** 1085,1091 ****
  }

  static void
! mkVoidAffix(IspellDict * Conf, int issuffix, int startsuffix)
  {
      int            i,
                  cnt = 0;
--- 1094,1100 ----
  }

  static void
! mkVoidAffix(IspellDict * Conf, bool issuffix, int startsuffix)
  {
      int            i,
                  cnt = 0;
***************
*** 1145,1151 ****
      AFFIX       *Affix;
      size_t        i;
      CMPDAffix  *ptr;
!     int            firstsuffix = -1;

      checkTmpCtx();

--- 1154,1160 ----
      AFFIX       *Affix;
      size_t        i;
      CMPDAffix  *ptr;
!     int            firstsuffix = Conf->naffixes;

      checkTmpCtx();

***************
*** 1160,1166 ****
      for (i = 0; i < Conf->naffixes; i++)
      {
          Affix = &(((AFFIX *) Conf->Affix)[i]);
!         if (Affix->type == FF_SUFFIX && firstsuffix < 0)
              firstsuffix = i;

          if ((Affix->flagflags & FF_COMPOUNDFLAG) && Affix->replen > 0 &&
--- 1169,1175 ----
      for (i = 0; i < Conf->naffixes; i++)
      {
          Affix = &(((AFFIX *) Conf->Affix)[i]);
!         if (Affix->type == FF_SUFFIX && i < firstsuffix)
              firstsuffix = i;

          if ((Affix->flagflags & FF_COMPOUNDFLAG) && Affix->replen > 0 &&
***************
*** 1185,1196 ****

      Conf->Prefix = mkANode(Conf, 0, firstsuffix, 0, FF_PREFIX);
      Conf->Suffix = mkANode(Conf, firstsuffix, Conf->naffixes, 0, FF_SUFFIX);
!     mkVoidAffix(Conf, 1, firstsuffix);
!     mkVoidAffix(Conf, 0, firstsuffix);
  }

  static AffixNodeData *
! FinfAffixes(AffixNode * node, const char *word, int wrdlen, int *level, int type)
  {
      AffixNodeData *StopLow,
                 *StopHigh,
--- 1194,1205 ----

      Conf->Prefix = mkANode(Conf, 0, firstsuffix, 0, FF_PREFIX);
      Conf->Suffix = mkANode(Conf, firstsuffix, Conf->naffixes, 0, FF_SUFFIX);
!     mkVoidAffix(Conf, true, firstsuffix);
!     mkVoidAffix(Conf, false, firstsuffix);
  }

  static AffixNodeData *
! FindAffixes(AffixNode * node, const char *word, int wrdlen, int *level, int type)
  {
      AffixNodeData *StopLow,
                 *StopHigh,
***************
*** 1374,1380 ****
      plevel = 0;
      while (pnode)
      {
!         prefix = FinfAffixes(pnode, word, wrdlen, &plevel, FF_PREFIX);
          if (!prefix)
              break;
          for (j = 0; j < prefix->naff; j++)
--- 1383,1389 ----
      plevel = 0;
      while (pnode)
      {
!         prefix = FindAffixes(pnode, word, wrdlen, &plevel, FF_PREFIX);
          if (!prefix)
              break;
          for (j = 0; j < prefix->naff; j++)
***************
*** 1398,1404 ****
          int            baselen = 0;

          /* find possible suffix */
!         suffix = FinfAffixes(snode, word, wrdlen, &slevel, FF_SUFFIX);
          if (!suffix)
              break;
          /* foreach suffix check affix */
--- 1407,1413 ----
          int            baselen = 0;

          /* find possible suffix */
!         suffix = FindAffixes(snode, word, wrdlen, &slevel, FF_SUFFIX);
          if (!suffix)
              break;
          /* foreach suffix check affix */
***************
*** 1416,1422 ****
                  swrdlen = strlen(newword);
                  while (pnode)
                  {
!                     prefix = FinfAffixes(pnode, newword, swrdlen, &plevel, FF_PREFIX);
                      if (!prefix)
                          break;
                      for (j = 0; j < prefix->naff; j++)
--- 1425,1431 ----
                  swrdlen = strlen(newword);
                  while (pnode)
                  {
!                     prefix = FindAffixes(pnode, newword, swrdlen, &plevel, FF_PREFIX);
                      if (!prefix)
                          break;
                      for (j = 0; j < prefix->naff; j++)
***************
*** 1626,1632 ****
                      if (wordlen == level + 1)
                      {
                          /* well, it was last word */
!                         var->stem[var->nstem] = strnduplicate(word + startpos, wordlen - startpos);
                          var->nstem++;
                          pfree(notprobed);
                          return var;
--- 1635,1641 ----
                      if (wordlen == level + 1)
                      {
                          /* well, it was last word */
!                         var->stem[var->nstem] = pnstrdup(word + startpos, wordlen - startpos);
                          var->nstem++;
                          pfree(notprobed);
                          return var;
***************
*** 1641,1647 ****
                          ptr->next = SplitToVariants(Conf, node, var, word, wordlen, startpos, level);
                          /* we can find next word */
                          level++;
!                         var->stem[var->nstem] = strnduplicate(word + startpos, level - startpos);
                          var->nstem++;
                          node = Conf->Dictionary;
                          startpos = level;
--- 1650,1656 ----
                          ptr->next = SplitToVariants(Conf, node, var, word, wordlen, startpos, level);
                          /* we can find next word */
                          level++;
!                         var->stem[var->nstem] = pnstrdup(word + startpos, level - startpos);
                          var->nstem++;
                          node = Conf->Dictionary;
                          startpos = level;
***************
*** 1656,1662 ****
          level++;
      }

!     var->stem[var->nstem] = strnduplicate(word + startpos, wordlen - startpos);
      var->nstem++;
      pfree(notprobed);
      return var;
--- 1665,1671 ----
          level++;
      }

!     var->stem[var->nstem] = pnstrdup(word + startpos, wordlen - startpos);
      var->nstem++;
      pfree(notprobed);
      return var;
Index: src/backend/tsearch/ts_parse.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/tsearch/ts_parse.c,v
retrieving revision 1.1
diff -c -r1.1 ts_parse.c
*** src/backend/tsearch/ts_parse.c    21 Aug 2007 01:11:18 -0000    1.1
--- src/backend/tsearch/ts_parse.c    23 Aug 2007 12:29:51 -0000
***************
*** 308,314 ****
              {
                  /*
                   * Dictionary normalizes lexemes, so we remove from stack all
!                  * used lexemes , return to basic mode and redo end of stack
                   * (if it exists)
                   */
                  if (res)
--- 308,314 ----
              {
                  /*
                   * Dictionary normalizes lexemes, so we remove from stack all
!                  * used lexemes, return to basic mode and redo end of stack
                   * (if it exists)
                   */
                  if (res)
***************
*** 571,577 ****
  }

  text *
! generatHeadline(HeadlineText * prs)
  {
      text       *out;
      int            len = 128;
--- 571,577 ----
  }

  text *
! generateHeadline(HeadlineText * prs)
  {
      text       *out;
      int            len = 128;
Index: src/backend/tsearch/ts_utils.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/tsearch/ts_utils.c,v
retrieving revision 1.2
diff -c -r1.2 ts_utils.c
*** src/backend/tsearch/ts_utils.c    22 Aug 2007 01:39:44 -0000    1.2
--- src/backend/tsearch/ts_utils.c    23 Aug 2007 12:51:06 -0000
***************
*** 63,70 ****
--- 63,82 ----
      return result;
  }

+ static int
+ comparestr(const void *a, const void *b)
+ {
+     return strcmp(*(char **) a, *(char **) b);
+ }
+
  #define STOPBUFLEN    4096

+ /*
+  * Reads a stopword file.
+  *
+  * The file must be in UTF-8 encoding, it will be converted to database
+  * encoding.
+  */
  void
  readstoplist(char *in, StopList * s)
  {
***************
*** 97,108 ****
              if (*buf == '\0')
                  continue;

!             if (!pg_verifymbstr(buf, strlen(buf), true))
              {
                  FreeFile(hin);
                  ereport(ERROR,
                          (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
!                          errmsg("invalid multibyte encoding at line %d in file \"%s\"",
                                  line, filename)));
              }

--- 109,120 ----
              if (*buf == '\0')
                  continue;

!             if (!pg_verify_mbstr(PG_UTF8, buf, strlen(buf), true))
              {
                  FreeFile(hin);
                  ereport(ERROR,
                          (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
!                          errmsg("invalid UTF-8 encoding at line %d in file \"%s\"",
                                  line, filename)));
              }

***************
*** 120,130 ****
                  }
              }

!
!             if (s->wordop)
!                 stop[s->len] = s->wordop(buf);
!             else
!                 stop[s->len] = pstrdup(buf);

              (s->len)++;
          }
--- 132,138 ----
                  }
              }

!             stop[s->len] = recode_and_lowerstr(buf);

              (s->len)++;
          }
***************
*** 133,149 ****
      }

      s->stop = stop;
- }

! static int
! comparestr(const void *a, const void *b)
! {
!     return strcmp(*(char **) a, *(char **) b);
! }
!
! void
! sortstoplist(StopList * s)
! {
      if (s->stop && s->len > 0)
          qsort(s->stop, s->len, sizeof(char *), comparestr);
  }
--- 141,148 ----
      }

      s->stop = stop;

!     /* Sort to allow binary searching */
      if (s->stop && s->len > 0)
          qsort(s->stop, s->len, sizeof(char *), comparestr);
  }
Index: src/backend/tsearch/wparser.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/tsearch/wparser.c,v
retrieving revision 1.2
diff -c -r1.2 wparser.c
*** src/backend/tsearch/wparser.c    22 Aug 2007 01:39:45 -0000    1.2
--- src/backend/tsearch/wparser.c    23 Aug 2007 12:29:59 -0000
***************
*** 325,331 ****
                    PointerGetDatum(prsoptions),
                    PointerGetDatum(query));

!     out = generatHeadline(&prs);

      PG_FREE_IF_COPY(in, 1);
      PG_FREE_IF_COPY(query, 2);
--- 325,331 ----
                    PointerGetDatum(prsoptions),
                    PointerGetDatum(query));

!     out = generateHeadline(&prs);

      PG_FREE_IF_COPY(in, 1);
      PG_FREE_IF_COPY(query, 2);
Index: src/include/tsearch/ts_public.h
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/tsearch/ts_public.h,v
retrieving revision 1.2
diff -c -r1.2 ts_public.h
*** src/include/tsearch/ts_public.h    22 Aug 2007 01:39:46 -0000    1.2
--- src/include/tsearch/ts_public.h    23 Aug 2007 10:54:50 -0000
***************
*** 71,80 ****
  {
      int            len;
      char      **stop;
-     char       *(*wordop) (char *);
  } StopList;

- extern void sortstoplist(StopList * s);
  extern void readstoplist(char *in, StopList * s);
  extern bool searchstoplist(StopList * s, char *key);

--- 71,78 ----
Index: src/include/tsearch/ts_utils.h
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/tsearch/ts_utils.h,v
retrieving revision 1.1
diff -c -r1.1 ts_utils.h
*** src/include/tsearch/ts_utils.h    21 Aug 2007 01:11:29 -0000    1.1
--- src/include/tsearch/ts_utils.h    23 Aug 2007 12:30:32 -0000
***************
*** 102,108 ****
   * headline framework, flow in common to generate:
   *    1 parse text with hlparsetext
   *    2 parser-specific function to find part
!  *    3 generatHeadline to generate result text
   */

  typedef struct
--- 102,108 ----
   * headline framework, flow in common to generate:
   *    1 parse text with hlparsetext
   *    2 parser-specific function to find part
!  *    3 generateHeadline to generate result text
   */

  typedef struct
***************
*** 131,137 ****

  extern void hlparsetext(Oid cfgId, HeadlineText * prs, TSQuery query,
              char *buf, int4 buflen);
! extern text *generatHeadline(HeadlineText * prs);

  /*
   * token/node types for parsing
--- 131,137 ----

  extern void hlparsetext(Oid cfgId, HeadlineText * prs, TSQuery query,
              char *buf, int4 buflen);
! extern text *generateHeadline(HeadlineText * prs);

  /*
   * token/node types for parsing
Index: src/include/tsearch/dicts/spell.h
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/tsearch/dicts/spell.h,v
retrieving revision 1.1
diff -c -r1.1 spell.h
*** src/include/tsearch/dicts/spell.h    21 Aug 2007 01:11:29 -0000    1.1
--- src/include/tsearch/dicts/spell.h    23 Aug 2007 14:02:52 -0000
***************
*** 18,23 ****
--- 18,29 ----
  #include "tsearch/dicts/regis.h"
  #include "tsearch/ts_public.h"

+ /*
+  * Max length of a flag name. Names longer than this will be truncated
+  * to the maximum.
+  */
+ #define MAXFLAGLEN 16
+
  struct SPNode;

  typedef struct
***************
*** 54,67 ****
  {
      union
      {
!         char        flag[16];
          struct
          {
              int            affix;
              int            len;
          }            d;
      }            p;
!     char        word[1];
  } SPELL;

  #define SPELLHDRSZ    (offsetof(SPELL, word))
--- 60,76 ----
  {
      union
      {
!         /* flag is filled in by NIImportDictionary, and after NISortDictionary,
!          * d is used and flag is invalid.
!          */
!         char        flag[MAXFLAGLEN];
          struct
          {
              int            affix;
              int            len;
          }            d;
      }            p;
!     char        word[1]; /* variable length, null-terminated */
  } SPELL;

  #define SPELLHDRSZ    (offsetof(SPELL, word))
***************
*** 90,95 ****
--- 99,109 ----
  #define FF_COMPOUNDPERMITFLAG    0x10
  #define FF_COMPOUNDFORBIDFLAG    0x20
  #define FF_CROSSPRODUCT            0x40
+
+ /*
+  * don't change the ordering of these because it's
+  * taken advantage of in initialization
+  */
  #define FF_SUFFIX                1
  #define FF_PREFIX                0

***************
*** 126,134 ****
      int            naffixes;
      AFFIX       *Affix;

!     int            nspell;
!     int            mspell;
      SPELL      **Spell;

      AffixNode  *Suffix;
      AffixNode  *Prefix;
--- 140,150 ----
      int            naffixes;
      AFFIX       *Affix;

!     /* Temporary array of all words in the dict file. Only used during
!      * initialization */
      SPELL      **Spell;
+     int            nspell; /* number of entries in Spell-array */
+     int            mspell; /* allocated length of Spell-array */

      AffixNode  *Suffix;
      AffixNode  *Prefix;

pgsql-patches by date:

Previous
From: "Marko Kreen"
Date:
Subject: Re: [BUGS] BUG #3571: call to decrypt causes segfault
Next
From: Tom Lane
Date:
Subject: Re: Bunch of tsearch fixes and cleanup