Thread: New Full Text Index using contrib/fulltextindex which now able to processing Traditional Chinese characters(Big5 encoding)

New Full Text Index using contrib/fulltextindex which now able to processing Traditional Chinese characters(Big5 encoding)

From

"eggli"

Date:

09 July 2002, 16:17:17

Hi all, I found that contrib/fulltextindex is unable to process multibyte
characters, so I adapted it to handle my native language, Chinese. I
believe it could also be made to process Unicode by using wcstok(), but I
have been too lazy to complete that. ;)

USAGE:

Just replace fti.c in contrib/fulltextindex with the version from this mail
and rebuild it.

Indexing and querying work almost the same way as before.

CODE:
/* The difference: breakup() and isleadbyte() */

#include "postgres.h"

#include <ctype.h>

#include "executor/spi.h"
#include "commands/trigger.h"

/*
 * Trigger function accepts variable number of arguments:
 *
 *  1. relation in which to store the substrings
 *  2. fields to extract substrings from
 *
 * The relation in which to insert *must* have the following layout:
 *
 *  string  varchar(#)
 *  id   oid
 *
 *  where # is the largest size of the varchar columns being indexed
 *
 * Example:
 *
 * -- Create the SQL function based on the compiled shared object
 * create function fti() returns opaque as
 *   '/usr/local/pgsql/lib/contrib/fti.so' language 'C';
 *
 * -- Create the FTI table
 * create table product_fti (string varchar(255), id oid);
 *
 * -- Create an index to assist string matches
 * create index product_fti_string_idx on product_fti (string);
 *
 * -- Create an index to assist trigger'd deletes
 * create index product_fti_id_idx on product_fti (id);
 *
 * -- Create an index on the product oid column to assist joins
 * -- between the fti table and the product table
 * create index product_oid_idx on product (oid);
 *
 * -- Create the trigger to perform incremental changes to the full text
 * -- index.
 * create trigger product_fti_trig after update or insert or delete on
 * product
 * for each row execute procedure fti(product_fti, title, artist);
 *            ^^^^^^^^^^^
 *            table where full text index is stored
 *             ^^^^^^^^^^^^^
 *             columns to index in the base table
 *
 * After populating 'product', try something like:
 *
 * SELECT DISTINCT(p.*) FROM product p, product_fti f1, product_fti f2 WHERE
 * f1.string ~ '^slippery' AND f2.string ~ '^wet' AND p.oid=f1.id AND
 * p.oid=f2.id;
 *
 * To check that your indices are being used correctly, make sure you
 * EXPLAIN SELECT ... your test query above.
 *
 * CHANGELOG
 * ---------
 *
 * august 3 2001
 *     Extended fti function to accept more than one column as a
 *     parameter and all specified columns are indexed.  Changed
 *     all uses of sprintf to snprintf.  Made error messages more
 *     consistent.
 *
 * march 4 1998 Changed breakup() to return less substrings. Only breakup
 *     in word parts which are in turn shortened from the start
 *     of the word (ie. word, ord, rd)
 *     Did allocation of substring buffer outside of breakup()
 *
 * oct. 5 1997, fixed a bug in string breakup (where there is more than
 *     one nonalpha character between words).
 *
 * oct 4-5 1997 implemented the thing, at least the basic functionality
 *     of it all....
 *
 * TODO
 * ----
 *
 *  prevent generating duplicate words for an oid in the fti table
 *  save a plan for deletes
 *  create a function that will make the index *after* we have populated
 *  the main table (probably first delete all contents to be sure there's
 *  nothing in it, then re-populate the fti-table)
 *
 *  can we do something with operator overloading or a separate function
 *  that can build the final query automatically?
 */

/*
 * Size in bytes of the scratch buffer fti() uses both for plan-cache keys
 * and for the SQL text of the INSERT/DELETE statements it prepares.
 */
#define MAX_FTI_QUERY_LENGTH 8192

/* Trigger entry point (fmgr version-1 calling convention). */
extern Datum fti(PG_FUNCTION_ARGS);
/* Returns the next substring of a column value, or NULL when exhausted. */
static char *breakup(char *, char *);
/* True when the word is listed in StopWords (only with USE_STOP_WORDS). */
static bool is_stopword(char *);
/* True when the byte starts a two-byte (Big5) character. */
static bool isleadbyte(unsigned char ch);
/*
 * Handshake between fti() and breakup(): fti() sets this to true before
 * scanning a new column value; breakup() then resets its static cursor
 * state and clears the flag again.
 */
static bool new_tuple = false;


#ifdef USE_STOP_WORDS

/*
 * Words that breakup() must not return for indexing.  The table is only
 * compiled in when USE_STOP_WORDS is defined, and is_stopword() compares
 * entries with strcmp(), so entries must be in strcmp() order.
 */
/* THIS LIST MUST BE IN SORTED ORDER, A BINARY SEARCH IS USED!!!! */
char    *StopWords[] = {  /* list of words to skip in indexing */
 "no",
 "the",
 "yes"
};
#endif   /* USE_STOP_WORDS */

/*
 * One entry of a saved-plan cache (approach borrowed from contrib/spi/\*.c).
 * Entries are created by find_plan() and filled in by fti().
 */
typedef struct
{
 /* malloc'd key string; find_plan() matches it with strcmp() */
 char    *ident;
 /* 0 for a freshly created entry; fti() sets it to 1 once a plan is saved */
 int   nplans;
 /* malloc'd array of saved SPI plan pointers; NULL until fti() fills it */
 void   **splan;
} EPlan;

/*
 * Plan caches, grown by find_plan() with malloc/realloc.  fti() keeps
 * INSERT plans and DELETE plans in separate arrays; the n* counters hold
 * the number of entries currently in each array.
 */
static EPlan *InsertPlans = NULL;
static EPlan *DeletePlans = NULL;
static int nInsertPlans = 0;
static int nDeletePlans = 0;

/* Look up `ident` in the given cache, appending a new empty entry on a miss. */
static EPlan *find_plan(char *ident, EPlan ** eplan, int *nplans);

/***********************************************************************/
PG_FUNCTION_INFO_V1(fti);

Datum
fti(PG_FUNCTION_ARGS)
{
 TriggerData *trigdata;
 Trigger    *trigger;  /* to get trigger name */
 int   nargs;   /* # of arguments */
 char   **args;   /* arguments */
 char    *relname;  /* triggered relation name */
 Relation rel;   /* triggered relation */
 char    *indexname;  /* name of table for substrings */
 HeapTuple rettuple = NULL;
 TupleDesc tupdesc;  /* tuple description */
 bool  isinsert = false;
 bool  isdelete = false;
 int   ret;
 char  query[MAX_FTI_QUERY_LENGTH];
 Oid   oid;

 /*
  * FILE   *debug;
  */

 /*
  * debug = fopen("/dev/xconsole", "w"); fprintf(debug, "FTI: entered
  * function\n"); fflush(debug);
  */

 if (!CALLED_AS_TRIGGER(fcinfo))
  elog(ERROR, "Full Text Indexing: Not fired by trigger manager");

 /* It's safe to cast now that we've checked */
 trigdata = (TriggerData *) fcinfo->context;

 if (TRIGGER_FIRED_FOR_STATEMENT(trigdata->tg_event))
  elog(ERROR, "Full Text Indexing: Can't process STATEMENT events");
 if (TRIGGER_FIRED_BEFORE(trigdata->tg_event))
  elog(ERROR, "Full Text Indexing: Must be fired AFTER event");

 if (TRIGGER_FIRED_BY_INSERT(trigdata->tg_event))
  isinsert = true;
 if (TRIGGER_FIRED_BY_UPDATE(trigdata->tg_event))
 {
  isdelete = true;
  isinsert = true;
 }
 if (TRIGGER_FIRED_BY_DELETE(trigdata->tg_event))
  isdelete = true;

 trigger = trigdata->tg_trigger;
 rel = trigdata->tg_relation;
 relname = SPI_getrelname(rel);
 rettuple = trigdata->tg_trigtuple;
 if (isdelete && isinsert) /* is an UPDATE */
  rettuple = trigdata->tg_newtuple;

       if ((ret = SPI_connect()) < 0)
  elog(ERROR, "Full Text Indexing: SPI_connect: Failed, returned %d\n",
ret);

 nargs = trigger->tgnargs;
 if (nargs < 2)
  elog(ERROR, "Full Text Indexing: Trigger must have at least 2
arguments\n");

 args = trigger->tgargs;
 indexname = args[0];
 tupdesc = rel->rd_att;  /* what the tuple looks like (?) */

 /* get oid of current tuple, needed by all, so place here */
 oid = rettuple->t_data->t_oid;
 if (!OidIsValid(oid))
  elog(ERROR, "Full Text Indexing: Oid of current tuple is invalid");

 if (isdelete)
 {
  void    *pplan;
  Oid     *argtypes;
  Datum  values[1];
  EPlan    *plan;
  int   i;

  snprintf(query, MAX_FTI_QUERY_LENGTH, "D%s", indexname);
  for (i = 1; i < nargs; i++)
   snprintf(query, MAX_FTI_QUERY_LENGTH, "%s$%s", query, args[i]);

  plan = find_plan(query, &DeletePlans, &nDeletePlans);
  if (plan->nplans <= 0)
  {
   argtypes = (Oid *) palloc(sizeof(Oid));

   argtypes[0] = OIDOID;

   snprintf(query, MAX_FTI_QUERY_LENGTH, "DELETE FROM %s WHERE id = $1",
indexname);
   pplan = SPI_prepare(query, 1, argtypes);
   if (!pplan)
    elog(ERROR, "Full Text Indexing: SPI_prepare: Returned NULL in delete");
   pplan = SPI_saveplan(pplan);
   if (pplan == NULL)
    elog(ERROR, "Full Text Indexing: SPI_saveplan: Returned NULL in
delete");

   plan->splan = (void **) malloc(sizeof(void *));
   *(plan->splan) = pplan;
   plan->nplans = 1;
  }

  values[0] = oid;

  ret = SPI_execp(*(plan->splan), values, NULL, 0);
  if (ret != SPI_OK_DELETE)
   elog(ERROR, "Full Text Indexing: SPI_execp: Error executing plan in
delete");
 }

 if (isinsert)
 {
  char    *substring;
  char    *column;
  void    *pplan;
  Oid     *argtypes;
  Datum  values[2];
  int   colnum;
  struct varlena *data;
  EPlan    *plan;
  int   i;
  char    *buff;
  char    *string;

  snprintf(query, MAX_FTI_QUERY_LENGTH, "I%s", indexname);
  for (i = 1; i < nargs; i++)
   snprintf(query, MAX_FTI_QUERY_LENGTH, "%s$%s", query, args[i]);

  plan = find_plan(query, &InsertPlans, &nInsertPlans);

  /* no plan yet, so allocate mem for argtypes */
  if (plan->nplans <= 0)
  {
   argtypes = (Oid *) palloc(2 * sizeof(Oid));

   argtypes[0] = VARCHAROID; /* create table t_name (string
           * varchar, */
   argtypes[1] = OIDOID;  /* id   oid);    */

   /* prepare plan to gain speed */
   snprintf(query, MAX_FTI_QUERY_LENGTH, "INSERT INTO %s (string, id) VALUES
($1, $2)",
      indexname);
   pplan = SPI_prepare(query, 2, argtypes);
   if (!pplan)
    elog(ERROR, "Full Text Indexing: SPI_prepare: Returned NULL in insert");

   pplan = SPI_saveplan(pplan);
   if (pplan == NULL)
    elog(ERROR, "Full Text Indexing: SPI_saveplan: Returned NULL in
insert");

   plan->splan = (void **) malloc(sizeof(void *));
   *(plan->splan) = pplan;
   plan->nplans = 1;
  }

  /* prepare plan for query */
  for (i = 0; i < nargs - 1; i++)
  {
   colnum = SPI_fnumber(tupdesc, args[i + 1]);
   if (colnum == SPI_ERROR_NOATTRIBUTE)
    elog(ERROR, "Full Text Indexing: SPI_fnumber: Column '%s' of '%s' not
found", args[i + 1], indexname);

   /* Get the char* representation of the column */
   column = SPI_getvalue(rettuple, tupdesc, colnum);

   /* make sure we don't try to index NULL's */
   if (column)
   {
    string = column;
    while (*string != '\0')
    {
     *string = tolower((unsigned char) *string);
     string++;
    }

    data = (struct varlena *) palloc(sizeof(int32) + strlen(column) +1);
    buff = palloc(strlen(column) + 1);
    /* saves lots of calls in while-loop and in breakup() */

    new_tuple = true;

    while ((substring = breakup(column, buff)))
    {
     int   l;

     l = strlen(substring);

     data->vl_len = l + sizeof(int32);
     memcpy(VARDATA(data), substring, l);
     values[0] = PointerGetDatum(data);
     values[1] = oid;

     ret = SPI_execp(*(plan->splan), values, NULL, 0);
     if (ret != SPI_OK_INSERT)
      elog(ERROR, "Full Text Indexing: SPI_execp: Error executing plan in
insert");
    }
    pfree(buff);
    pfree(data);
   }
  }
 }

 SPI_finish();
 return PointerGetDatum(rettuple);
}

/*
 * breakup - return the next substring of `string` to be indexed.
 *
 * fti() sets the file-level flag new_tuple before the first call for a
 * string; that call records the string and resets the cursor.  Every later
 * call continues where the previous one stopped, so `string` must stay
 * valid and unchanged until NULL is returned.
 *
 * A word is a run of alphanumeric bytes and/or two-byte characters (a lead
 * byte as recognised by isleadbyte() plus its trail byte).  Each call
 * extends the current word by one character and returns the prefix
 * collected so far ("w", "wo", "wor", "word"); any other byte ends the
 * word.  Prefixes rejected by is_stopword() are skipped.
 *
 * string:    NUL-terminated text to split.
 * substring: caller's buffer of at least strlen(string) + 1 bytes; the
 *            result is written there.
 *
 * Returns `substring`, or NULL once the whole string has been processed.
 */
static char *
breakup(char *string, char *substring)
{
    static char *last_start;    /* start of the word being extended */
    static char *cur_pos;       /* next byte to examine */
    static char *string_end;    /* the terminating NUL of 'string' */

    if (new_tuple)
    {
        string_end = string + strlen(string);
        cur_pos = last_start = string;
        new_tuple = false;      /* don't initialize this next time */
    }

    while (cur_pos < string_end)    /* don't read after end of 'string' */
    {
        unsigned char ch = (unsigned char) *cur_pos;
        size_t      len;

        if (isleadbyte(ch))
        {
            /*
             * Two-byte character.  If the trail byte is missing (truncated
             * input) consume only the byte that exists, so we never step
             * over the terminating NUL or overrun 'substring'.
             */
            cur_pos += (string_end - cur_pos >= 2) ? 2 : 1;
        }
        else if (isalnum(ch))
            cur_pos++;
        else
        {
            /* separator: the next word starts after it */
            last_start = ++cur_pos;
            continue;
        }

        len = (size_t) (cur_pos - last_start);
        memcpy(substring, last_start, len);
        substring[len] = '\0';
        if (!is_stopword(substring))
            return substring;
    }

    return NULL;                /* we've processed all of 'string' */
}

/*
 * isleadbyte - report whether `ch` is treated as the first byte of a
 * two-byte character by breakup().
 *
 * The accepted values are 0x81 through 0xFE inclusive; every other byte
 * value yields false.
 */
bool isleadbyte(unsigned char ch)
{
    /* 0x81-0x8D, 0x8E-0xA0 and 0xA1-0xFE are adjacent: one range check */
    return ch >= 0x81 && ch <= 0xFE;
}
/*
 * is_stopword - decide whether `text` must be left out of the index.
 *
 * When USE_STOP_WORDS is defined, the sorted StopWords table is binary
 * searched for an exact strcmp() match.  Without USE_STOP_WORDS nothing is
 * a stop word and the function always returns false.
 *
 * (Adapted from src/backend/parser/keywords.c.)
 */
static bool
is_stopword(char *text)
{
#ifdef USE_STOP_WORDS
    int         lo = 0;
    int         hi = (int) (sizeof(StopWords) / sizeof(StopWords[0]));

    /* Search the half-open index range [lo, hi) */
    while (lo < hi)
    {
        int         mid = lo + (hi - lo) / 2;
        int         cmp = strcmp(StopWords[mid], text);

        if (cmp == 0)
            return true;
        if (cmp < 0)
            lo = mid + 1;       /* table entry sorts before text */
        else
            hi = mid;           /* table entry sorts after text */
    }
#endif   /* USE_STOP_WORDS */

    return false;
}

/*
 * find_plan - look up, or create, the plan-cache entry named `ident`.
 * (Plan caching approach borrowed from contrib/spi/\*.c.)
 *
 * ident:  NUL-terminated key; it is copied, so the caller may reuse its
 *         buffer afterwards.
 * eplan:  address of the cache array; it is updated when the array is
 *         created or grown.
 * nplans: address of the entry count; incremented when an entry is added.
 *
 * Returns the existing entry when one has the same ident.  Otherwise a new
 * entry is appended with nplans == 0 and splan == NULL, and the caller is
 * expected to fill it in.  Returns NULL if memory cannot be allocated; in
 * that case no new entry is added and *nplans is not incremented.
 *
 * The returned pointer is only valid until the next call that grows the
 * same cache, because the array may be moved by realloc().
 */
static EPlan *
find_plan(char *ident, EPlan ** eplan, int *nplans)
{
    EPlan      *newp;
    char       *ident_copy;
    int         i;

    /* Existing entry? */
    for (i = 0; i < *nplans; i++)
    {
        if (strcmp((*eplan)[i].ident, ident) == 0)
            return *eplan + i;
    }

    /* Copy the key first so a failure leaves the cache untouched */
    ident_copy = (char *) malloc(strlen(ident) + 1);
    if (ident_copy == NULL)
        return NULL;
    strcpy(ident_copy, ident);

    if (*nplans > 0)
    {
        /* Grow via a temporary so the old array survives a failed realloc */
        EPlan      *grown;

        grown = (EPlan *) realloc(*eplan, (*nplans + 1) * sizeof(EPlan));
        if (grown == NULL)
        {
            free(ident_copy);
            return NULL;
        }
        *eplan = grown;
    }
    else
    {
        EPlan      *first = (EPlan *) malloc(sizeof(EPlan));

        if (first == NULL)
        {
            free(ident_copy);
            return NULL;
        }
        *eplan = first;
        *nplans = 0;
    }

    newp = *eplan + *nplans;
    newp->ident = ident_copy;
    newp->nplans = 0;
    newp->splan = NULL;
    (*nplans)++;

    return newp;
}

Re: New Full Text Index using contrib/fulltextindex which now able to processing Traditional Chinese characters(Big5 encoding)

From

"Christopher Kings-Lynne"

Date:

10 July 2002, 02:03:46

Hi eggli,

I'm currently working on another patch that has been submitted for
fulltextindex.  I will try to merge both yours and Florian's changes for the
fulltextindex module and get it into CVS for 7.3.

Cheers,

Chris

> -----Original Message-----
> From: pgsql-patches-owner@postgresql.org
> [mailto:pgsql-patches-owner@postgresql.org]On Behalf Of eggli
> Sent: Tuesday, 9 July 2002 6:58 PM
> To: pgsql-patches@postgresql.org
> Subject: [PATCHES] New Full Text Index using contrib/fulltextindex which
> now able to processing Traditional Chinese characters(Big5 encoding)
>
>
> Hi, all, I found that contrib/fulltextindex is unable to process multibyte
> characters, so I tried to make it suit for my mother language as
> Chinese, I
> believe it's able to process Unicode by wcrok(), but I'm lazy to complete
> it.;)
>
> USAGE:
>
> Just replace the fti.c from this mail on contrib/fulltextindex and re-make
> it.
>
> And the indexing/query way is most the same.
>
> CODE:
> /* The difference: breakup() and issleadbyte()
>
> #include "postgres.h"
>
> #include <ctype.h>
>
> #include "executor/spi.h"
> #include "commands/trigger.h"
>
> /*
>  * Trigger function accepts variable number of arguments:
>  *
>  *  1. relation in which to store the substrings
>  *  2. fields to extract substrings from
>  *
>  * The relation in which to insert *must* have the following layout:
>  *
>  *  string  varchar(#)
>  *  id   oid
>  *
>  *  where # is the largest size of the varchar columns being indexed
>  *
>  * Example:
>  *
>  * -- Create the SQL function based on the compiled shared object
>  * create function fti() returns opaque as
>  *   '/usr/local/pgsql/lib/contrib/fti.so' language 'C';
>  *
>  * -- Create the FTI table
>  * create table product_fti (string varchar(255), id oid);
>  *
>  * -- Create an index to assist string matches
>  * create index product_fti_string_idx on product_fti (string);
>  *
>  * -- Create an index to assist trigger'd deletes
>  * create index product_fti_id_idx on product_fti (id);
>  *
>  * -- Create an index on the product oid column to assist joins
>  * -- between the fti table and the product table
>  * create index product_oid_idx on product (oid);
>  *
>  * -- Create the trigger to perform incremental changes to the full text
> index.
>  * create trigger product_fti_trig after update or insert or delete on
> product
>  * for each row execute procedure fti(product_fti, title, artist);
>  *            ^^^^^^^^^^^
>  *            table where full text index is stored
>  *             ^^^^^^^^^^^^^
>  *             columns to index in the base table
>  *
>  * After populating 'product', try something like:
>  *
>  * SELECT DISTINCT(p.*) FROM product p, product_fti f1,
> product_fti f2 WHERE
>  * f1.string ~ '^slippery' AND f2.string ~ '^wet' AND p.oid=f1.id AND
> p.oid=f2.id;
>  *
>  * To check that your indicies are being used correctly, make sure you
>  * EXPLAIN SELECT ... your test query above.
>  *
>  * CHANGELOG
>  * ---------
>  *
>  * august 3 2001
>  *     Extended fti function to accept more than one column as a
>  *     parameter and all specified columns are indexed.  Changed
>  *     all uses of sprintf to snprintf.  Made error messages more
>  *     consistent.
>  *
>  * march 4 1998 Changed breakup() to return less substrings. Only breakup
>  *     in word parts which are in turn shortened from the start
>  *     of the word (ie. word, ord, rd)
>  *     Did allocation of substring buffer outside of breakup()
>  *
>  * oct. 5 1997, fixed a bug in string breakup (where there are
> more nonalpha
>  *     characters between words then 1).
>  *
>  * oct 4-5 1997 implemented the thing, at least the basic functionallity
>  *     of it all....
>  *
>  * TODO
>  * ----
>  *
>  *  prevent generating duplicate words for an oid in the fti table
>  *  save a plan for deletes
>  *  create a function that will make the index *after* we have populated
>  *  the main table (probably first delete all contents to be sure there's
>  *  nothing in it, then re-populate the fti-table)
>  *
>  *  can we do something with operator overloading or a seperate function
>  *  that can build the final query automatigally?
>  */
>
> #define MAX_FTI_QUERY_LENGTH 8192
>
> extern Datum fti(PG_FUNCTION_ARGS);
> static char *breakup(char *, char *);
> static bool is_stopword(char *);
> static bool isleadbyte(unsigned char ch);
> static bool new_tuple = false;
>
>
> #ifdef USE_STOP_WORDS
>
> /* THIS LIST MUST BE IN SORTED ORDER, A BINARY SEARCH IS USED!!!! */
> char    *StopWords[] = {  /* list of words to skip in indexing */
>  "no",
>  "the",
>  "yes"
> };
> #endif   /* USE_STOP_WORDS */
>
> /* stuff for caching query-plans, stolen from contrib/spi/\*.c */
> typedef struct
> {
>  char    *ident;
>  int   nplans;
>  void   **splan;
> } EPlan;
>
> static EPlan *InsertPlans = NULL;
> static EPlan *DeletePlans = NULL;
> static int nInsertPlans = 0;
> static int nDeletePlans = 0;
>
> static EPlan *find_plan(char *ident, EPlan ** eplan, int *nplans);
>
> /***********************************************************************/
> PG_FUNCTION_INFO_V1(fti);
>
> Datum
> fti(PG_FUNCTION_ARGS)
> {
>  TriggerData *trigdata;
>  Trigger    *trigger;  /* to get trigger name */
>  int   nargs;   /* # of arguments */
>  char   **args;   /* arguments */
>  char    *relname;  /* triggered relation name */
>  Relation rel;   /* triggered relation */
>  char    *indexname;  /* name of table for substrings */
>  HeapTuple rettuple = NULL;
>  TupleDesc tupdesc;  /* tuple description */
>  bool  isinsert = false;
>  bool  isdelete = false;
>  int   ret;
>  char  query[MAX_FTI_QUERY_LENGTH];
>  Oid   oid;
>
>  /*
>   * FILE   *debug;
>   */
>
>  /*
>   * debug = fopen("/dev/xconsole", "w"); fprintf(debug, "FTI: entered
>   * function\n"); fflush(debug);
>   */
>
>  if (!CALLED_AS_TRIGGER(fcinfo))
>   elog(ERROR, "Full Text Indexing: Not fired by trigger manager");
>
>  /* It's safe to cast now that we've checked */
>  trigdata = (TriggerData *) fcinfo->context;
>
>  if (TRIGGER_FIRED_FOR_STATEMENT(trigdata->tg_event))
>   elog(ERROR, "Full Text Indexing: Can't process STATEMENT events");
>  if (TRIGGER_FIRED_BEFORE(trigdata->tg_event))
>   elog(ERROR, "Full Text Indexing: Must be fired AFTER event");
>
>  if (TRIGGER_FIRED_BY_INSERT(trigdata->tg_event))
>   isinsert = true;
>  if (TRIGGER_FIRED_BY_UPDATE(trigdata->tg_event))
>  {
>   isdelete = true;
>   isinsert = true;
>  }
>  if (TRIGGER_FIRED_BY_DELETE(trigdata->tg_event))
>   isdelete = true;
>
>  trigger = trigdata->tg_trigger;
>  rel = trigdata->tg_relation;
>  relname = SPI_getrelname(rel);
>  rettuple = trigdata->tg_trigtuple;
>  if (isdelete && isinsert) /* is an UPDATE */
>   rettuple = trigdata->tg_newtuple;
>
>        if ((ret = SPI_connect()) < 0)
>   elog(ERROR, "Full Text Indexing: SPI_connect: Failed, returned %d\n",
> ret);
>
>  nargs = trigger->tgnargs;
>  if (nargs < 2)
>   elog(ERROR, "Full Text Indexing: Trigger must have at least 2
> arguments\n");
>
>  args = trigger->tgargs;
>  indexname = args[0];
>  tupdesc = rel->rd_att;  /* what the tuple looks like (?) */
>
>  /* get oid of current tuple, needed by all, so place here */
>  oid = rettuple->t_data->t_oid;
>  if (!OidIsValid(oid))
>   elog(ERROR, "Full Text Indexing: Oid of current tuple is invalid");
>
>  if (isdelete)
>  {
>   void    *pplan;
>   Oid     *argtypes;
>   Datum  values[1];
>   EPlan    *plan;
>   int   i;
>
>   snprintf(query, MAX_FTI_QUERY_LENGTH, "D%s", indexname);
>   for (i = 1; i < nargs; i++)
>    snprintf(query, MAX_FTI_QUERY_LENGTH, "%s$%s", query, args[i]);
>
>   plan = find_plan(query, &DeletePlans, &nDeletePlans);
>   if (plan->nplans <= 0)
>   {
>    argtypes = (Oid *) palloc(sizeof(Oid));
>
>    argtypes[0] = OIDOID;
>
>    snprintf(query, MAX_FTI_QUERY_LENGTH, "DELETE FROM %s WHERE id = $1",
> indexname);
>    pplan = SPI_prepare(query, 1, argtypes);
>    if (!pplan)
>     elog(ERROR, "Full Text Indexing: SPI_prepare: Returned NULL
> in delete");
>    pplan = SPI_saveplan(pplan);
>    if (pplan == NULL)
>     elog(ERROR, "Full Text Indexing: SPI_saveplan: Returned NULL in
> delete");
>
>    plan->splan = (void **) malloc(sizeof(void *));
>    *(plan->splan) = pplan;
>    plan->nplans = 1;
>   }
>
>   values[0] = oid;
>
>   ret = SPI_execp(*(plan->splan), values, NULL, 0);
>   if (ret != SPI_OK_DELETE)
>    elog(ERROR, "Full Text Indexing: SPI_execp: Error executing plan in
> delete");
>  }
>
>  if (isinsert)
>  {
>   char    *substring;
>   char    *column;
>   void    *pplan;
>   Oid     *argtypes;
>   Datum  values[2];
>   int   colnum;
>   struct varlena *data;
>   EPlan    *plan;
>   int   i;
>   char    *buff;
>   char    *string;
>
>   snprintf(query, MAX_FTI_QUERY_LENGTH, "I%s", indexname);
>   for (i = 1; i < nargs; i++)
>    snprintf(query, MAX_FTI_QUERY_LENGTH, "%s$%s", query, args[i]);
>
>   plan = find_plan(query, &InsertPlans, &nInsertPlans);
>
>   /* no plan yet, so allocate mem for argtypes */
>   if (plan->nplans <= 0)
>   {
>    argtypes = (Oid *) palloc(2 * sizeof(Oid));
>
>    argtypes[0] = VARCHAROID; /* create table t_name (string
>            * varchar, */
>    argtypes[1] = OIDOID;  /* id   oid);    */
>
>    /* prepare plan to gain speed */
>    snprintf(query, MAX_FTI_QUERY_LENGTH, "INSERT INTO %s (string,
> id) VALUES
> ($1, $2)",
>       indexname);
>    pplan = SPI_prepare(query, 2, argtypes);
>    if (!pplan)
>     elog(ERROR, "Full Text Indexing: SPI_prepare: Returned NULL
> in insert");
>
>    pplan = SPI_saveplan(pplan);
>    if (pplan == NULL)
>     elog(ERROR, "Full Text Indexing: SPI_saveplan: Returned NULL in
> insert");
>
>    plan->splan = (void **) malloc(sizeof(void *));
>    *(plan->splan) = pplan;
>    plan->nplans = 1;
>   }
>
>   /* prepare plan for query */
>   for (i = 0; i < nargs - 1; i++)
>   {
>    colnum = SPI_fnumber(tupdesc, args[i + 1]);
>    if (colnum == SPI_ERROR_NOATTRIBUTE)
>     elog(ERROR, "Full Text Indexing: SPI_fnumber: Column '%s' of '%s' not
> found", args[i + 1], indexname);
>
>    /* Get the char* representation of the column */
>    column = SPI_getvalue(rettuple, tupdesc, colnum);
>
>    /* make sure we don't try to index NULL's */
>    if (column)
>    {
>     string = column;
>     while (*string != '\0')
>     {
>      *string = tolower((unsigned char) *string);
>      string++;
>     }
>
>     data = (struct varlena *) palloc(sizeof(int32) + strlen(column) +1);
>     buff = palloc(strlen(column) + 1);
>     /* saves lots of calls in while-loop and in breakup() */
>
>     new_tuple = true;
>
>     while ((substring = breakup(column, buff)))
>     {
>      int   l;
>
>      l = strlen(substring);
>
>      data->vl_len = l + sizeof(int32);
>      memcpy(VARDATA(data), substring, l);
>      values[0] = PointerGetDatum(data);
>      values[1] = oid;
>
>      ret = SPI_execp(*(plan->splan), values, NULL, 0);
>      if (ret != SPI_OK_INSERT)
>       elog(ERROR, "Full Text Indexing: SPI_execp: Error executing plan in
> insert");
>     }
>     pfree(buff);
>     pfree(data);
>    }
>   }
>  }
>
>  SPI_finish();
>  return PointerGetDatum(rettuple);
> }
>
> static char *
> breakup(char *string, char *substring)
> {
>  static char *last_start;
>  static char *cur_pos;
>  static char *string_end;
>
>  if (new_tuple)
>  {
>   string_end = &string[strlen(string)-1];
>   cur_pos = last_start = &string[0];
>   new_tuple = false;/* don't initialize this next time */
>  }
>
>  while (cur_pos <= string_end)/* don't read after end of 'string' */
>  {
>   if (isleadbyte((unsigned)*cur_pos )) {
>    /* Bingo, got a Big-5 word (2 bytes) */
>    cur_pos += 2;
>    memcpy(substring, last_start, cur_pos - last_start);
>    substring[cur_pos - last_start] = '\0';
>    if (!is_stopword(substring))
>     return substring;
>   } else if (isalnum((unsigned char) *cur_pos)) {
>    /* Houston, we have a substring! :) */
>    cur_pos++;
>    memcpy(substring, last_start, cur_pos - last_start);
>    substring[cur_pos - last_start] = '\0';
>    if (!is_stopword(substring))
>     return substring;
>   } else {
>    last_start = cur_pos + 1;
>    cur_pos = last_start;
>   }
>  }
>
>  return NULL;/* we've processed all of 'string' */
> }
>
> bool isleadbyte(unsigned char ch)
> {
>  if ((ch >= 0xA1) && (ch <= 0xFE))
>   return true;
>
>  if ((ch >= 0x8E) && (ch <= 0xA0))
>   return true;
>
>  if ((ch >= 0x81) && (ch <= 0x8D))
>   return true;
>
> return false;
> }
> /* copied from src/backend/parser/keywords.c and adjusted for our
> situation*/
> static bool
> is_stopword(char *text)
> {
> #ifdef USE_STOP_WORDS
>  char   **StopLow;  /* for list of stop-words */
>  char   **StopHigh;
>  char   **StopMiddle;
>  int   difference;
>
>  StopLow = &StopWords[0]; /* initialize stuff for binary search */
>  StopHigh = endof(StopWords);
>
>  /* Loop invariant: *StopLow <= text < *StopHigh */
>
>  while (StopLow < StopHigh)
>  {
>   StopMiddle = StopLow + (StopHigh - StopLow) / 2;
>   difference = strcmp(*StopMiddle, text);
>   if (difference == 0)
>    return (true);
>   else if (difference < 0)
>    StopLow = StopMiddle + 1;
>   else
>    StopHigh = StopMiddle;
>  }
> #endif   /* USE_STOP_WORDS */
>
>  return (false);
> }
>
> /* for caching of query plans, stolen from contrib/spi/\*.c */
> static EPlan *
> find_plan(char *ident, EPlan ** eplan, int *nplans)
> {
>  EPlan    *newp;
>  int   i;
>
>  if (*nplans > 0)
>  {
>   for (i = 0; i < *nplans; i++)
>   {
>    if (strcmp((*eplan)[i].ident, ident) == 0)
>     break;
>   }
>   if (i != *nplans)
>    return (*eplan + i);
>   *eplan = (EPlan *) realloc(*eplan, (i + 1) * sizeof(EPlan));
>   newp = *eplan + i;
>  }
>  else
>  {
>   newp = *eplan = (EPlan *) malloc(sizeof(EPlan));
>   (*nplans) = i = 0;
>  }
>
>  newp->ident = (char *) malloc(strlen(ident) + 1);
>  strcpy(newp->ident, ident);
>  newp->nplans = 0;
>  newp->splan = NULL;
>  (*nplans)++;
>
>  return (newp);
> }
>
>
>
>
>
>
> ---------------------------(end of broadcast)---------------------------
> TIP 4: Don't 'kill -9' the postmaster
>