Thread: New Full Text Index using contrib/fulltextindex, which is now able to process Traditional Chinese characters (Big5 encoding)
New Full Text Index using contrib/fulltextindex, which is now able to process Traditional Chinese characters (Big5 encoding)
From
"eggli"
Date:
Hi, all, I found that contrib/fulltextindex is unable to process multibyte characters, so I tried to make it suit for my mother language as Chinese, I believe it's able to process Unicode by wcrok(), but I'm lazy to complete it.;) USAGE: Just replace the fti.c from this mail on contrib/fulltextindex and re-make it. And the indexing/query way is most the same. CODE: /* The difference: breakup() and issleadbyte() #include "postgres.h" #include <ctype.h> #include "executor/spi.h" #include "commands/trigger.h" /* * Trigger function accepts variable number of arguments: * * 1. relation in which to store the substrings * 2. fields to extract substrings from * * The relation in which to insert *must* have the following layout: * * string varchar(#) * id oid * * where # is the largest size of the varchar columns being indexed * * Example: * * -- Create the SQL function based on the compiled shared object * create function fti() returns opaque as * '/usr/local/pgsql/lib/contrib/fti.so' language 'C'; * * -- Create the FTI table * create table product_fti (string varchar(255), id oid); * * -- Create an index to assist string matches * create index product_fti_string_idx on product_fti (string); * * -- Create an index to assist trigger'd deletes * create index product_fti_id_idx on product_fti (id); * * -- Create an index on the product oid column to assist joins * -- between the fti table and the product table * create index product_oid_idx on product (oid); * * -- Create the trigger to perform incremental changes to the full text index. 
* create trigger product_fti_trig after update or insert or delete on product * for each row execute procedure fti(product_fti, title, artist); * ^^^^^^^^^^^ * table where full text index is stored * ^^^^^^^^^^^^^ * columns to index in the base table * * After populating 'product', try something like: * * SELECT DISTINCT(p.*) FROM product p, product_fti f1, product_fti f2 WHERE * f1.string ~ '^slippery' AND f2.string ~ '^wet' AND p.oid=f1.id AND p.oid=f2.id; * * To check that your indicies are being used correctly, make sure you * EXPLAIN SELECT ... your test query above. * * CHANGELOG * --------- * * august 3 2001 * Extended fti function to accept more than one column as a * parameter and all specified columns are indexed. Changed * all uses of sprintf to snprintf. Made error messages more * consistent. * * march 4 1998 Changed breakup() to return less substrings. Only breakup * in word parts which are in turn shortened from the start * of the word (ie. word, ord, rd) * Did allocation of substring buffer outside of breakup() * * oct. 5 1997, fixed a bug in string breakup (where there are more nonalpha * characters between words then 1). * * oct 4-5 1997 implemented the thing, at least the basic functionallity * of it all.... * * TODO * ---- * * prevent generating duplicate words for an oid in the fti table * save a plan for deletes * create a function that will make the index *after* we have populated * the main table (probably first delete all contents to be sure there's * nothing in it, then re-populate the fti-table) * * can we do something with operator overloading or a seperate function * that can build the final query automatigally? */ #define MAX_FTI_QUERY_LENGTH 8192 extern Datum fti(PG_FUNCTION_ARGS); static char *breakup(char *, char *); static bool is_stopword(char *); static bool isleadbyte(unsigned char ch); static bool new_tuple = false; #ifdef USE_STOP_WORDS /* THIS LIST MUST BE IN SORTED ORDER, A BINARY SEARCH IS USED!!!! 
*/ char *StopWords[] = { /* list of words to skip in indexing */ "no", "the", "yes" }; #endif /* USE_STOP_WORDS */ /* stuff for caching query-plans, stolen from contrib/spi/\*.c */ typedef struct { char *ident; int nplans; void **splan; } EPlan; static EPlan *InsertPlans = NULL; static EPlan *DeletePlans = NULL; static int nInsertPlans = 0; static int nDeletePlans = 0; static EPlan *find_plan(char *ident, EPlan ** eplan, int *nplans); /***********************************************************************/ PG_FUNCTION_INFO_V1(fti); Datum fti(PG_FUNCTION_ARGS) { TriggerData *trigdata; Trigger *trigger; /* to get trigger name */ int nargs; /* # of arguments */ char **args; /* arguments */ char *relname; /* triggered relation name */ Relation rel; /* triggered relation */ char *indexname; /* name of table for substrings */ HeapTuple rettuple = NULL; TupleDesc tupdesc; /* tuple description */ bool isinsert = false; bool isdelete = false; int ret; char query[MAX_FTI_QUERY_LENGTH]; Oid oid; /* * FILE *debug; */ /* * debug = fopen("/dev/xconsole", "w"); fprintf(debug, "FTI: entered * function\n"); fflush(debug); */ if (!CALLED_AS_TRIGGER(fcinfo)) elog(ERROR, "Full Text Indexing: Not fired by trigger manager"); /* It's safe to cast now that we've checked */ trigdata = (TriggerData *) fcinfo->context; if (TRIGGER_FIRED_FOR_STATEMENT(trigdata->tg_event)) elog(ERROR, "Full Text Indexing: Can't process STATEMENT events"); if (TRIGGER_FIRED_BEFORE(trigdata->tg_event)) elog(ERROR, "Full Text Indexing: Must be fired AFTER event"); if (TRIGGER_FIRED_BY_INSERT(trigdata->tg_event)) isinsert = true; if (TRIGGER_FIRED_BY_UPDATE(trigdata->tg_event)) { isdelete = true; isinsert = true; } if (TRIGGER_FIRED_BY_DELETE(trigdata->tg_event)) isdelete = true; trigger = trigdata->tg_trigger; rel = trigdata->tg_relation; relname = SPI_getrelname(rel); rettuple = trigdata->tg_trigtuple; if (isdelete && isinsert) /* is an UPDATE */ rettuple = trigdata->tg_newtuple; if ((ret = SPI_connect()) < 0) 
elog(ERROR, "Full Text Indexing: SPI_connect: Failed, returned %d\n", ret); nargs = trigger->tgnargs; if (nargs < 2) elog(ERROR, "Full Text Indexing: Trigger must have at least 2 arguments\n"); args = trigger->tgargs; indexname = args[0]; tupdesc = rel->rd_att; /* what the tuple looks like (?) */ /* get oid of current tuple, needed by all, so place here */ oid = rettuple->t_data->t_oid; if (!OidIsValid(oid)) elog(ERROR, "Full Text Indexing: Oid of current tuple is invalid"); if (isdelete) { void *pplan; Oid *argtypes; Datum values[1]; EPlan *plan; int i; snprintf(query, MAX_FTI_QUERY_LENGTH, "D%s", indexname); for (i = 1; i < nargs; i++) snprintf(query, MAX_FTI_QUERY_LENGTH, "%s$%s", query, args[i]); plan = find_plan(query, &DeletePlans, &nDeletePlans); if (plan->nplans <= 0) { argtypes = (Oid *) palloc(sizeof(Oid)); argtypes[0] = OIDOID; snprintf(query, MAX_FTI_QUERY_LENGTH, "DELETE FROM %s WHERE id = $1", indexname); pplan = SPI_prepare(query, 1, argtypes); if (!pplan) elog(ERROR, "Full Text Indexing: SPI_prepare: Returned NULL in delete"); pplan = SPI_saveplan(pplan); if (pplan == NULL) elog(ERROR, "Full Text Indexing: SPI_saveplan: Returned NULL in delete"); plan->splan = (void **) malloc(sizeof(void *)); *(plan->splan) = pplan; plan->nplans = 1; } values[0] = oid; ret = SPI_execp(*(plan->splan), values, NULL, 0); if (ret != SPI_OK_DELETE) elog(ERROR, "Full Text Indexing: SPI_execp: Error executing plan in delete"); } if (isinsert) { char *substring; char *column; void *pplan; Oid *argtypes; Datum values[2]; int colnum; struct varlena *data; EPlan *plan; int i; char *buff; char *string; snprintf(query, MAX_FTI_QUERY_LENGTH, "I%s", indexname); for (i = 1; i < nargs; i++) snprintf(query, MAX_FTI_QUERY_LENGTH, "%s$%s", query, args[i]); plan = find_plan(query, &InsertPlans, &nInsertPlans); /* no plan yet, so allocate mem for argtypes */ if (plan->nplans <= 0) { argtypes = (Oid *) palloc(2 * sizeof(Oid)); argtypes[0] = VARCHAROID; /* create table t_name (string * 
varchar, */ argtypes[1] = OIDOID; /* id oid); */ /* prepare plan to gain speed */ snprintf(query, MAX_FTI_QUERY_LENGTH, "INSERT INTO %s (string, id) VALUES ($1, $2)", indexname); pplan = SPI_prepare(query, 2, argtypes); if (!pplan) elog(ERROR, "Full Text Indexing: SPI_prepare: Returned NULL in insert"); pplan = SPI_saveplan(pplan); if (pplan == NULL) elog(ERROR, "Full Text Indexing: SPI_saveplan: Returned NULL in insert"); plan->splan = (void **) malloc(sizeof(void *)); *(plan->splan) = pplan; plan->nplans = 1; } /* prepare plan for query */ for (i = 0; i < nargs - 1; i++) { colnum = SPI_fnumber(tupdesc, args[i + 1]); if (colnum == SPI_ERROR_NOATTRIBUTE) elog(ERROR, "Full Text Indexing: SPI_fnumber: Column '%s' of '%s' not found", args[i + 1], indexname); /* Get the char* representation of the column */ column = SPI_getvalue(rettuple, tupdesc, colnum); /* make sure we don't try to index NULL's */ if (column) { string = column; while (*string != '\0') { *string = tolower((unsigned char) *string); string++; } data = (struct varlena *) palloc(sizeof(int32) + strlen(column) +1); buff = palloc(strlen(column) + 1); /* saves lots of calls in while-loop and in breakup() */ new_tuple = true; while ((substring = breakup(column, buff))) { int l; l = strlen(substring); data->vl_len = l + sizeof(int32); memcpy(VARDATA(data), substring, l); values[0] = PointerGetDatum(data); values[1] = oid; ret = SPI_execp(*(plan->splan), values, NULL, 0); if (ret != SPI_OK_INSERT) elog(ERROR, "Full Text Indexing: SPI_execp: Error executing plan in insert"); } pfree(buff); pfree(data); } } } SPI_finish(); return PointerGetDatum(rettuple); } static char * breakup(char *string, char *substring) { static char *last_start; static char *cur_pos; static char *string_end; if (new_tuple) { string_end = &string[strlen(string)-1]; cur_pos = last_start = &string[0]; new_tuple = false;/* don't initialize this next time */ } while (cur_pos <= string_end)/* don't read after end of 'string' */ { if 
(isleadbyte((unsigned)*cur_pos )) { /* Bingo, got a Big-5 word (2 bytes) */ cur_pos += 2; memcpy(substring, last_start, cur_pos - last_start); substring[cur_pos - last_start] = '\0'; if (!is_stopword(substring)) return substring; } else if (isalnum((unsigned char) *cur_pos)) { /* Houston, we have a substring! :) */ cur_pos++; memcpy(substring, last_start, cur_pos - last_start); substring[cur_pos - last_start] = '\0'; if (!is_stopword(substring)) return substring; } else { last_start = cur_pos + 1; cur_pos = last_start; } } return NULL;/* we've processed all of 'string' */ } bool isleadbyte(unsigned char ch) { if ((ch >= 0xA1) && (ch <= 0xFE)) return true; if ((ch >= 0x8E) && (ch <= 0xA0)) return true; if ((ch >= 0x81) && (ch <= 0x8D)) return true; return false; } /* copied from src/backend/parser/keywords.c and adjusted for our situation*/ static bool is_stopword(char *text) { #ifdef USE_STOP_WORDS char **StopLow; /* for list of stop-words */ char **StopHigh; char **StopMiddle; int difference; StopLow = &StopWords[0]; /* initialize stuff for binary search */ StopHigh = endof(StopWords); /* Loop invariant: *StopLow <= text < *StopHigh */ while (StopLow < StopHigh) { StopMiddle = StopLow + (StopHigh - StopLow) / 2; difference = strcmp(*StopMiddle, text); if (difference == 0) return (true); else if (difference < 0) StopLow = StopMiddle + 1; else StopHigh = StopMiddle; } #endif /* USE_STOP_WORDS */ return (false); } /* for caching of query plans, stolen from contrib/spi/\*.c */ static EPlan * find_plan(char *ident, EPlan ** eplan, int *nplans) { EPlan *newp; int i; if (*nplans > 0) { for (i = 0; i < *nplans; i++) { if (strcmp((*eplan)[i].ident, ident) == 0) break; } if (i != *nplans) return (*eplan + i); *eplan = (EPlan *) realloc(*eplan, (i + 1) * sizeof(EPlan)); newp = *eplan + i; } else { newp = *eplan = (EPlan *) malloc(sizeof(EPlan)); (*nplans) = i = 0; } newp->ident = (char *) malloc(strlen(ident) + 1); strcpy(newp->ident, ident); newp->nplans = 0; newp->splan = 
NULL; (*nplans)++; return (newp); }
Re: New Full Text Index using contrib/fulltextindex, which is now able to process Traditional Chinese characters (Big5 encoding)
From
"Christopher Kings-Lynne"
Date:
Hi eggli, I'm currently working on another patch that has been submitted for fulltextindex. I will try to merge both yours and Florian's changes for the fulltextindex module and get it into CVS for 7.3. Cheers, Chris > -----Original Message----- > From: pgsql-patches-owner@postgresql.org > [mailto:pgsql-patches-owner@postgresql.org]On Behalf Of eggli > Sent: Tuesday, 9 July 2002 6:58 PM > To: pgsql-patches@postgresql.org > Subject: [PATCHES] New Full Text Index using contrib/fulltextindex which > now able to processing Traditional Chinese characters(Big5 encoding) > > > Hi, all, I found that contrib/fulltextindex is unable to process multibyte > characters, so I tried to make it suit for my mother language as > Chinese, I > believe it's able to process Unicode by wcrok(), but I'm lazy to complete > it.;) > > USAGE: > > Just replace the fti.c from this mail on contrib/fulltextindex and re-make > it. > > And the indexing/query way is most the same. > > CODE: > /* The difference: breakup() and issleadbyte() > > #include "postgres.h" > > #include <ctype.h> > > #include "executor/spi.h" > #include "commands/trigger.h" > > /* > * Trigger function accepts variable number of arguments: > * > * 1. relation in which to store the substrings > * 2. 
fields to extract substrings from > * > * The relation in which to insert *must* have the following layout: > * > * string varchar(#) > * id oid > * > * where # is the largest size of the varchar columns being indexed > * > * Example: > * > * -- Create the SQL function based on the compiled shared object > * create function fti() returns opaque as > * '/usr/local/pgsql/lib/contrib/fti.so' language 'C'; > * > * -- Create the FTI table > * create table product_fti (string varchar(255), id oid); > * > * -- Create an index to assist string matches > * create index product_fti_string_idx on product_fti (string); > * > * -- Create an index to assist trigger'd deletes > * create index product_fti_id_idx on product_fti (id); > * > * -- Create an index on the product oid column to assist joins > * -- between the fti table and the product table > * create index product_oid_idx on product (oid); > * > * -- Create the trigger to perform incremental changes to the full text > index. > * create trigger product_fti_trig after update or insert or delete on > product > * for each row execute procedure fti(product_fti, title, artist); > * ^^^^^^^^^^^ > * table where full text index is stored > * ^^^^^^^^^^^^^ > * columns to index in the base table > * > * After populating 'product', try something like: > * > * SELECT DISTINCT(p.*) FROM product p, product_fti f1, > product_fti f2 WHERE > * f1.string ~ '^slippery' AND f2.string ~ '^wet' AND p.oid=f1.id AND > p.oid=f2.id; > * > * To check that your indicies are being used correctly, make sure you > * EXPLAIN SELECT ... your test query above. > * > * CHANGELOG > * --------- > * > * august 3 2001 > * Extended fti function to accept more than one column as a > * parameter and all specified columns are indexed. Changed > * all uses of sprintf to snprintf. Made error messages more > * consistent. > * > * march 4 1998 Changed breakup() to return less substrings. 
Only breakup > * in word parts which are in turn shortened from the start > * of the word (ie. word, ord, rd) > * Did allocation of substring buffer outside of breakup() > * > * oct. 5 1997, fixed a bug in string breakup (where there are > more nonalpha > * characters between words then 1). > * > * oct 4-5 1997 implemented the thing, at least the basic functionallity > * of it all.... > * > * TODO > * ---- > * > * prevent generating duplicate words for an oid in the fti table > * save a plan for deletes > * create a function that will make the index *after* we have populated > * the main table (probably first delete all contents to be sure there's > * nothing in it, then re-populate the fti-table) > * > * can we do something with operator overloading or a seperate function > * that can build the final query automatigally? > */ > > #define MAX_FTI_QUERY_LENGTH 8192 > > extern Datum fti(PG_FUNCTION_ARGS); > static char *breakup(char *, char *); > static bool is_stopword(char *); > static bool isleadbyte(unsigned char ch); > static bool new_tuple = false; > > > #ifdef USE_STOP_WORDS > > /* THIS LIST MUST BE IN SORTED ORDER, A BINARY SEARCH IS USED!!!! 
*/ > char *StopWords[] = { /* list of words to skip in indexing */ > "no", > "the", > "yes" > }; > #endif /* USE_STOP_WORDS */ > > /* stuff for caching query-plans, stolen from contrib/spi/\*.c */ > typedef struct > { > char *ident; > int nplans; > void **splan; > } EPlan; > > static EPlan *InsertPlans = NULL; > static EPlan *DeletePlans = NULL; > static int nInsertPlans = 0; > static int nDeletePlans = 0; > > static EPlan *find_plan(char *ident, EPlan ** eplan, int *nplans); > > /***********************************************************************/ > PG_FUNCTION_INFO_V1(fti); > > Datum > fti(PG_FUNCTION_ARGS) > { > TriggerData *trigdata; > Trigger *trigger; /* to get trigger name */ > int nargs; /* # of arguments */ > char **args; /* arguments */ > char *relname; /* triggered relation name */ > Relation rel; /* triggered relation */ > char *indexname; /* name of table for substrings */ > HeapTuple rettuple = NULL; > TupleDesc tupdesc; /* tuple description */ > bool isinsert = false; > bool isdelete = false; > int ret; > char query[MAX_FTI_QUERY_LENGTH]; > Oid oid; > > /* > * FILE *debug; > */ > > /* > * debug = fopen("/dev/xconsole", "w"); fprintf(debug, "FTI: entered > * function\n"); fflush(debug); > */ > > if (!CALLED_AS_TRIGGER(fcinfo)) > elog(ERROR, "Full Text Indexing: Not fired by trigger manager"); > > /* It's safe to cast now that we've checked */ > trigdata = (TriggerData *) fcinfo->context; > > if (TRIGGER_FIRED_FOR_STATEMENT(trigdata->tg_event)) > elog(ERROR, "Full Text Indexing: Can't process STATEMENT events"); > if (TRIGGER_FIRED_BEFORE(trigdata->tg_event)) > elog(ERROR, "Full Text Indexing: Must be fired AFTER event"); > > if (TRIGGER_FIRED_BY_INSERT(trigdata->tg_event)) > isinsert = true; > if (TRIGGER_FIRED_BY_UPDATE(trigdata->tg_event)) > { > isdelete = true; > isinsert = true; > } > if (TRIGGER_FIRED_BY_DELETE(trigdata->tg_event)) > isdelete = true; > > trigger = trigdata->tg_trigger; > rel = trigdata->tg_relation; > relname = 
SPI_getrelname(rel); > rettuple = trigdata->tg_trigtuple; > if (isdelete && isinsert) /* is an UPDATE */ > rettuple = trigdata->tg_newtuple; > > if ((ret = SPI_connect()) < 0) > elog(ERROR, "Full Text Indexing: SPI_connect: Failed, returned %d\n", > ret); > > nargs = trigger->tgnargs; > if (nargs < 2) > elog(ERROR, "Full Text Indexing: Trigger must have at least 2 > arguments\n"); > > args = trigger->tgargs; > indexname = args[0]; > tupdesc = rel->rd_att; /* what the tuple looks like (?) */ > > /* get oid of current tuple, needed by all, so place here */ > oid = rettuple->t_data->t_oid; > if (!OidIsValid(oid)) > elog(ERROR, "Full Text Indexing: Oid of current tuple is invalid"); > > if (isdelete) > { > void *pplan; > Oid *argtypes; > Datum values[1]; > EPlan *plan; > int i; > > snprintf(query, MAX_FTI_QUERY_LENGTH, "D%s", indexname); > for (i = 1; i < nargs; i++) > snprintf(query, MAX_FTI_QUERY_LENGTH, "%s$%s", query, args[i]); > > plan = find_plan(query, &DeletePlans, &nDeletePlans); > if (plan->nplans <= 0) > { > argtypes = (Oid *) palloc(sizeof(Oid)); > > argtypes[0] = OIDOID; > > snprintf(query, MAX_FTI_QUERY_LENGTH, "DELETE FROM %s WHERE id = $1", > indexname); > pplan = SPI_prepare(query, 1, argtypes); > if (!pplan) > elog(ERROR, "Full Text Indexing: SPI_prepare: Returned NULL > in delete"); > pplan = SPI_saveplan(pplan); > if (pplan == NULL) > elog(ERROR, "Full Text Indexing: SPI_saveplan: Returned NULL in > delete"); > > plan->splan = (void **) malloc(sizeof(void *)); > *(plan->splan) = pplan; > plan->nplans = 1; > } > > values[0] = oid; > > ret = SPI_execp(*(plan->splan), values, NULL, 0); > if (ret != SPI_OK_DELETE) > elog(ERROR, "Full Text Indexing: SPI_execp: Error executing plan in > delete"); > } > > if (isinsert) > { > char *substring; > char *column; > void *pplan; > Oid *argtypes; > Datum values[2]; > int colnum; > struct varlena *data; > EPlan *plan; > int i; > char *buff; > char *string; > > snprintf(query, MAX_FTI_QUERY_LENGTH, "I%s", 
indexname); > for (i = 1; i < nargs; i++) > snprintf(query, MAX_FTI_QUERY_LENGTH, "%s$%s", query, args[i]); > > plan = find_plan(query, &InsertPlans, &nInsertPlans); > > /* no plan yet, so allocate mem for argtypes */ > if (plan->nplans <= 0) > { > argtypes = (Oid *) palloc(2 * sizeof(Oid)); > > argtypes[0] = VARCHAROID; /* create table t_name (string > * varchar, */ > argtypes[1] = OIDOID; /* id oid); */ > > /* prepare plan to gain speed */ > snprintf(query, MAX_FTI_QUERY_LENGTH, "INSERT INTO %s (string, > id) VALUES > ($1, $2)", > indexname); > pplan = SPI_prepare(query, 2, argtypes); > if (!pplan) > elog(ERROR, "Full Text Indexing: SPI_prepare: Returned NULL > in insert"); > > pplan = SPI_saveplan(pplan); > if (pplan == NULL) > elog(ERROR, "Full Text Indexing: SPI_saveplan: Returned NULL in > insert"); > > plan->splan = (void **) malloc(sizeof(void *)); > *(plan->splan) = pplan; > plan->nplans = 1; > } > > /* prepare plan for query */ > for (i = 0; i < nargs - 1; i++) > { > colnum = SPI_fnumber(tupdesc, args[i + 1]); > if (colnum == SPI_ERROR_NOATTRIBUTE) > elog(ERROR, "Full Text Indexing: SPI_fnumber: Column '%s' of '%s' not > found", args[i + 1], indexname); > > /* Get the char* representation of the column */ > column = SPI_getvalue(rettuple, tupdesc, colnum); > > /* make sure we don't try to index NULL's */ > if (column) > { > string = column; > while (*string != '\0') > { > *string = tolower((unsigned char) *string); > string++; > } > > data = (struct varlena *) palloc(sizeof(int32) + strlen(column) +1); > buff = palloc(strlen(column) + 1); > /* saves lots of calls in while-loop and in breakup() */ > > new_tuple = true; > > while ((substring = breakup(column, buff))) > { > int l; > > l = strlen(substring); > > data->vl_len = l + sizeof(int32); > memcpy(VARDATA(data), substring, l); > values[0] = PointerGetDatum(data); > values[1] = oid; > > ret = SPI_execp(*(plan->splan), values, NULL, 0); > if (ret != SPI_OK_INSERT) > elog(ERROR, "Full Text Indexing: 
SPI_execp: Error executing plan in > insert"); > } > pfree(buff); > pfree(data); > } > } > } > > SPI_finish(); > return PointerGetDatum(rettuple); > } > > static char * > breakup(char *string, char *substring) > { > static char *last_start; > static char *cur_pos; > static char *string_end; > > if (new_tuple) > { > string_end = &string[strlen(string)-1]; > cur_pos = last_start = &string[0]; > new_tuple = false;/* don't initialize this next time */ > } > > while (cur_pos <= string_end)/* don't read after end of 'string' */ > { > if (isleadbyte((unsigned)*cur_pos )) { > /* Bingo, got a Big-5 word (2 bytes) */ > cur_pos += 2; > memcpy(substring, last_start, cur_pos - last_start); > substring[cur_pos - last_start] = '\0'; > if (!is_stopword(substring)) > return substring; > } else if (isalnum((unsigned char) *cur_pos)) { > /* Houston, we have a substring! :) */ > cur_pos++; > memcpy(substring, last_start, cur_pos - last_start); > substring[cur_pos - last_start] = '\0'; > if (!is_stopword(substring)) > return substring; > } else { > last_start = cur_pos + 1; > cur_pos = last_start; > } > } > > return NULL;/* we've processed all of 'string' */ > } > > bool isleadbyte(unsigned char ch) > { > if ((ch >= 0xA1) && (ch <= 0xFE)) > return true; > > if ((ch >= 0x8E) && (ch <= 0xA0)) > return true; > > if ((ch >= 0x81) && (ch <= 0x8D)) > return true; > > return false; > } > /* copied from src/backend/parser/keywords.c and adjusted for our > situation*/ > static bool > is_stopword(char *text) > { > #ifdef USE_STOP_WORDS > char **StopLow; /* for list of stop-words */ > char **StopHigh; > char **StopMiddle; > int difference; > > StopLow = &StopWords[0]; /* initialize stuff for binary search */ > StopHigh = endof(StopWords); > > /* Loop invariant: *StopLow <= text < *StopHigh */ > > while (StopLow < StopHigh) > { > StopMiddle = StopLow + (StopHigh - StopLow) / 2; > difference = strcmp(*StopMiddle, text); > if (difference == 0) > return (true); > else if (difference < 0) > StopLow = 
StopMiddle + 1; > else > StopHigh = StopMiddle; > } > #endif /* USE_STOP_WORDS */ > > return (false); > } > > /* for caching of query plans, stolen from contrib/spi/\*.c */ > static EPlan * > find_plan(char *ident, EPlan ** eplan, int *nplans) > { > EPlan *newp; > int i; > > if (*nplans > 0) > { > for (i = 0; i < *nplans; i++) > { > if (strcmp((*eplan)[i].ident, ident) == 0) > break; > } > if (i != *nplans) > return (*eplan + i); > *eplan = (EPlan *) realloc(*eplan, (i + 1) * sizeof(EPlan)); > newp = *eplan + i; > } > else > { > newp = *eplan = (EPlan *) malloc(sizeof(EPlan)); > (*nplans) = i = 0; > } > > newp->ident = (char *) malloc(strlen(ident) + 1); > strcpy(newp->ident, ident); > newp->nplans = 0; > newp->splan = NULL; > (*nplans)++; > > return (newp); > } > > > > > > > ---------------------------(end of broadcast)--------------------------- > TIP 4: Don't 'kill -9' the postmaster >