And here's the attachment I forgot.
Heikki Linnakangas wrote:
> Heikki Linnakangas wrote:
>> Tom Lane wrote:
>>> Something that was annoying me yesterday was that it was not clear
>>> whether we had fixed every single place that uses a tsearch config file
>>> to assume that the file is in UTF8 and should be converted to database
>>> encoding. So I was thinking of hardwiring the "recode" part into
>>> readstopwords, and using wordop just for the "lowercase" part, which
>>> seemed to me like a saner division of labor. That is, UTF8 is a policy
>>> that we want to enforce globally, but lowercasing maybe not, and this
>>> still leaves the door open for more processing besides lowercasing.
>> I think we also want to always run input files through pg_verifymbstr.
>> We do it for stopwords, and synonym files (though incorrectly), but not
>> for thesaurus files or ispell files. It's probably best to do that
>> within the recode-function as well.
>
> Ok, here's an updated version of the patch.
>
> - ispell initialization crashed on empty dictionary file
> - ispell initialization crashed on affix file with prefixes but no suffixes
> - stop words file was run through pg_verifymbstr, with database
> encoding, but it's later interpreted as being UTF-8. Now verifies that
> it's UTF-8, regardless of database encoding.
>
>
> - introduces new t_readline function that reads a line from a file,
> verifies that it's valid UTF-8, and converts it to database encoding.
> Modified all places that read tsearch config files to use this function
> instead of fgets directly.
>
> - readstoplist now sorts the stop words after loading them. Removed the
> separate sortstoplist function.
>
> - moved the wordop-input parameter from StopList struct to a direct
> argument to readstoplist. Seems cleaner to me that way, the struct is
> now purely an output of readstoplist, not mixed input/output.
> readstoplist now recodes the input implicitly using t_readline.
>
> - bunch of comments added, typos fixed, and other cleanup
>
> PS. It's a bank holiday here in the UK on Monday, so I won't be around
> until Tuesday if something comes up.
>
--
Heikki Linnakangas
EnterpriseDB http://www.enterprisedb.com
Index: src/backend/snowball/dict_snowball.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/snowball/dict_snowball.c,v
retrieving revision 1.2
diff -c -r1.2 dict_snowball.c
*** src/backend/snowball/dict_snowball.c 22 Aug 2007 01:39:44 -0000 1.2
--- src/backend/snowball/dict_snowball.c 24 Aug 2007 09:37:50 -0000
***************
*** 192,198 ****
ListCell *l;
d = (DictSnowball *) palloc0(sizeof(DictSnowball));
- d->stoplist.wordop = recode_and_lowerstr;
foreach(l, dictoptions)
{
--- 192,197 ----
***************
*** 204,211 ****
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("multiple StopWords parameters")));
! readstoplist(defGetString(defel), &d->stoplist);
! sortstoplist(&d->stoplist);
stoploaded = true;
}
else if (pg_strcasecmp("Language", defel->defname) == 0)
--- 203,209 ----
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("multiple StopWords parameters")));
! readstoplist(defGetString(defel), &d->stoplist, lowerstr);
stoploaded = true;
}
else if (pg_strcasecmp("Language", defel->defname) == 0)
Index: src/backend/tsearch/dict_ispell.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/tsearch/dict_ispell.c,v
retrieving revision 1.2
diff -c -r1.2 dict_ispell.c
*** src/backend/tsearch/dict_ispell.c 22 Aug 2007 01:39:44 -0000 1.2
--- src/backend/tsearch/dict_ispell.c 23 Aug 2007 21:12:33 -0000
***************
*** 39,45 ****
ListCell *l;
d = (DictISpell *) palloc0(sizeof(DictISpell));
- d->stoplist.wordop = recode_and_lowerstr;
foreach(l, dictoptions)
{
--- 39,44 ----
***************
*** 73,80 ****
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("multiple StopWords parameters")));
! readstoplist(defGetString(defel), &(d->stoplist));
! sortstoplist(&(d->stoplist));
stoploaded = true;
}
else
--- 72,78 ----
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("multiple StopWords parameters")));
! readstoplist(defGetString(defel), &(d->stoplist), lowerstr);
stoploaded = true;
}
else
Index: src/backend/tsearch/dict_simple.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/tsearch/dict_simple.c,v
retrieving revision 1.2
diff -c -r1.2 dict_simple.c
*** src/backend/tsearch/dict_simple.c 22 Aug 2007 01:39:44 -0000 1.2
--- src/backend/tsearch/dict_simple.c 23 Aug 2007 21:12:24 -0000
***************
*** 23,41 ****
typedef struct
{
StopList stoplist;
! } DictExample;
Datum
dsimple_init(PG_FUNCTION_ARGS)
{
List *dictoptions = (List *) PG_GETARG_POINTER(0);
! DictExample *d = (DictExample *) palloc0(sizeof(DictExample));
bool stoploaded = false;
ListCell *l;
- d->stoplist.wordop = recode_and_lowerstr;
-
foreach(l, dictoptions)
{
DefElem *defel = (DefElem *) lfirst(l);
--- 23,39 ----
typedef struct
{
StopList stoplist;
! } DictSimple;
Datum
dsimple_init(PG_FUNCTION_ARGS)
{
List *dictoptions = (List *) PG_GETARG_POINTER(0);
! DictSimple *d = (DictSimple *) palloc0(sizeof(DictSimple));
bool stoploaded = false;
ListCell *l;
foreach(l, dictoptions)
{
DefElem *defel = (DefElem *) lfirst(l);
***************
*** 46,53 ****
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("multiple StopWords parameters")));
! readstoplist(defGetString(defel), &d->stoplist);
! sortstoplist(&d->stoplist);
stoploaded = true;
}
else
--- 44,50 ----
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("multiple StopWords parameters")));
! readstoplist(defGetString(defel), &d->stoplist, lowerstr);
stoploaded = true;
}
else
***************
*** 65,80 ****
Datum
dsimple_lexize(PG_FUNCTION_ARGS)
{
! DictExample *d = (DictExample *) PG_GETARG_POINTER(0);
char *in = (char *) PG_GETARG_POINTER(1);
int32 len = PG_GETARG_INT32(2);
! char *txt = lowerstr_with_len(in, len);
TSLexeme *res = palloc0(sizeof(TSLexeme) * 2);
if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
- {
pfree(txt);
- }
else
res[0].lexeme = txt;
--- 62,77 ----
Datum
dsimple_lexize(PG_FUNCTION_ARGS)
{
! DictSimple *d = (DictSimple *) PG_GETARG_POINTER(0);
char *in = (char *) PG_GETARG_POINTER(1);
int32 len = PG_GETARG_INT32(2);
! char *txt;
TSLexeme *res = palloc0(sizeof(TSLexeme) * 2);
+ txt = lowerstr_with_len(in, len);
+
if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
pfree(txt);
else
res[0].lexeme = txt;
Index: src/backend/tsearch/dict_synonym.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/tsearch/dict_synonym.c,v
retrieving revision 1.2
diff -c -r1.2 dict_synonym.c
*** src/backend/tsearch/dict_synonym.c 22 Aug 2007 04:13:15 -0000 1.2
--- src/backend/tsearch/dict_synonym.c 24 Aug 2007 10:00:05 -0000
***************
*** 20,28 ****
#include "tsearch/ts_utils.h"
#include "utils/builtins.h"
-
- #define SYNBUFLEN 4096
-
typedef struct
{
char *in;
--- 20,25 ----
***************
*** 31,53 ****
typedef struct
{
! int len;
Syn *syn;
} DictSyn;
static char *
findwrd(char *in, char **end)
{
char *start;
! *end = NULL;
while (*in && t_isspace(in))
in += pg_mblen(in);
if (*in == '\0')
return NULL;
start = in;
while (*in && !t_isspace(in))
in += pg_mblen(in);
--- 28,61 ----
typedef struct
{
! int len; /* length of syn array */
Syn *syn;
} DictSyn;
+ /*
+ * Finds the next whitespace-delimited word within the 'in' string.
+ * Returns a pointer to the first character of the word, and a pointer
+ * to the next byte after the last character in the word (in *end).
+ */
static char *
findwrd(char *in, char **end)
{
char *start;
! /* Skip leading spaces */
while (*in && t_isspace(in))
in += pg_mblen(in);
+ /* Return NULL on empty lines */
if (*in == '\0')
+ {
+ *end = NULL;
return NULL;
+ }
+
start = in;
+ /* Find end of word */
while (*in && !t_isspace(in))
in += pg_mblen(in);
***************
*** 70,81 ****
ListCell *l;
char *filename = NULL;
FILE *fin;
- char buf[SYNBUFLEN];
char *starti,
*starto,
*end = NULL;
int cur = 0;
! int slen;
foreach(l, dictoptions)
{
--- 78,88 ----
ListCell *l;
char *filename = NULL;
FILE *fin;
char *starti,
*starto,
*end = NULL;
int cur = 0;
! char *line = NULL;
foreach(l, dictoptions)
{
***************
*** 105,114 ****
d = (DictSyn *) palloc0(sizeof(DictSyn));
! while (fgets(buf, SYNBUFLEN, fin))
{
! slen = strlen(buf);
! pg_verifymbstr(buf, slen, false);
if (cur == d->len)
{
if (d->len == 0)
--- 112,144 ----
d = (DictSyn *) palloc0(sizeof(DictSyn));
! while ((line = t_readline(fin)) != NULL)
{
! starti = findwrd(line, &end);
! if (!starti)
! {
! /* Empty line */
! goto skipline;
! }
! *end = '\0';
! if (end >= line + strlen(line))
! {
! /* A line with only one word. Ignore silently. */
! goto skipline;
! }
!
! starto = findwrd(end + 1, &end);
! if (!starto)
! {
! /* A line with only one word. Ignore silently. */
! goto skipline;
! }
! *end = '\0';
!
! /* starti now points to the first word, and starto to the second
! * word on the line, with a \0 terminator at the end of both words.
! */
!
if (cur == d->len)
{
if (d->len == 0)
***************
*** 123,158 ****
}
}
! starti = findwrd(buf, &end);
! if (!starti)
! continue;
! *end = '\0';
! if (end >= buf + slen)
! continue;
!
! starto = findwrd(end + 1, &end);
! if (!starto)
! continue;
! *end = '\0';
!
! d->syn[cur].in = recode_and_lowerstr(starti);
! d->syn[cur].out = recode_and_lowerstr(starto);
! if (!(d->syn[cur].in && d->syn[cur].out))
! {
! FreeFile(fin);
! ereport(ERROR,
! (errcode(ERRCODE_OUT_OF_MEMORY),
! errmsg("out of memory")));
! }
cur++;
}
FreeFile(fin);
d->len = cur;
! if (cur > 1)
! qsort(d->syn, d->len, sizeof(Syn), compareSyn);
PG_RETURN_POINTER(d);
}
--- 153,171 ----
}
}
! d->syn[cur].in = lowerstr(starti);
! d->syn[cur].out = lowerstr(starto);
cur++;
+
+ skipline:
+ pfree(line);
}
FreeFile(fin);
d->len = cur;
! qsort(d->syn, d->len, sizeof(Syn), compareSyn);
PG_RETURN_POINTER(d);
}
***************
*** 179,186 ****
if (!found)
PG_RETURN_POINTER(NULL);
! res = palloc(sizeof(TSLexeme) * 2);
! memset(res, 0, sizeof(TSLexeme) * 2);
res[0].lexeme = pstrdup(found->out);
PG_RETURN_POINTER(res);
--- 192,198 ----
if (!found)
PG_RETURN_POINTER(NULL);
! res = palloc0(sizeof(TSLexeme) * 2);
res[0].lexeme = pstrdup(found->out);
PG_RETURN_POINTER(res);
Index: src/backend/tsearch/dict_thesaurus.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/tsearch/dict_thesaurus.c,v
retrieving revision 1.2
diff -c -r1.2 dict_thesaurus.c
*** src/backend/tsearch/dict_thesaurus.c 22 Aug 2007 01:39:44 -0000 1.2
--- src/backend/tsearch/dict_thesaurus.c 24 Aug 2007 10:02:16 -0000
***************
*** 170,179 ****
thesaurusRead(char *filename, DictThesaurus * d)
{
FILE *fh;
- char str[BUFSIZ];
int lineno = 0;
uint16 idsubst = 0;
bool useasis = false;
filename = get_tsearch_config_filename(filename, "ths");
fh = AllocateFile(filename, "r");
--- 170,179 ----
thesaurusRead(char *filename, DictThesaurus * d)
{
FILE *fh;
int lineno = 0;
uint16 idsubst = 0;
bool useasis = false;
+ char *line;
filename = get_tsearch_config_filename(filename, "ths");
fh = AllocateFile(filename, "r");
***************
*** 183,209 ****
errmsg("could not open thesaurus file \"%s\": %m",
filename)));
! while (fgets(str, sizeof(str), fh))
{
! char *ptr,
! *recoded;
int state = TR_WAITLEX;
char *beginwrd = NULL;
uint16 posinsubst = 0;
uint16 nwrd = 0;
- ptr = recoded = (char *) pg_do_encoding_conversion((unsigned char *) str, strlen(str),
- GetDatabaseEncoding(), PG_UTF8);
- if (recoded == NULL)
- elog(ERROR, "encoding conversion failed");
-
lineno++;
! /* is it comment ? */
! while (t_isspace(ptr))
ptr += pg_mblen(ptr);
! if (t_iseq(recoded, '#') || *recoded == '\0' || t_iseq(recoded, '\n') || t_iseq(recoded, '\r'))
continue;
while (*ptr)
{
--- 183,210 ----
errmsg("could not open thesaurus file \"%s\": %m",
filename)));
! while ((line = t_readline(fh)) != NULL)
{
! char *ptr;
int state = TR_WAITLEX;
char *beginwrd = NULL;
uint16 posinsubst = 0;
uint16 nwrd = 0;
lineno++;
! ptr = line;
!
! /* is it a comment? */
! while (*ptr && t_isspace(ptr))
ptr += pg_mblen(ptr);
!
! if (t_iseq(ptr, '#') || *ptr == '\0' ||
! t_iseq(ptr, '\n') || t_iseq(ptr, '\r'))
! {
! pfree(line);
continue;
+ }
while (*ptr)
{
***************
*** 301,308 ****
lineno, filename)));
}
! if (recoded != str)
! pfree(recoded);
}
d->nsubst = idsubst;
--- 302,308 ----
lineno, filename)));
}
! pfree(line);
}
d->nsubst = idsubst;
Index: src/backend/tsearch/spell.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/tsearch/spell.c,v
retrieving revision 1.1
diff -c -r1.1 spell.c
*** src/backend/tsearch/spell.c 21 Aug 2007 01:11:18 -0000 1.1
--- src/backend/tsearch/spell.c 24 Aug 2007 10:41:12 -0000
***************
*** 21,28 ****
/*
! * during initialization dictionary requires a lot
! * of memory, so it will use temporary context
*/
static MemoryContext tmpCtx = NULL;
--- 21,31 ----
/*
! * Initialization requires a lot of memory that's not needed
! * after the initialization is done. In init function,
! * CurrentMemoryContext is a long lived memory context associated
! * with the dictionary cache entry, so we use a temporary context
! * for the short-lived stuff.
*/
static MemoryContext tmpCtx = NULL;
***************
*** 32,37 ****
--- 35,43 ----
static void
checkTmpCtx(void)
{
+ /* XXX: This assumes that CurrentMemoryContext doesn't have
+ * any children other than the one we create here.
+ */
if (CurrentMemoryContext->firstchild == NULL)
{
tmpCtx = AllocSetContextCreate(CurrentMemoryContext,
***************
*** 74,90 ****
static int
cmpspellaffix(const void *s1, const void *s2)
{
! return (strcmp((*(const SPELL **) s1)->p.flag, (*(const SPELL **) s2)->p.flag));
! }
!
! static char *
! strnduplicate(char *s, int len)
! {
! char *d = (char *) palloc(len + 1);
!
! memcpy(d, s, len);
! d[len] = '\0';
! return d;
}
static char *
--- 80,86 ----
static int
cmpspellaffix(const void *s1, const void *s2)
{
! return (strncmp((*(const SPELL **) s1)->p.flag, (*(const SPELL **) s2)->p.flag, MAXFLAGLEN));
}
static char *
***************
*** 185,191 ****
}
Conf->Spell[Conf->nspell] = (SPELL *) tmpalloc(SPELLHDRSZ + strlen(word) + 1);
strcpy(Conf->Spell[Conf->nspell]->word, word);
! strncpy(Conf->Spell[Conf->nspell]->p.flag, flag, 16);
Conf->nspell++;
}
--- 181,187 ----
}
Conf->Spell[Conf->nspell] = (SPELL *) tmpalloc(SPELLHDRSZ + strlen(word) + 1);
strcpy(Conf->Spell[Conf->nspell]->word, word);
! strncpy(Conf->Spell[Conf->nspell]->p.flag, flag, MAXFLAGLEN);
Conf->nspell++;
}
***************
*** 197,205 ****
void
NIImportDictionary(IspellDict * Conf, const char *filename)
{
- char str[BUFSIZ],
- *pstr;
FILE *dict;
checkTmpCtx();
--- 193,200 ----
void
NIImportDictionary(IspellDict * Conf, const char *filename)
{
FILE *dict;
+ char *line;
checkTmpCtx();
***************
*** 209,227 ****
errmsg("could not open dictionary file \"%s\": %m",
filename)));
! while (fgets(str, sizeof(str), dict))
{
! char *s,
! *recoded;
const char *flag;
! recoded = (char *) pg_do_encoding_conversion((unsigned char *) str, strlen(str),
! PG_UTF8, GetDatabaseEncoding());
! if (recoded == NULL)
! elog(ERROR, "encoding conversion failed");
!
flag = NULL;
! if ((s = findchar(recoded, '/')))
{
*s++ = '\0';
flag = s;
--- 204,217 ----
errmsg("could not open dictionary file \"%s\": %m",
filename)));
! while ((line = t_readline(dict)) != NULL)
{
! char *s, *pstr;
const char *flag;
! /* Extract flag from the line */
flag = NULL;
! if ((s = findchar(line, '/')))
{
*s++ = '\0';
flag = s;
***************
*** 240,247 ****
else
flag = "";
!
! s = recoded;
while (*s)
{
if (t_isspace(s))
--- 230,237 ----
else
flag = "";
! /* Remove trailing spaces */
! s = line;
while (*s)
{
if (t_isspace(s))
***************
*** 251,263 ****
}
s += pg_mblen(s);
}
! pstr = lowerstr_ctx(recoded);
NIAddSpell(Conf, pstr, flag);
pfree(pstr);
! if (recoded != str)
! pfree(recoded);
}
FreeFile(dict);
}
--- 241,252 ----
}
s += pg_mblen(s);
}
! pstr = lowerstr_ctx(line);
NIAddSpell(Conf, pstr, flag);
pfree(pstr);
! pfree(line);
}
FreeFile(dict);
}
***************
*** 402,408 ****
static bool
parse_affentry(char *str, char *mask, char *find, char *repl,
! const char *filename, int line)
{
int state = PAE_WAIT_MASK;
char *pmask = mask,
--- 391,397 ----
static bool
parse_affentry(char *str, char *mask, char *find, char *repl,
! const char *filename, int lineno)
{
int state = PAE_WAIT_MASK;
char *pmask = mask,
***************
*** 453,459 ****
ereport(ERROR,
(errcode(ERRCODE_CONFIG_FILE_ERROR),
errmsg("syntax error at line %d of affix file \"%s\"",
! line, filename)));
}
else if (state == PAE_INFIND)
{
--- 442,448 ----
ereport(ERROR,
(errcode(ERRCODE_CONFIG_FILE_ERROR),
errmsg("syntax error at line %d of affix file \"%s\"",
! lineno, filename)));
}
else if (state == PAE_INFIND)
{
***************
*** 471,477 ****
ereport(ERROR,
(errcode(ERRCODE_CONFIG_FILE_ERROR),
errmsg("syntax error at line %d of affix file \"%s\"",
! line, filename)));
}
else if (state == PAE_WAIT_REPL)
{
--- 460,466 ----
ereport(ERROR,
(errcode(ERRCODE_CONFIG_FILE_ERROR),
errmsg("syntax error at line %d of affix file \"%s\"",
! lineno, filename)));
}
else if (state == PAE_WAIT_REPL)
{
***************
*** 489,495 ****
ereport(ERROR,
(errcode(ERRCODE_CONFIG_FILE_ERROR),
errmsg("syntax error at line %d of affix file \"%s\"",
! line, filename)));
}
else if (state == PAE_INREPL)
{
--- 478,484 ----
ereport(ERROR,
(errcode(ERRCODE_CONFIG_FILE_ERROR),
errmsg("syntax error at line %d of affix file \"%s\"",
! lineno, filename)));
}
else if (state == PAE_INREPL)
{
***************
*** 507,513 ****
ereport(ERROR,
(errcode(ERRCODE_CONFIG_FILE_ERROR),
errmsg("syntax error at line %d of affix file \"%s\"",
! line, filename)));
}
else
elog(ERROR, "unknown state in parse_affentry: %d", state);
--- 496,502 ----
ereport(ERROR,
(errcode(ERRCODE_CONFIG_FILE_ERROR),
errmsg("syntax error at line %d of affix file \"%s\"",
! lineno, filename)));
}
else
elog(ERROR, "unknown state in parse_affentry: %d", state);
***************
*** 522,528 ****
static void
addFlagValue(IspellDict * Conf, char *s, uint32 val,
! const char *filename, int line)
{
while (*s && t_isspace(s))
s++;
--- 511,517 ----
static void
addFlagValue(IspellDict * Conf, char *s, uint32 val,
! const char *filename, int lineno)
{
while (*s && t_isspace(s))
s++;
***************
*** 531,543 ****
ereport(ERROR,
(errcode(ERRCODE_CONFIG_FILE_ERROR),
errmsg("syntax error at line %d of affix file \"%s\"",
! line, filename)));
if (pg_mblen(s) != 1)
ereport(ERROR,
(errcode(ERRCODE_CONFIG_FILE_ERROR),
errmsg("multibyte flag character is not allowed at line %d of affix file \"%s\"",
! line, filename)));
Conf->flagval[(unsigned int) *s] = (unsigned char) val;
Conf->usecompound = true;
--- 520,532 ----
ereport(ERROR,
(errcode(ERRCODE_CONFIG_FILE_ERROR),
errmsg("syntax error at line %d of affix file \"%s\"",
! lineno, filename)));
if (pg_mblen(s) != 1)
ereport(ERROR,
(errcode(ERRCODE_CONFIG_FILE_ERROR),
errmsg("multibyte flag character is not allowed at line %d of affix file \"%s\"",
! lineno, filename)));
Conf->flagval[(unsigned int) *s] = (unsigned char) val;
Conf->usecompound = true;
***************
*** 546,552 ****
static void
NIImportOOAffixes(IspellDict * Conf, const char *filename)
{
- char str[BUFSIZ];
char type[BUFSIZ],
*ptype = NULL;
char sflag[BUFSIZ];
--- 535,540 ----
***************
*** 560,568 ****
int flag = 0;
char flagflags = 0;
FILE *affix;
! int line = 0;
int scanread = 0;
char scanbuf[BUFSIZ];
checkTmpCtx();
--- 548,557 ----
int flag = 0;
char flagflags = 0;
FILE *affix;
! int lineno = 0;
int scanread = 0;
char scanbuf[BUFSIZ];
+ char *recoded;
checkTmpCtx();
***************
*** 576,620 ****
errmsg("could not open affix file \"%s\": %m",
filename)));
! while (fgets(str, sizeof(str), affix))
{
! char *recoded;
!
! recoded = (char *) pg_do_encoding_conversion((unsigned char *) str, strlen(str),
! PG_UTF8, GetDatabaseEncoding());
! if (recoded == NULL)
! elog(ERROR, "encoding conversion failed");
!
! line++;
if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#'))
continue;
if (STRNCMP(recoded, "COMPOUNDFLAG") == 0)
addFlagValue(Conf, recoded + strlen("COMPOUNDFLAG"),
! FF_COMPOUNDFLAG, filename, line);
else if (STRNCMP(recoded, "COMPOUNDBEGIN") == 0)
addFlagValue(Conf, recoded + strlen("COMPOUNDBEGIN"),
! FF_COMPOUNDBEGIN, filename, line);
else if (STRNCMP(recoded, "COMPOUNDLAST") == 0)
addFlagValue(Conf, recoded + strlen("COMPOUNDLAST"),
! FF_COMPOUNDLAST, filename, line);
/* COMPOUNDLAST and COMPOUNDEND are synonyms */
else if (STRNCMP(recoded, "COMPOUNDEND") == 0)
addFlagValue(Conf, recoded + strlen("COMPOUNDEND"),
! FF_COMPOUNDLAST, filename, line);
else if (STRNCMP(recoded, "COMPOUNDMIDDLE") == 0)
addFlagValue(Conf, recoded + strlen("COMPOUNDMIDDLE"),
! FF_COMPOUNDMIDDLE, filename, line);
else if (STRNCMP(recoded, "ONLYINCOMPOUND") == 0)
addFlagValue(Conf, recoded + strlen("ONLYINCOMPOUND"),
! FF_COMPOUNDONLY, filename, line);
else if (STRNCMP(recoded, "COMPOUNDPERMITFLAG") == 0)
addFlagValue(Conf, recoded + strlen("COMPOUNDPERMITFLAG"),
! FF_COMPOUNDPERMITFLAG, filename, line);
else if (STRNCMP(recoded, "COMPOUNDFORBIDFLAG") == 0)
addFlagValue(Conf, recoded + strlen("COMPOUNDFORBIDFLAG"),
! FF_COMPOUNDFORBIDFLAG, filename, line);
else if (STRNCMP(recoded, "FLAG") == 0)
{
char *s = recoded + strlen("FLAG");
--- 565,605 ----
errmsg("could not open affix file \"%s\": %m",
filename)));
! while ((recoded = t_readline(affix)) != NULL)
{
! lineno++;
if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#'))
+ {
+ pfree(recoded);
continue;
+ }
if (STRNCMP(recoded, "COMPOUNDFLAG") == 0)
addFlagValue(Conf, recoded + strlen("COMPOUNDFLAG"),
! FF_COMPOUNDFLAG, filename, lineno);
else if (STRNCMP(recoded, "COMPOUNDBEGIN") == 0)
addFlagValue(Conf, recoded + strlen("COMPOUNDBEGIN"),
! FF_COMPOUNDBEGIN, filename, lineno);
else if (STRNCMP(recoded, "COMPOUNDLAST") == 0)
addFlagValue(Conf, recoded + strlen("COMPOUNDLAST"),
! FF_COMPOUNDLAST, filename, lineno);
/* COMPOUNDLAST and COMPOUNDEND are synonyms */
else if (STRNCMP(recoded, "COMPOUNDEND") == 0)
addFlagValue(Conf, recoded + strlen("COMPOUNDEND"),
! FF_COMPOUNDLAST, filename, lineno);
else if (STRNCMP(recoded, "COMPOUNDMIDDLE") == 0)
addFlagValue(Conf, recoded + strlen("COMPOUNDMIDDLE"),
! FF_COMPOUNDMIDDLE, filename, lineno);
else if (STRNCMP(recoded, "ONLYINCOMPOUND") == 0)
addFlagValue(Conf, recoded + strlen("ONLYINCOMPOUND"),
! FF_COMPOUNDONLY, filename, lineno);
else if (STRNCMP(recoded, "COMPOUNDPERMITFLAG") == 0)
addFlagValue(Conf, recoded + strlen("COMPOUNDPERMITFLAG"),
! FF_COMPOUNDPERMITFLAG, filename, lineno);
else if (STRNCMP(recoded, "COMPOUNDFORBIDFLAG") == 0)
addFlagValue(Conf, recoded + strlen("COMPOUNDFORBIDFLAG"),
! FF_COMPOUNDFORBIDFLAG, filename, lineno);
else if (STRNCMP(recoded, "FLAG") == 0)
{
char *s = recoded + strlen("FLAG");
***************
*** 626,639 ****
ereport(ERROR,
(errcode(ERRCODE_CONFIG_FILE_ERROR),
errmsg("Ispell dictionary supports only default flag value at line %d of affix file \"%s\"",
! line, filename)));
}
! if (recoded != str)
! pfree(recoded);
}
FreeFile(affix);
! line = 0;
sprintf(scanbuf, "%%6s %%%ds %%%ds %%%ds %%%ds", BUFSIZ / 5, BUFSIZ / 5, BUFSIZ / 5, BUFSIZ / 5);
--- 611,623 ----
ereport(ERROR,
(errcode(ERRCODE_CONFIG_FILE_ERROR),
errmsg("Ispell dictionary supports only default flag value at line %d of affix file \"%s\"",
! lineno, filename)));
}
! pfree(recoded);
}
FreeFile(affix);
! lineno = 0;
sprintf(scanbuf, "%%6s %%%ds %%%ds %%%ds %%%ds", BUFSIZ / 5, BUFSIZ / 5, BUFSIZ / 5, BUFSIZ / 5);
***************
*** 643,660 ****
errmsg("could not open affix file \"%s\": %m",
filename)));
! while (fgets(str, sizeof(str), affix))
{
! char *recoded;
!
! recoded = (char *) pg_do_encoding_conversion((unsigned char *) str, strlen(str),
! PG_UTF8, GetDatabaseEncoding());
! if (recoded == NULL)
! elog(ERROR, "encoding conversion failed");
!
! line++;
if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#'))
! continue;
scanread = sscanf(recoded, scanbuf, type, sflag, find, repl, mask);
--- 627,637 ----
errmsg("could not open affix file \"%s\": %m",
filename)));
! while ((recoded = t_readline(affix)) != NULL)
{
! lineno++;
if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#'))
! goto nextline;
scanread = sscanf(recoded, scanbuf, type, sflag, find, repl, mask);
***************
*** 662,673 ****
pfree(ptype);
ptype = lowerstr_ctx(type);
if (scanread < 4 || (STRNCMP(ptype, "sfx") && STRNCMP(ptype, "pfx")))
! continue;
if (scanread == 4)
{
if (strlen(sflag) != 1)
! continue;
flag = *sflag;
isSuffix = (STRNCMP(ptype, "sfx") == 0) ? true : false;
pfind = lowerstr_ctx(find);
--- 639,650 ----
pfree(ptype);
ptype = lowerstr_ctx(type);
if (scanread < 4 || (STRNCMP(ptype, "sfx") && STRNCMP(ptype, "pfx")))
! goto nextline;
if (scanread == 4)
{
if (strlen(sflag) != 1)
! goto nextline;
flag = *sflag;
isSuffix = (STRNCMP(ptype, "sfx") == 0) ? true : false;
pfind = lowerstr_ctx(find);
***************
*** 683,689 ****
int aflg = 0;
if (strlen(sflag) != 1 || flag != *sflag || flag == 0)
! continue;
prepl = lowerstr_ctx(repl);
/* affix flag */
if ((ptr = strchr(prepl, '/')) != NULL)
--- 660,666 ----
int aflg = 0;
if (strlen(sflag) != 1 || flag != *sflag || flag == 0)
! goto nextline;
prepl = lowerstr_ctx(repl);
/* affix flag */
if ((ptr = strchr(prepl, '/')) != NULL)
***************
*** 710,717 ****
pfree(pmask);
}
! if (recoded != str)
! pfree(recoded);
}
if (ptype)
--- 687,694 ----
pfree(pmask);
}
! nextline:
! pfree(recoded);
}
if (ptype)
***************
*** 733,745 ****
char find[BUFSIZ];
char repl[BUFSIZ];
char *s;
! int suffixes = 0;
! int prefixes = 0;
int flag = 0;
char flagflags = 0;
FILE *affix;
! int line = 0;
! int oldformat = 0;
checkTmpCtx();
--- 710,723 ----
char find[BUFSIZ];
char repl[BUFSIZ];
char *s;
! bool suffixes = false;
! bool prefixes = false;
int flag = 0;
char flagflags = 0;
FILE *affix;
! int lineno = 0;
! bool oldformat = false;
! char *recoded = NULL;
checkTmpCtx();
***************
*** 752,767 ****
memset(Conf->flagval, 0, sizeof(Conf->flagval));
Conf->usecompound = false;
! while (fgets(str, sizeof(str), affix))
{
! if (pstr)
! pfree(pstr);
! pstr = recode_and_lowerstr(str);
! line++;
if (*pstr == '#' || *pstr == '\n')
! continue;
if (STRNCMP(pstr, "compoundwords") == 0)
{
--- 730,745 ----
memset(Conf->flagval, 0, sizeof(Conf->flagval));
Conf->usecompound = false;
! while ((recoded = t_readline(affix)) != NULL)
{
! pstr = lowerstr(recoded);
! pfree(recoded);
! lineno++;
! /* Skip comments and empty lines */
if (*pstr == '#' || *pstr == '\n')
! goto nextline;
if (STRNCMP(pstr, "compoundwords") == 0)
{
***************
*** 777,799 ****
Conf->flagval[(unsigned int) *s] = FF_COMPOUNDFLAG;
Conf->usecompound = true;
}
! oldformat++;
! continue;
}
}
if (STRNCMP(pstr, "suffixes") == 0)
{
! suffixes = 1;
! prefixes = 0;
! oldformat++;
! continue;
}
if (STRNCMP(pstr, "prefixes") == 0)
{
! suffixes = 0;
! prefixes = 1;
! oldformat++;
! continue;
}
if (STRNCMP(pstr, "flag") == 0)
{
--- 755,777 ----
Conf->flagval[(unsigned int) *s] = FF_COMPOUNDFLAG;
Conf->usecompound = true;
}
! oldformat = true;
! goto nextline;
}
}
if (STRNCMP(pstr, "suffixes") == 0)
{
! suffixes = true;
! prefixes = false;
! oldformat = true;
! goto nextline;
}
if (STRNCMP(pstr, "prefixes") == 0)
{
! suffixes = false;
! prefixes = true;
! oldformat = true;
! goto nextline;
}
if (STRNCMP(pstr, "flag") == 0)
{
***************
*** 802,815 ****
while (*s && t_isspace(s))
s++;
! oldformat++;
/* allow only single-encoded flags */
if (pg_mblen(s) != 1)
ereport(ERROR,
(errcode(ERRCODE_CONFIG_FILE_ERROR),
errmsg("multibyte flag character is not allowed at line %d of affix file \"%s\"",
! line, filename)));
if (*s == '*')
{
--- 780,793 ----
while (*s && t_isspace(s))
s++;
! oldformat = true;
/* allow only single-encoded flags */
if (pg_mblen(s) != 1)
ereport(ERROR,
(errcode(ERRCODE_CONFIG_FILE_ERROR),
errmsg("multibyte flag character is not allowed at line %d of affix file \"%s\"",
! lineno, filename)));
if (*s == '*')
{
***************
*** 830,839 ****
ereport(ERROR,
(errcode(ERRCODE_CONFIG_FILE_ERROR),
errmsg("multibyte flag character is not allowed at line %d of affix file \"%s\"",
! line, filename)));
flag = (unsigned char) *s;
! continue;
}
if (STRNCMP(str, "COMPOUNDFLAG") == 0 || STRNCMP(str, "COMPOUNDMIN") == 0 ||
STRNCMP(str, "PFX") == 0 || STRNCMP(str, "SFX") == 0)
--- 808,817 ----
ereport(ERROR,
(errcode(ERRCODE_CONFIG_FILE_ERROR),
errmsg("multibyte flag character is not allowed at line %d of affix file \"%s\"",
! lineno, filename)));
flag = (unsigned char) *s;
! goto nextline;
}
if (STRNCMP(str, "COMPOUNDFLAG") == 0 || STRNCMP(str, "COMPOUNDMIN") == 0 ||
STRNCMP(str, "PFX") == 0 || STRNCMP(str, "SFX") == 0)
***************
*** 842,864 ****
ereport(ERROR,
(errcode(ERRCODE_CONFIG_FILE_ERROR),
errmsg("wrong affix file format for flag at line %d of affix file \"%s\"",
! line, filename)));
FreeFile(affix);
NIImportOOAffixes(Conf, filename);
return;
}
if ((!suffixes) && (!prefixes))
! continue;
! if (!parse_affentry(pstr, mask, find, repl, filename, line))
! continue;
NIAddAffix(Conf, flag, flagflags, mask, find, repl, suffixes ? FF_SUFFIX : FF_PREFIX);
- }
- FreeFile(affix);
! if (pstr)
pfree(pstr);
}
static int
--- 820,842 ----
ereport(ERROR,
(errcode(ERRCODE_CONFIG_FILE_ERROR),
errmsg("wrong affix file format for flag at line %d of affix file \"%s\"",
! lineno, filename)));
FreeFile(affix);
NIImportOOAffixes(Conf, filename);
return;
}
if ((!suffixes) && (!prefixes))
! goto nextline;
! if (!parse_affentry(pstr, mask, find, repl, filename, lineno))
! goto nextline;
NIAddAffix(Conf, flag, flagflags, mask, find, repl, suffixes ? FF_SUFFIX : FF_PREFIX);
! nextline:
pfree(pstr);
+ }
+ FreeFile(affix);
}
static int
***************
*** 975,1012 ****
return rs;
}
void
NISortDictionary(IspellDict * Conf)
{
! size_t i;
! int naffix = 3;
checkTmpCtx();
/* compress affixes */
qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *), cmpspellaffix);
! for (i = 1; i < Conf->nspell; i++)
! if (strcmp(Conf->Spell[i]->p.flag, Conf->Spell[i - 1]->p.flag))
naffix++;
Conf->AffixData = (char **) palloc0(naffix * sizeof(char *));
! naffix = 1;
! Conf->AffixData[0] = pstrdup("");
! Conf->AffixData[1] = pstrdup(Conf->Spell[0]->p.flag);
! Conf->Spell[0]->p.d.affix = 1;
! Conf->Spell[0]->p.d.len = strlen(Conf->Spell[0]->word);
! for (i = 1; i < Conf->nspell; i++)
{
! if (strcmp(Conf->Spell[i]->p.flag, Conf->AffixData[naffix]))
{
! naffix++;
! Conf->AffixData[naffix] = pstrdup(Conf->Spell[i]->p.flag);
}
! Conf->Spell[i]->p.d.affix = naffix;
Conf->Spell[i]->p.d.len = strlen(Conf->Spell[i]->word);
}
Conf->lenAffixData = Conf->nAffixData = naffix;
qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *), cmpspell);
Conf->Dictionary = mkSPNode(Conf, 0, Conf->nspell, 0);
--- 953,1007 ----
return rs;
}
+ /*
+ * Builds the Conf->Dictionary tree and AffixData from the imported dictionary
+ * and affixes.
+ */
void
NISortDictionary(IspellDict * Conf)
{
! int i;
! int naffix = 0;
! int curaffix;
checkTmpCtx();
/* compress affixes */
+
+ /* Count the number of different flags used in the dictionary */
+
qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *), cmpspellaffix);
!
! naffix = 0;
! for (i = 0; i < Conf->nspell; i++)
! {
! if (i == 0 || strncmp(Conf->Spell[i]->p.flag, Conf->Spell[i - 1]->p.flag, MAXFLAGLEN))
naffix++;
+ }
+ /*
+ * Fill in Conf->AffixData with the affixes that were used
+ * in the dictionary. Replace textual flag-field of Conf->Spell
+ * entries with indexes into Conf->AffixData array.
+ */
Conf->AffixData = (char **) palloc0(naffix * sizeof(char *));
!
! curaffix = -1;
! for (i = 0; i < Conf->nspell; i++)
{
! if (i == 0 || strncmp(Conf->Spell[i]->p.flag, Conf->AffixData[curaffix], MAXFLAGLEN))
{
! curaffix++;
! Assert(curaffix < naffix);
! Conf->AffixData[curaffix] = pstrdup(Conf->Spell[i]->p.flag);
}
!
! Conf->Spell[i]->p.d.affix = curaffix;
Conf->Spell[i]->p.d.len = strlen(Conf->Spell[i]->word);
}
Conf->lenAffixData = Conf->nAffixData = naffix;
+
qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *), cmpspell);
Conf->Dictionary = mkSPNode(Conf, 0, Conf->nspell, 0);
***************
*** 1085,1091 ****
}
static void
! mkVoidAffix(IspellDict * Conf, int issuffix, int startsuffix)
{
int i,
cnt = 0;
--- 1080,1086 ----
}
static void
! mkVoidAffix(IspellDict * Conf, bool issuffix, int startsuffix)
{
int i,
cnt = 0;
***************
*** 1145,1151 ****
AFFIX *Affix;
size_t i;
CMPDAffix *ptr;
! int firstsuffix = -1;
checkTmpCtx();
--- 1140,1146 ----
AFFIX *Affix;
size_t i;
CMPDAffix *ptr;
! int firstsuffix = Conf->naffixes;
checkTmpCtx();
***************
*** 1160,1166 ****
for (i = 0; i < Conf->naffixes; i++)
{
Affix = &(((AFFIX *) Conf->Affix)[i]);
! if (Affix->type == FF_SUFFIX && firstsuffix < 0)
firstsuffix = i;
if ((Affix->flagflags & FF_COMPOUNDFLAG) && Affix->replen > 0 &&
--- 1155,1161 ----
for (i = 0; i < Conf->naffixes; i++)
{
Affix = &(((AFFIX *) Conf->Affix)[i]);
! if (Affix->type == FF_SUFFIX && i < firstsuffix)
firstsuffix = i;
if ((Affix->flagflags & FF_COMPOUNDFLAG) && Affix->replen > 0 &&
***************
*** 1185,1196 ****
Conf->Prefix = mkANode(Conf, 0, firstsuffix, 0, FF_PREFIX);
Conf->Suffix = mkANode(Conf, firstsuffix, Conf->naffixes, 0, FF_SUFFIX);
! mkVoidAffix(Conf, 1, firstsuffix);
! mkVoidAffix(Conf, 0, firstsuffix);
}
static AffixNodeData *
! FinfAffixes(AffixNode * node, const char *word, int wrdlen, int *level, int type)
{
AffixNodeData *StopLow,
*StopHigh,
--- 1180,1191 ----
Conf->Prefix = mkANode(Conf, 0, firstsuffix, 0, FF_PREFIX);
Conf->Suffix = mkANode(Conf, firstsuffix, Conf->naffixes, 0, FF_SUFFIX);
! mkVoidAffix(Conf, true, firstsuffix);
! mkVoidAffix(Conf, false, firstsuffix);
}
static AffixNodeData *
! FindAffixes(AffixNode * node, const char *word, int wrdlen, int *level, int type)
{
AffixNodeData *StopLow,
*StopHigh,
***************
*** 1374,1380 ****
plevel = 0;
while (pnode)
{
! prefix = FinfAffixes(pnode, word, wrdlen, &plevel, FF_PREFIX);
if (!prefix)
break;
for (j = 0; j < prefix->naff; j++)
--- 1369,1375 ----
plevel = 0;
while (pnode)
{
! prefix = FindAffixes(pnode, word, wrdlen, &plevel, FF_PREFIX);
if (!prefix)
break;
for (j = 0; j < prefix->naff; j++)
***************
*** 1398,1404 ****
int baselen = 0;
/* find possible suffix */
! suffix = FinfAffixes(snode, word, wrdlen, &slevel, FF_SUFFIX);
if (!suffix)
break;
/* foreach suffix check affix */
--- 1393,1399 ----
int baselen = 0;
/* find possible suffix */
! suffix = FindAffixes(snode, word, wrdlen, &slevel, FF_SUFFIX);
if (!suffix)
break;
/* foreach suffix check affix */
***************
*** 1416,1422 ****
swrdlen = strlen(newword);
while (pnode)
{
! prefix = FinfAffixes(pnode, newword, swrdlen, &plevel, FF_PREFIX);
if (!prefix)
break;
for (j = 0; j < prefix->naff; j++)
--- 1411,1417 ----
swrdlen = strlen(newword);
while (pnode)
{
! prefix = FindAffixes(pnode, newword, swrdlen, &plevel, FF_PREFIX);
if (!prefix)
break;
for (j = 0; j < prefix->naff; j++)
***************
*** 1626,1632 ****
if (wordlen == level + 1)
{
/* well, it was last word */
! var->stem[var->nstem] = strnduplicate(word + startpos, wordlen - startpos);
var->nstem++;
pfree(notprobed);
return var;
--- 1621,1627 ----
if (wordlen == level + 1)
{
/* well, it was last word */
! var->stem[var->nstem] = pnstrdup(word + startpos, wordlen - startpos);
var->nstem++;
pfree(notprobed);
return var;
***************
*** 1641,1647 ****
ptr->next = SplitToVariants(Conf, node, var, word, wordlen, startpos, level);
/* we can find next word */
level++;
! var->stem[var->nstem] = strnduplicate(word + startpos, level - startpos);
var->nstem++;
node = Conf->Dictionary;
startpos = level;
--- 1636,1642 ----
ptr->next = SplitToVariants(Conf, node, var, word, wordlen, startpos, level);
/* we can find next word */
level++;
! var->stem[var->nstem] = pnstrdup(word + startpos, level - startpos);
var->nstem++;
node = Conf->Dictionary;
startpos = level;
***************
*** 1656,1662 ****
level++;
}
! var->stem[var->nstem] = strnduplicate(word + startpos, wordlen - startpos);
var->nstem++;
pfree(notprobed);
return var;
--- 1651,1657 ----
level++;
}
! var->stem[var->nstem] = pnstrdup(word + startpos, wordlen - startpos);
var->nstem++;
pfree(notprobed);
return var;
Index: src/backend/tsearch/ts_locale.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/tsearch/ts_locale.c,v
retrieving revision 1.1
diff -c -r1.1 ts_locale.c
*** src/backend/tsearch/ts_locale.c 21 Aug 2007 01:11:18 -0000 1.1
--- src/backend/tsearch/ts_locale.c 24 Aug 2007 09:47:44 -0000
***************
*** 125,152 ****
}
#endif /* TS_USE_WIDE */
/*
! * Convert C-string from UTF8 to server encoding and
! * lower it
*/
char *
! recode_and_lowerstr(char *str)
{
! char *recoded;
! char *ret;
!
! recoded = (char *) pg_do_encoding_conversion((unsigned char *) str, strlen(str),
! PG_UTF8, GetDatabaseEncoding());
if (recoded == NULL)
elog(ERROR, "encoding conversion failed");
! ret = lowerstr(recoded);
!
! if (recoded != str)
! pfree(recoded);
! return ret;
}
char *
--- 125,169 ----
}
#endif /* TS_USE_WIDE */
+
/*
! * Utility function to read a line from a tsearch data file,
! * verify that it is valid UTF-8, and recode it to database
! * encoding. Returns NULL when fgets has nothing more to return;
! * otherwise the returned string is palloc'd.
! *
! * Lines are read through a fixed 4096-byte buffer; fgets hands
! * back a longer line in several pieces.
*/
char *
! t_readline(FILE *fp)
{
! int len;
! char *recoded;
! char buf[4096];
!
! if (fgets(buf, sizeof(buf), fp) == NULL)
! return NULL;
!
! /* fgets always null-terminates buf, so plain strlen is sufficient */
! len = strlen(buf);
!
! /* Make sure the input is valid UTF-8 */
! (void) pg_verify_mbstr(PG_UTF8, buf, len, false);
!
! recoded = (char *) pg_do_encoding_conversion(
! (unsigned char *) buf,
! len,
! PG_UTF8,
! GetDatabaseEncoding());
if (recoded == NULL)
elog(ERROR, "encoding conversion failed");
! if (recoded == buf)
! {
! /*
! * We got our own local buffer back, i.e. no conversion was
! * done. Copy it so the caller gets a palloc'd string; the
! * length of the original string is still valid.
! */
! recoded = pnstrdup(recoded, len);
! }
! return recoded;
}
char *
***************
*** 155,160 ****
--- 172,180 ----
return lowerstr_with_len(str, strlen(str));
}
+ /*
+ * Returned string is palloc'd
+ */
char *
lowerstr_with_len(char *str, int len)
{
Index: src/backend/tsearch/ts_parse.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/tsearch/ts_parse.c,v
retrieving revision 1.1
diff -c -r1.1 ts_parse.c
*** src/backend/tsearch/ts_parse.c 21 Aug 2007 01:11:18 -0000 1.1
--- src/backend/tsearch/ts_parse.c 23 Aug 2007 12:29:51 -0000
***************
*** 308,314 ****
{
/*
* Dictionary normalizes lexemes, so we remove from stack all
! * used lexemes , return to basic mode and redo end of stack
* (if it exists)
*/
if (res)
--- 308,314 ----
{
/*
* Dictionary normalizes lexemes, so we remove from stack all
! * used lexemes, return to basic mode and redo end of stack
* (if it exists)
*/
if (res)
***************
*** 571,577 ****
}
text *
! generatHeadline(HeadlineText * prs)
{
text *out;
int len = 128;
--- 571,577 ----
}
text *
! generateHeadline(HeadlineText * prs)
{
text *out;
int len = 128;
Index: src/backend/tsearch/ts_utils.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/tsearch/ts_utils.c,v
retrieving revision 1.2
diff -c -r1.2 ts_utils.c
*** src/backend/tsearch/ts_utils.c 22 Aug 2007 01:39:44 -0000 1.2
--- src/backend/tsearch/ts_utils.c 24 Aug 2007 10:57:58 -0000
***************
*** 63,83 ****
return result;
}
! #define STOPBUFLEN 4096
void
! readstoplist(char *in, StopList * s)
{
char **stop = NULL;
s->len = 0;
if (in && *in)
{
char *filename = get_tsearch_config_filename(in, "stop");
FILE *hin;
- char buf[STOPBUFLEN];
int reallen = 0;
- int line = 0;
if ((hin = AllocateFile(filename, "r")) == NULL)
ereport(ERROR,
--- 63,90 ----
return result;
}
! static int
! comparestr(const void *a, const void *b)
! {
! return strcmp(*(char **) a, *(char **) b);
! }
+ /*
+ * Reads a stopword file. Each word is run through 'wordop'
+ * function, if given.
+ */
void
! readstoplist(char *in, StopList * s, char *(*wordop) (char *))
{
char **stop = NULL;
+ char *line;
s->len = 0;
if (in && *in)
{
char *filename = get_tsearch_config_filename(in, "stop");
FILE *hin;
int reallen = 0;
if ((hin = AllocateFile(filename, "r")) == NULL)
ereport(ERROR,
***************
*** 85,109 ****
errmsg("could not open stopword file \"%s\": %m",
filename)));
! while (fgets(buf, STOPBUFLEN, hin))
{
! char *pbuf = buf;
! line++;
! while (*pbuf && !isspace(*pbuf))
pbuf++;
*pbuf = '\0';
! if (*buf == '\0')
! continue;
!
! if (!pg_verifymbstr(buf, strlen(buf), true))
{
! FreeFile(hin);
! ereport(ERROR,
! (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
! errmsg("invalid multibyte encoding at line %d in file \"%s\"",
! line, filename)));
}
if (s->len >= reallen)
--- 92,111 ----
errmsg("could not open stopword file \"%s\": %m",
filename)));
! while ((line = t_readline(hin)) != NULL)
{
! char *pbuf = line;
! /* Truncate the line at the first whitespace character */
! while (*pbuf && !t_isspace(pbuf))
pbuf++;
*pbuf = '\0';
! /* Skip empty lines */
! if (*line == '\0')
{
! pfree(line);
! continue;
}
if (s->len >= reallen)
***************
*** 120,130 ****
}
}
!
! if (s->wordop)
! stop[s->len] = s->wordop(buf);
else
! stop[s->len] = pstrdup(buf);
(s->len)++;
}
--- 122,135 ----
}
}
! if (wordop)
! {
! stop[s->len] = wordop(line);
! if (stop[s->len] != line)
! pfree(line);
! }
else
! stop[s->len] = line;
(s->len)++;
}
***************
*** 133,149 ****
}
s->stop = stop;
- }
-
- static int
- comparestr(const void *a, const void *b)
- {
- return strcmp(*(char **) a, *(char **) b);
- }
! void
! sortstoplist(StopList * s)
! {
if (s->stop && s->len > 0)
qsort(s->stop, s->len, sizeof(char *), comparestr);
}
--- 138,145 ----
}
s->stop = stop;
! /* Sort to allow binary searching */
if (s->stop && s->len > 0)
qsort(s->stop, s->len, sizeof(char *), comparestr);
}
Index: src/backend/tsearch/wparser.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/tsearch/wparser.c,v
retrieving revision 1.2
diff -c -r1.2 wparser.c
*** src/backend/tsearch/wparser.c 22 Aug 2007 01:39:45 -0000 1.2
--- src/backend/tsearch/wparser.c 23 Aug 2007 12:29:59 -0000
***************
*** 325,331 ****
PointerGetDatum(prsoptions),
PointerGetDatum(query));
! out = generatHeadline(&prs);
PG_FREE_IF_COPY(in, 1);
PG_FREE_IF_COPY(query, 2);
--- 325,331 ----
PointerGetDatum(prsoptions),
PointerGetDatum(query));
! out = generateHeadline(&prs);
PG_FREE_IF_COPY(in, 1);
PG_FREE_IF_COPY(query, 2);
Index: src/include/tsearch/ts_locale.h
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/tsearch/ts_locale.h,v
retrieving revision 1.1
diff -c -r1.1 ts_locale.h
*** src/include/tsearch/ts_locale.h 21 Aug 2007 01:11:29 -0000 1.1
--- src/include/tsearch/ts_locale.h 24 Aug 2007 09:48:14 -0000
***************
*** 83,88 ****
char *lowerstr(char *str);
char *lowerstr_with_len(char *str, int len);
! char *recode_and_lowerstr(char *str);
#endif /* __TSLOCALE_H__ */
--- 83,88 ----
char *lowerstr(char *str);
char *lowerstr_with_len(char *str, int len);
! char *t_readline(FILE *fp);
#endif /* __TSLOCALE_H__ */
Index: src/include/tsearch/ts_public.h
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/tsearch/ts_public.h,v
retrieving revision 1.2
diff -c -r1.2 ts_public.h
*** src/include/tsearch/ts_public.h 22 Aug 2007 01:39:46 -0000 1.2
--- src/include/tsearch/ts_public.h 23 Aug 2007 19:55:25 -0000
***************
*** 71,81 ****
{
int len;
char **stop;
- char *(*wordop) (char *);
} StopList;
! extern void sortstoplist(StopList * s);
! extern void readstoplist(char *in, StopList * s);
extern bool searchstoplist(StopList * s, char *key);
/*
--- 71,79 ----
{
int len;
char **stop;
} StopList;
! extern void readstoplist(char *in, StopList * s, char *(*wordop) (char *));
extern bool searchstoplist(StopList * s, char *key);
/*
Index: src/include/tsearch/ts_utils.h
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/tsearch/ts_utils.h,v
retrieving revision 1.1
diff -c -r1.1 ts_utils.h
*** src/include/tsearch/ts_utils.h 21 Aug 2007 01:11:29 -0000 1.1
--- src/include/tsearch/ts_utils.h 23 Aug 2007 12:30:32 -0000
***************
*** 102,108 ****
* headline framework, flow in common to generate:
* 1 parse text with hlparsetext
* 2 parser-specific function to find part
! * 3 generatHeadline to generate result text
*/
typedef struct
--- 102,108 ----
* headline framework, flow in common to generate:
* 1 parse text with hlparsetext
* 2 parser-specific function to find part
! * 3 generateHeadline to generate result text
*/
typedef struct
***************
*** 131,137 ****
extern void hlparsetext(Oid cfgId, HeadlineText * prs, TSQuery query,
char *buf, int4 buflen);
! extern text *generatHeadline(HeadlineText * prs);
/*
* token/node types for parsing
--- 131,137 ----
extern void hlparsetext(Oid cfgId, HeadlineText * prs, TSQuery query,
char *buf, int4 buflen);
! extern text *generateHeadline(HeadlineText * prs);
/*
* token/node types for parsing
Index: src/include/tsearch/dicts/spell.h
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/tsearch/dicts/spell.h,v
retrieving revision 1.1
diff -c -r1.1 spell.h
*** src/include/tsearch/dicts/spell.h 21 Aug 2007 01:11:29 -0000 1.1
--- src/include/tsearch/dicts/spell.h 24 Aug 2007 10:59:49 -0000
***************
*** 18,23 ****
--- 18,29 ----
#include "tsearch/dicts/regis.h"
#include "tsearch/ts_public.h"
+ /*
+ * Max length of a flag name. Names longer than this will be truncated
+ * to the maximum.
+ */
+ #define MAXFLAGLEN 16
+
struct SPNode;
typedef struct
***************
*** 54,67 ****
{
union
{
! char flag[16];
struct
{
int affix;
int len;
} d;
} p;
! char word[1];
} SPELL;
#define SPELLHDRSZ (offsetof(SPELL, word))
--- 60,76 ----
{
union
{
! /* flag is filled in by NIImportDictionary. After NISortDictionary,
! * d is valid and flag is invalid.
! */
! char flag[MAXFLAGLEN];
struct
{
int affix;
int len;
} d;
} p;
! char word[1]; /* variable length, null-terminated */
} SPELL;
#define SPELLHDRSZ (offsetof(SPELL, word))
***************
*** 90,95 ****
--- 99,110 ----
#define FF_COMPOUNDPERMITFLAG 0x10
#define FF_COMPOUNDFORBIDFLAG 0x20
#define FF_CROSSPRODUCT 0x40
+
+ /*
+ * Don't change the order of these. Initialization
+ * sorts by these values, and expects prefixes to
+ * come first after sorting.
+ */
#define FF_SUFFIX 1
#define FF_PREFIX 0
***************
*** 126,134 ****
int naffixes;
AFFIX *Affix;
! int nspell;
! int mspell;
SPELL **Spell;
AffixNode *Suffix;
AffixNode *Prefix;
--- 141,151 ----
int naffixes;
AFFIX *Affix;
! /* Temporary array of all words in the dict file. Only used during
! * initialization */
SPELL **Spell;
+ int nspell; /* number of entries in Spell-array */
+ int mspell; /* allocated length of Spell-array */
AffixNode *Suffix;
AffixNode *Prefix;