Re: Bug with Tsearch and tsvector - Mailing list pgsql-bugs
From | Tom Lane |
---|---|
Subject | Re: Bug with Tsearch and tsvector |
Date | |
Msg-id | 4161.1272420374@sss.pgh.pa.us |
In response to | Re: Bug with Tsearch and tsvector ("Kevin Grittner" <Kevin.Grittner@wicourts.gov>) |
List | pgsql-bugs |
"Kevin Grittner" <Kevin.Grittner@wicourts.gov> writes: > reserved = gen-delims / sub-delims > gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@" > sub-delims = "!" / "$" / "&" / "'" / "(" / ")" > / "*" / "+" / "," / ";" / "=" > unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" > I think that we should accept all the above characters (reserved and > unreserved) and the percent character (since it is the escape > character) as part of a URL. I've applied the attached patch to make it work that way. regards, tom lane Index: src/backend/tsearch/wparser_def.c =================================================================== RCS file: /cvsroot/pgsql/src/backend/tsearch/wparser_def.c,v retrieving revision 1.29 diff -c -r1.29 wparser_def.c *** src/backend/tsearch/wparser_def.c 26 Apr 2010 17:10:18 -0000 1.29 --- src/backend/tsearch/wparser_def.c 28 Apr 2010 01:57:14 -0000 *************** *** 583,588 **** --- 583,617 ---- return (p_isascii(prs) && p_isalpha(prs)) ? 1 : 0; } + static int + p_isurlchar(TParser *prs) + { + char ch; + + /* no non-ASCII need apply */ + if (prs->state->charlen != 1) + return 0; + ch = *(prs->str + prs->state->posbyte); + /* no spaces or control characters */ + if (ch <= 0x20 || ch >= 0x7F) + return 0; + /* reject characters disallowed by RFC 3986 */ + switch (ch) + { + case '"': + case '<': + case '>': + case '\\': + case '^': + case '`': + case '{': + case '|': + case '}': + return 0; + } + return 1; + } + /* deliberately suppress unused-function complaints for the above */ void _make_compiler_happy(void); *************** *** 707,715 **** int res = 0; tmpprs->state = newTParserPosition(tmpprs->state); ! tmpprs->state->state = TPS_InFileFirst; ! if (TParserGet(tmpprs) && (tmpprs->type == URLPATH || tmpprs->type == FILEPATH)) { prs->state->posbyte += tmpprs->lenbytetoken; prs->state->poschar += tmpprs->lenchartoken; --- 736,744 ---- int res = 0; tmpprs->state = newTParserPosition(tmpprs->state); ! tmpprs->state->state = TPS_InURLPathFirst; ! 
if (TParserGet(tmpprs) && tmpprs->type == URLPATH) { prs->state->posbyte += tmpprs->lenbytetoken; prs->state->poschar += tmpprs->lenchartoken; *************** *** 1441,1447 **** {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL}, {p_iseqC, '.', A_NEXT, TPS_InPathFirst, 0, NULL}, {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL}, - {p_iseqC, '?', A_PUSH, TPS_InURLPathFirst, 0, NULL}, {p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL}, {NULL, 0, A_POP, TPS_Null, 0, NULL} }; --- 1470,1475 ---- *************** *** 1488,1494 **** {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL}, {p_iseqC, '-', A_NEXT, TPS_InFile, 0, NULL}, {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL}, - {p_iseqC, '?', A_PUSH, TPS_InURLPathFirst, 0, NULL}, {NULL, 0, A_BINGO, TPS_Base, FILEPATH, NULL} }; --- 1516,1521 ---- *************** *** 1502,1510 **** static const TParserStateActionItem actionTPS_InURLPathFirst[] = { {p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, ! {p_iseqC, '"', A_POP, TPS_Null, 0, NULL}, ! {p_iseqC, '\'', A_POP, TPS_Null, 0, NULL}, ! {p_isnotspace, 0, A_CLEAR, TPS_InURLPath, 0, NULL}, {NULL, 0, A_POP, TPS_Null, 0, NULL}, }; --- 1529,1535 ---- static const TParserStateActionItem actionTPS_InURLPathFirst[] = { {p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, ! {p_isurlchar, 0, A_NEXT, TPS_InURLPath, 0, NULL}, {NULL, 0, A_POP, TPS_Null, 0, NULL}, }; *************** *** 1514,1522 **** static const TParserStateActionItem actionTPS_InURLPath[] = { {p_isEOF, 0, A_BINGO, TPS_Base, URLPATH, NULL}, ! {p_iseqC, '"', A_BINGO, TPS_Base, URLPATH, NULL}, ! {p_iseqC, '\'', A_BINGO, TPS_Base, URLPATH, NULL}, ! {p_isnotspace, 0, A_NEXT, TPS_InURLPath, 0, NULL}, {NULL, 0, A_BINGO, TPS_Base, URLPATH, NULL} }; --- 1539,1545 ---- static const TParserStateActionItem actionTPS_InURLPath[] = { {p_isEOF, 0, A_BINGO, TPS_Base, URLPATH, NULL}, ! 
{p_isurlchar, 0, A_NEXT, TPS_InURLPath, 0, NULL}, {NULL, 0, A_BINGO, TPS_Base, URLPATH, NULL} }; Index: src/test/regress/expected/tsearch.out =================================================================== RCS file: /cvsroot/pgsql/src/test/regress/expected/tsearch.out,v retrieving revision 1.17 diff -c -r1.17 tsearch.out *** src/test/regress/expected/tsearch.out 22 Nov 2009 05:20:41 -0000 1.17 --- src/test/regress/expected/tsearch.out 28 Apr 2010 01:57:14 -0000 *************** *** 287,294 **** 6 | 4aew.werc.ewr 12 | 14 | http:// 6 | 5aew.werc.ewr:8100 ! 12 | /? 1 | ad 12 | = 1 | qwe --- 287,296 ---- 6 | 4aew.werc.ewr 12 | 14 | http:// + 5 | 5aew.werc.ewr:8100/? 6 | 5aew.werc.ewr:8100 ! 18 | /? ! 12 | 1 | ad 12 | = 1 | qwe *************** *** 391,404 **** 12 | 12 | <> 1 | qwerty ! (131 rows) SELECT to_tsvector('english', '345 qwe@efd.r '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/? ad=qwe&dw6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teodor@stack.netqwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>"> /usr/local/fff /awdf/dwqe/4325 rewt/ewr wefjn /wqe-324/ewr gist.h gist.h.c gist.c. readline 4.2 4.2. 4.2, readline-4.2readline-4.2. 234 <i <b> wow < jqw <> qwerty'); ! to_tsvector ! 
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- ! '+4.0e-10':26 '-4.2':58,60 '/?ad=qwe&dw':7,10,14,22 '/?ad=qwe&dw=%20%32':25 '/awdf/dwqe/4325':46 '/usr/local/fff':45 '/wqe-324/ewr':49'1aew.werc.ewr':9 '1aew.werc.ewr/?ad=qwe&dw':8 '234':61 '234.435':30 '2aew.werc.ewr':11 '345':1 '3aew.werc.ewr':13'3aew.werc.ewr/?ad=qwe&dw':12 '4.2':54,55,56 '455':31 '4aew.werc.ewr':15 '5.005':32 '5aew.werc.ewr:8100':16'6aew.werc.ewr:8100':21 '6aew.werc.ewr:8100/?ad=qwe&dw':20 '7aew.werc.ewr:8100':24 '7aew.werc.ewr:8100/?ad=qwe&dw=%20%32':23'ad':17 'aew.werc.ewr':6 'aew.werc.ewr/?ad=qwe&dw':5 'asdf':37 'dw':19 'efd.r':3'ewr1':43 'ewri2':44 'gist.c':52 'gist.h':50 'gist.h.c':51 'hjwer':42 'jf':39 'jqw':64 'qwe':2,18,27,28,35 'qwe-wer':34'qwer':38 'qwerti':65 'qwqwe':29 'readlin':53,57,59 'rewt/ewr':47 'sdjk':40 'teodor@stack.net':33 'wefjn':48'wer':36 'wow':63 'www.com':4 (1 row) SELECT length(to_tsvector('english', '345 qwe@efd.r '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/? 
ad=qwe&dw6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teodor@stack.netqwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>"> --- 393,406 ---- 12 | 12 | <> 1 | qwerty ! (133 rows) SELECT to_tsvector('english', '345 qwe@efd.r '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/? ad=qwe&dw6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teodor@stack.netqwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>"> /usr/local/fff /awdf/dwqe/4325 rewt/ewr wefjn /wqe-324/ewr gist.h gist.h.c gist.c. readline 4.2 4.2. 4.2, readline-4.2readline-4.2. 234 <i <b> wow < jqw <> qwerty'); ! to_tsvector ! ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ ! 
'+4.0e-10':28 '-4.2':60,62 '/?':18 '/?ad=qwe&dw':7,10,14,24 '/?ad=qwe&dw=%20%32':27 '/awdf/dwqe/4325':48 '/usr/local/fff':47'/wqe-324/ewr':51 '1aew.werc.ewr':9 '1aew.werc.ewr/?ad=qwe&dw':8 '234':63 '234.435':32 '2aew.werc.ewr':11'345':1 '3aew.werc.ewr':13 '3aew.werc.ewr/?ad=qwe&dw':12 '4.2':56,57,58 '455':33 '4aew.werc.ewr':15 '5.005':34'5aew.werc.ewr:8100':17 '5aew.werc.ewr:8100/?':16 '6aew.werc.ewr:8100':23 '6aew.werc.ewr:8100/?ad=qwe&dw':22 '7aew.werc.ewr:8100':26'7aew.werc.ewr:8100/?ad=qwe&dw=%20%32':25 'ad':19 'aew.werc.ewr':6 'aew.werc.ewr/?ad=qwe&dw':5 'asdf':39'dw':21 'efd.r':3 'ewr1':45 'ewri2':46 'gist.c':54 'gist.h':52 'gist.h.c':53 'hjwer':44 'jf':41 'jqw':66 'qwe':2,20,29,30,37'qwe-wer':36 'qwer':40 'qwerti':67 'qwqwe':31 'readlin':55,59,61 'rewt/ewr':49 'sdjk':42 'teodor@stack.net':35'wefjn':50 'wer':38 'wow':65 'www.com':4 (1 row) SELECT length(to_tsvector('english', '345 qwe@efd.r '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/? ad=qwe&dw6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teodor@stack.netqwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>"> *************** *** 406,412 **** <i <b> wow < jqw <> qwerty')); length -------- ! 51 (1 row) -- ts_debug --- 408,414 ---- <i <b> wow < jqw <> qwerty')); length -------- ! 
53 (1 row) -- ts_debug *************** *** 424,429 **** --- 426,469 ---- tag | XML tag | </myns:foo-bar_baz.blurfl> | {} | | (9 rows) + -- check parsing of URLs + SELECT * from ts_debug('english', 'http://www.harewoodsolutions.co.uk/press.aspx</span>'); + alias | description | token | dictionaries | dictionary | lexemes + ----------+---------------+----------------------------------------+--------------+------------+------------------------------------------ + protocol | Protocol head | http:// | {} | | + url | URL | www.harewoodsolutions.co.uk/press.aspx | {simple} | simple | {www.harewoodsolutions.co.uk/press.aspx} + host | Host | www.harewoodsolutions.co.uk | {simple} | simple | {www.harewoodsolutions.co.uk} + url_path | URL path | /press.aspx | {simple} | simple | {/press.aspx} + tag | XML tag | </span> | {} | | + (5 rows) + + SELECT * from ts_debug('english', 'http://aew.wer0c.ewr/id?ad=qwe&dw<span>'); + alias | description | token | dictionaries | dictionary | lexemes + ----------+---------------+----------------------------+--------------+------------+------------------------------ + protocol | Protocol head | http:// | {} | | + url | URL | aew.wer0c.ewr/id?ad=qwe&dw | {simple} | simple | {aew.wer0c.ewr/id?ad=qwe&dw} + host | Host | aew.wer0c.ewr | {simple} | simple | {aew.wer0c.ewr} + url_path | URL path | /id?ad=qwe&dw | {simple} | simple | {/id?ad=qwe&dw} + tag | XML tag | <span> | {} | | + (5 rows) + + SELECT * from ts_debug('english', 'http://5aew.werc.ewr:8100/?'); + alias | description | token | dictionaries | dictionary | lexemes + ----------+---------------+----------------------+--------------+------------+------------------------ + protocol | Protocol head | http:// | {} | | + url | URL | 5aew.werc.ewr:8100/? | {simple} | simple | {5aew.werc.ewr:8100/?} + host | Host | 5aew.werc.ewr:8100 | {simple} | simple | {5aew.werc.ewr:8100} + url_path | URL path | /? 
| {simple} | simple | {/?} + (4 rows) + + SELECT * from ts_debug('english', '5aew.werc.ewr:8100/?xx'); + alias | description | token | dictionaries | dictionary | lexemes + ----------+-------------+------------------------+--------------+------------+-------------------------- + url | URL | 5aew.werc.ewr:8100/?xx | {simple} | simple | {5aew.werc.ewr:8100/?xx} + host | Host | 5aew.werc.ewr:8100 | {simple} | simple | {5aew.werc.ewr:8100} + url_path | URL path | /?xx | {simple} | simple | {/?xx} + (3 rows) + -- to_tsquery SELECT to_tsquery('english', 'qwe & sKies '); to_tsquery Index: src/test/regress/sql/tsearch.sql =================================================================== RCS file: /cvsroot/pgsql/src/test/regress/sql/tsearch.sql,v retrieving revision 1.11 diff -c -r1.11 tsearch.sql *** src/test/regress/sql/tsearch.sql 19 May 2009 02:48:26 -0000 1.11 --- src/test/regress/sql/tsearch.sql 28 Apr 2010 01:57:14 -0000 *************** *** 105,110 **** --- 105,116 ---- SELECT * from ts_debug('english', '<myns:foo-bar_baz.blurfl>abc&nm1;def©ghiõjkl</myns:foo-bar_baz.blurfl>'); + -- check parsing of URLs + SELECT * from ts_debug('english', 'http://www.harewoodsolutions.co.uk/press.aspx</span>'); + SELECT * from ts_debug('english', 'http://aew.wer0c.ewr/id?ad=qwe&dw<span>'); + SELECT * from ts_debug('english', 'http://5aew.werc.ewr:8100/?'); + SELECT * from ts_debug('english', '5aew.werc.ewr:8100/?xx'); + -- to_tsquery SELECT to_tsquery('english', 'qwe & sKies ');
pgsql-bugs by date: