Re: Missing CFI in hlCover()? - Mailing list pgsql-hackers

From Tom Lane
Subject Re: Missing CFI in hlCover()?
Date
Msg-id 1660074.1596158722@sss.pgh.pa.us
Whole thread Raw
In response to Re: Missing CFI in hlCover()?  (Stephen Frost <sfrost@snowman.net>)
Responses Re: Missing CFI in hlCover()?
List pgsql-hackers
Stephen Frost <sfrost@snowman.net> writes:
> * Tom Lane (tgl@sss.pgh.pa.us) wrote:
>> Here's a proposed patch along that line.

> I've back-patched this to 11 (which was just a bit of fuzz) and tested
> it out with a couple of different queries that were causing issues
> previously on the archive server, and they finish in a much more
> reasonable time and react faster to cancel requests/signals.

Yeah, I'd tried this locally using the data from the one test case you
showed me, and it seemed to fix that.

> So, looks good to me, and would certainly be nice to get this into the
> next set of releases, so the archive server doesn't get stuck anymore.

I'll push this tomorrow if nobody has objected to it.

BTW, I had noticed last night that hlFirstIndex is being unreasonably
stupid.  Many of the "words" have null item pointers and hence can't
possibly match any query item (I think because we have "words" for
inter-word spaces/punctuation as well as the actual words).  Checking
that, as in the attached v2 patch, makes things a bit faster yet.

            regards, tom lane

diff --git a/src/backend/tsearch/wparser_def.c b/src/backend/tsearch/wparser_def.c
index 76b6f9aef0..92517e4c17 100644
--- a/src/backend/tsearch/wparser_def.c
+++ b/src/backend/tsearch/wparser_def.c
@@ -2011,13 +2011,18 @@ hlFirstIndex(HeadlineParsedText *prs, TSQuery query, int pos)
     for (i = pos; i < prs->curwords; i++)
     {
         /* ... scan the query to see if this word matches any operand */
-        QueryItem  *item = GETQUERY(query);
+        QueryOperand *worditem = prs->words[i].item;
+        QueryItem  *item;
         int            j;

+        if (worditem == NULL)
+            continue;            /* certainly not a match */
+
+        item = GETQUERY(query);
         for (j = 0; j < query->size; j++)
         {
             if (item->type == QI_VAL &&
-                prs->words[i].item == &item->qoperand)
+                worditem == &item->qoperand)
                 return i;
             item++;
         }
@@ -2028,8 +2033,14 @@ hlFirstIndex(HeadlineParsedText *prs, TSQuery query, int pos)
 /*
  * hlCover: try to find a substring of prs' word list that satisfies query
  *
- * At entry, *p must be the first word index to consider (initialize this to
- * zero, or to the next index after a previous successful search).
+ * At entry, *p must be the first word index to consider (initialize this
+ * to zero, or to the next index after a previous successful search).
+ * We will consider all substrings starting at or after that word, and
+ * containing no more than max_cover words.  (We need a length limit to
+ * keep this from taking O(N^2) time for a long document with many query
+ * words but few complete matches.  Actually, since checkcondition_HL is
+ * roughly O(N) in the length of the substring being checked, it's even
+ * worse than that.)
  *
  * On success, sets *p to first word index and *q to last word index of the
  * cover substring, and returns true.
@@ -2038,7 +2049,8 @@ hlFirstIndex(HeadlineParsedText *prs, TSQuery query, int pos)
  * words used in the query.
  */
 static bool
-hlCover(HeadlineParsedText *prs, TSQuery query, int *p, int *q)
+hlCover(HeadlineParsedText *prs, TSQuery query, int max_cover,
+        int *p, int *q)
 {
     int            pmin,
                 pmax,
@@ -2084,7 +2096,7 @@ hlCover(HeadlineParsedText *prs, TSQuery query, int *p, int *q)
                 nextpmin = nextpmax;
             pmax = nextpmax;
         }
-        while (pmax >= 0);
+        while (pmax >= 0 && pmax - pmin < max_cover);
         /* No luck here, so try next feasible startpoint */
         pmin = nextpmin;
     }
@@ -2186,7 +2198,7 @@ get_next_fragment(HeadlineParsedText *prs, int *startpos, int *endpos,
 static void
 mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, bool highlightall,
                   int shortword, int min_words,
-                  int max_words, int max_fragments)
+                  int max_words, int max_fragments, int max_cover)
 {
     int32        poslen,
                 curlen,
@@ -2213,7 +2225,7 @@ mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, bool highlightall,
     covers = palloc(maxcovers * sizeof(CoverPos));

     /* get all covers */
-    while (hlCover(prs, query, &p, &q))
+    while (hlCover(prs, query, max_cover, &p, &q))
     {
         startpos = p;
         endpos = q;
@@ -2368,7 +2380,7 @@ mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, bool highlightall,
  */
 static void
 mark_hl_words(HeadlineParsedText *prs, TSQuery query, bool highlightall,
-              int shortword, int min_words, int max_words)
+              int shortword, int min_words, int max_words, int max_cover)
 {
     int            p = 0,
                 q = 0;
@@ -2386,7 +2398,7 @@ mark_hl_words(HeadlineParsedText *prs, TSQuery query, bool highlightall,
     if (!highlightall)
     {
         /* examine all covers, select a headline using the best one */
-        while (hlCover(prs, query, &p, &q))
+        while (hlCover(prs, query, max_cover, &p, &q))
         {
             /*
              * Count words (curlen) and interesting words (poslen) within
@@ -2542,6 +2554,7 @@ prsd_headline(PG_FUNCTION_ARGS)
     int            shortword = 3;
     int            max_fragments = 0;
     bool        highlightall = false;
+    int            max_cover;
     ListCell   *l;

     /* Extract configuration option values */
@@ -2581,6 +2594,15 @@ prsd_headline(PG_FUNCTION_ARGS)
                             defel->defname)));
     }

+    /*
+     * We might eventually make max_cover a user-settable parameter, but for
+     * now, just compute a reasonable value based on max_words and
+     * max_fragments.
+     */
+    max_cover = Max(max_words * 10, 100);
+    if (max_fragments > 0)
+        max_cover *= max_fragments;
+
     /* in HighlightAll mode these parameters are ignored */
     if (!highlightall)
     {
@@ -2605,10 +2627,10 @@ prsd_headline(PG_FUNCTION_ARGS)
     /* Apply appropriate headline selector */
     if (max_fragments == 0)
         mark_hl_words(prs, query, highlightall, shortword,
-                      min_words, max_words);
+                      min_words, max_words, max_cover);
     else
         mark_hl_fragments(prs, query, highlightall, shortword,
-                          min_words, max_words, max_fragments);
+                          min_words, max_words, max_fragments, max_cover);

     /* Fill in default values for string options */
     if (!prs->startsel)
diff --git a/src/backend/utils/adt/tsvector_op.c b/src/backend/utils/adt/tsvector_op.c
index f01b1ee253..756a48a167 100644
--- a/src/backend/utils/adt/tsvector_op.c
+++ b/src/backend/utils/adt/tsvector_op.c
@@ -1868,6 +1868,9 @@ TS_execute_recurse(QueryItem *curitem, void *arg, uint32 flags,
     /* since this function recurses, it could be driven to stack overflow */
     check_stack_depth();

+    /* ... and let's check for query cancel while we're at it */
+    CHECK_FOR_INTERRUPTS();
+
     if (curitem->type == QI_VAL)
         return chkcond(arg, (QueryOperand *) curitem,
                        NULL /* don't need position info */ );

pgsql-hackers by date:

Previous
From: Robert Haas
Date:
Subject: Re: new heapcheck contrib module
Next
From: Peter Geoghegan
Date:
Subject: Re: HashAgg's batching counter starts at 0, but Hash's starts at 1. (now: incremental sort)