rtree improvements - Mailing list pgsql-patches

From Kenneth Been
Subject rtree improvements
Date
Msg-id 3BB0C90E.4020600@telocity.com
Whole thread Raw
Responses Re: rtree improvements
Re: rtree improvements
List pgsql-patches
I have made three changes to the rtree code: one bug fix and
two performance improvements.  I put an explanation of the
changes at

http://cs1.cs.nyu.edu/been/postgres-rtree.html

The performance improvements are quite significant.

All the changes are in the file src/backend/access/rtree/rtree.c

I was working with the 7.1.3 code.

I'm including the diff output as an attachment.

Ken

diff -Naur postgresql-7.1.3.old/src/backend/access/rtree/rtree.c postgresql-7.1.3/src/backend/access/rtree/rtree.c
--- postgresql-7.1.3.old/src/backend/access/rtree/rtree.c    Wed Mar 21 22:59:16 2001
+++ postgresql-7.1.3/src/backend/access/rtree/rtree.c    Tue Sep 25 13:56:31 2001
@@ -55,6 +55,14 @@
     Datum        spl_rdatum;
 } SPLITVEC;

+/* for sorting tuples by cost, for picking split */
+typedef struct SPLITCOST
+{
+    OffsetNumber    offset_number;
+    float            cost_differential;
+    bool            choose_left;
+} SPLITCOST;
+
 typedef struct RTSTATE
 {
     FmgrInfo    unionFn;        /* union function */
@@ -79,6 +87,7 @@
        RTSTATE *rtstate);
 static int    nospace(Page p, IndexTuple it);
 static void initRtstate(RTSTATE *rtstate, Relation index);
+static int qsort_comp_splitcost(const void *a, const void *b);


 Datum
@@ -427,7 +436,12 @@
     FunctionCall2(&rtstate->sizeFn, datum,
                   PointerGetDatum(&newd_size));

-    if (newd_size != old_size)
+    /*
+     * If newd_size == 0 we have degenerate rectangles, so we
+     * don't know if there was any change, so we have to
+     * assume there was.
+     */
+    if ((newd_size == 0) || (newd_size != old_size))
     {
         TupleDesc    td = RelationGetDescr(r);

@@ -503,6 +517,8 @@
     OffsetNumber *spl_left,
                *spl_right;
     TupleDesc    tupDesc;
+    int            n;
+    OffsetNumber newitemoff;

     p = (Page) BufferGetPage(buffer);
     opaque = (RTreePageOpaque) PageGetSpecialPointer(p);
@@ -539,56 +555,64 @@
     spl_right = v.spl_right;
     leftoff = rightoff = FirstOffsetNumber;
     maxoff = PageGetMaxOffsetNumber(p);
-    for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
-    {
-        itemid = PageGetItemId(p, i);
-        item = (IndexTuple) PageGetItem(p, itemid);
-
-        if (i == *spl_left)
-        {
-            if (PageAddItem(left, (Item) item, IndexTupleSize(item),
-                            leftoff, LP_USED) == InvalidOffsetNumber)
-                elog(ERROR, "rtdosplit: failed to copy index item in %s",
-                     RelationGetRelationName(r));
-            leftoff = OffsetNumberNext(leftoff);
-            spl_left++;            /* advance in left split vector */
-        }
-        else
-        {
-            Assert(i == *spl_right);
-            if (PageAddItem(right, (Item) item, IndexTupleSize(item),
-                            rightoff, LP_USED) == InvalidOffsetNumber)
-                elog(ERROR, "rtdosplit: failed to copy index item in %s",
-                     RelationGetRelationName(r));
-            rightoff = OffsetNumberNext(rightoff);
-            spl_right++;        /* advance in right split vector */
-        }
-    }
+    newitemoff = OffsetNumberNext(maxoff);

     /* build an InsertIndexResult for this insertion */
     res = (InsertIndexResult) palloc(sizeof(InsertIndexResultData));

-    /* now insert the new index tuple */
-    if (*spl_left == maxoff + 1)
+    /*
+     * spl_left contains a list of the offset numbers of the
+     * tuples that will go to the left page.  For each offset
+     * number, get the tuple item, then add the item to the
+     * left page.  Similarly for the right side.
+     */
+
+    /* fill left node */
+    for (n = 0; n < v.spl_nleft; n++)
     {
-        if (PageAddItem(left, (Item) itup, IndexTupleSize(itup),
+        i = *spl_left;
+        if (i == newitemoff)
+            item = itup;
+        else
+        {
+            itemid = PageGetItemId(p, i);
+            item = (IndexTuple) PageGetItem(p, itemid);
+        }
+
+        if (PageAddItem(left, (Item) item, IndexTupleSize(item),
                         leftoff, LP_USED) == InvalidOffsetNumber)
             elog(ERROR, "rtdosplit: failed to add index item to %s",
                  RelationGetRelationName(r));
         leftoff = OffsetNumberNext(leftoff);
-        ItemPointerSet(&(res->pointerData), lbknum, leftoff);
-        spl_left++;
+
+        if (i == newitemoff)
+            ItemPointerSet(&(res->pointerData), lbknum, leftoff);
+
+        spl_left++;        /* advance in left split vector */
     }
-    else
+
+    /* fill right node */
+    for (n = 0; n < v.spl_nright; n++)
     {
-        Assert(*spl_right == maxoff + 1);
-        if (PageAddItem(right, (Item) itup, IndexTupleSize(itup),
+        i = *spl_right;
+        if (i == newitemoff)
+            item = itup;
+        else
+        {
+            itemid = PageGetItemId(p, i);
+            item = (IndexTuple) PageGetItem(p, itemid);
+        }
+
+        if (PageAddItem(right, (Item) item, IndexTupleSize(item),
                         rightoff, LP_USED) == InvalidOffsetNumber)
             elog(ERROR, "rtdosplit: failed to add index item to %s",
                  RelationGetRelationName(r));
         rightoff = OffsetNumberNext(rightoff);
-        ItemPointerSet(&(res->pointerData), rbknum, rightoff);
-        spl_right++;
+
+        if (i == newitemoff)
+            ItemPointerSet(&(res->pointerData), rbknum, rightoff);
+
+        spl_right++;        /* advance in right split vector */
     }

     /* Make sure we consumed all of the split vectors, and release 'em */
@@ -741,8 +765,10 @@
  * In addition, the item to be added (itup) is listed in the appropriate
  * vector.    It is represented by item number N+1 (N = # of items on page).
  *
- * Both vectors appear in sequence order with a terminating sentinel value
- * of InvalidOffsetNumber.
+ * Both vectors have a terminating sentinel value of InvalidOffsetNumber,
+ * but the sentinal value is no longer used, because the SPLITVEC
+ * vector also contains the length of each vector, and that information
+ * is now used to iterate over them in rtdosplit(). --kbb, 21 Sept 2001
  *
  * The bounding-box datums for the two new pages are also returned in *v.
  *
@@ -797,6 +823,12 @@
                 item_2_sz,
                 left_avail_space,
                 right_avail_space;
+    int            total_num_tuples,
+                num_tuples_without_seeds,
+                max_after_split; /* in Guttman's lingo, (M - m) */
+    float        diff; /* diff between cost of putting tuple left or right */
+    SPLITCOST   *cost_vector;
+    int            n;

     /*
      * First, make sure the new item is not so large that we can't
@@ -812,6 +844,9 @@
     maxoff = PageGetMaxOffsetNumber(page);
     newitemoff = OffsetNumberNext(maxoff);        /* phony index for new
                                                  * item */
+    total_num_tuples = newitemoff;
+    num_tuples_without_seeds = total_num_tuples - 2;
+    max_after_split = total_num_tuples / 2;        /* works for m = M/2 */

     /* Make arrays big enough for worst case, including sentinel */
     nbytes = (maxoff + 2) * sizeof(OffsetNumber);
@@ -909,47 +944,111 @@
     right_avail_space = RTPageAvailSpace - IndexTupleTotalSize(item_2);

     /*
-     * Now split up the regions between the two seeds.    An important
-     * property of this split algorithm is that the split vector v has the
-     * indices of items to be split in order in its left and right
-     * vectors.  We exploit this property by doing a merge in the code
-     * that actually splits the page.
+     * Now split up the regions between the two seeds.
+     *
+     * The cost_vector array will contain hints for determining where
+     * each tuple should go.  Each record in the array will contain
+     * a boolean, choose_left, that indicates which node the tuple
+     * prefers to be on, and the absolute difference in cost between
+     * putting the tuple in its favored node and in the other node.
      *
-     * For efficiency, we also place the new index tuple in this loop. This
-     * is handled at the very end, when we have placed all the existing
-     * tuples and i == maxoff + 1.
+     * Later, we will sort the cost_vector in descending order by cost
+     * difference, and consider the tuples in that order for
+     * placement.  That way, the tuples that *really* want to be in
+     * one node or the other get to choose first, and the tuples that
+     * don't really care choose last.
+     *
+     * First, build the cost_vector array.  The new index tuple will
+     * also be handled in this loop, and represented in the array,
+     * with i==newitemoff.
+     *
+     * In the case of variable size tuples it is possible that we only
+     * have the two seeds and no other tuples, in which case we don't
+     * do any of this cost_vector stuff.
      */
+
+    /* to keep compiler quiet */
+    cost_vector = (SPLITCOST *) NULL;
+
+    if (num_tuples_without_seeds > 0)
+    {
+        cost_vector =
+            (SPLITCOST *) palloc(num_tuples_without_seeds * sizeof(SPLITCOST));
+        n = 0;
+        for (i = FirstOffsetNumber; i <= newitemoff; i = OffsetNumberNext(i))
+        {
+            /* Compute new union datums and sizes for both choices */
+
+            if ((i == seed_1) || (i == seed_2))
+                continue;
+            else if (i == newitemoff)
+                item_1 = itup;
+            else
+                item_1 = (IndexTuple) PageGetItem(page, PageGetItemId(page, i));
+
+            datum_alpha = IndexTupleGetDatum(item_1);
+            union_dl = FunctionCall2(&rtstate->unionFn, datum_l, datum_alpha);
+            union_dr = FunctionCall2(&rtstate->unionFn, datum_r, datum_alpha);
+            FunctionCall2(&rtstate->sizeFn, union_dl,
+                          PointerGetDatum(&size_alpha));
+            FunctionCall2(&rtstate->sizeFn, union_dr,
+                          PointerGetDatum(&size_beta));
+
+            diff = (size_alpha - size_l) - (size_beta - size_r);
+
+            cost_vector[n].offset_number = i;
+            cost_vector[n].cost_differential = fabs(diff);
+            cost_vector[n].choose_left = (diff < 0);
+
+            n++;
+        }
+
+        /*
+         * Sort the array.  The function qsort_comp_splitcost is
+         * set up "backwards", to provided descending order.
+         */
+        qsort(cost_vector, num_tuples_without_seeds, sizeof(SPLITCOST),
+              &qsort_comp_splitcost);
+    }
+
+    /*
+     * Now make the final decisions about where each tuple will go,
+     * and build the vectors to return in the SPLITVEC record.
+     *
+     * The cost_vector array contains (descriptions of) all the
+     * tuples, in the order that we want to consider them, so we
+     * we just iterate through it and place each tuple in left
+     * or right nodes, according to the criteria described below.
+     */
+
     left = v->spl_left;
     v->spl_nleft = 0;
     right = v->spl_right;
     v->spl_nright = 0;

-    for (i = FirstOffsetNumber; i <= newitemoff; i = OffsetNumberNext(i))
+    /* Place the seeds first.
+     * left avail space, left union, right avail space, and right
+     * union have already been adjusted for the seeds.
+     */
+
+    *left++ = seed_1;
+    v->spl_nleft++;
+
+    *right++ = seed_2;
+    v->spl_nright++;
+
+    for (n = 0; n < num_tuples_without_seeds; n++)
     {
         bool        left_feasible,
                     right_feasible,
                     choose_left;

         /*
-         * If we've already decided where to place this item, just put it
-         * on the correct list.  Otherwise, we need to figure out which
-         * page needs the least enlargement in order to store the item.
+         * We need to figure out which page needs the least
+         * enlargement in order to store the item.
          */

-        if (i == seed_1)
-        {
-            *left++ = i;
-            v->spl_nleft++;
-            /* left avail_space & union already includes this one */
-            continue;
-        }
-        if (i == seed_2)
-        {
-            *right++ = i;
-            v->spl_nright++;
-            /* right avail_space & union already includes this one */
-            continue;
-        }
+        i = cost_vector[n].offset_number;

         /* Compute new union datums and sizes for both possible additions */
         if (i == newitemoff)
@@ -979,6 +1078,24 @@
          * (We know that all the old items together can fit on one page, so
          * we need not worry about any other problem than failing to fit
          * the new item.)
+         *
+         * Guttman's algorithm actually has two factors to consider (in
+         * order):  1. if one node has so many tuples already assigned to
+         * it that the other needs all the rest in order to satisfy the
+         * condition that neither node has fewer than m tuples, then
+         * that is decisive; 2. otherwise, choose the page that shows
+         * the smaller enlargement of its union area.
+         *
+         * I have chosen m = M/2, where M is the maximum number of
+         * tuples on a page.  (Actually, this is only strictly
+         * true for fixed size tuples.  For variable size tuples,
+         * there still might have to be only one tuple on a page,
+         * if it is really big.  But even with variable size
+         * tuples we still try to get m as close as possible to M/2.)
+         *
+         * The question of which page shows the smaller enlargement of
+         * its union area has already been answered, and the answer
+         * stored in the choose_left field of the SPLITCOST record.
          */
         left_feasible = (left_avail_space >= item_1_sz &&
                          ((left_avail_space - item_1_sz) >= newitemsz ||
@@ -988,8 +1105,18 @@
                            left_avail_space >= newitemsz));
         if (left_feasible && right_feasible)
         {
-            /* Both feasible, use Guttman's algorithm */
-            choose_left = (size_alpha - size_l < size_beta - size_r);
+            /*
+             * Both feasible, use Guttman's algorithm.
+             * First check the m condition described above, and if
+             * that doesn't apply, choose the page with the smaller
+             * enlargement of its union area.
+             */
+            if (v->spl_nleft > max_after_split)
+                choose_left = false;
+            else if (v->spl_nright > max_after_split)
+                choose_left = true;
+            else
+                choose_left = cost_vector[n].choose_left;
         }
         else if (left_feasible)
             choose_left = true;
@@ -1023,6 +1150,11 @@
         }
     }

+    if (num_tuples_without_seeds > 0)
+    {
+        pfree(cost_vector);
+    }
+
     *left = *right = InvalidOffsetNumber;        /* add ending sentinels */

     v->spl_ldatum = datum_l;
@@ -1152,6 +1284,21 @@
     fmgr_info(size_proc, &rtstate->sizeFn);
     fmgr_info(inter_proc, &rtstate->interFn);
     return;
+}
+
+/* for sorting SPLITCOST records in descending order */
+static int
+qsort_comp_splitcost(const void *a, const void *b)
+{
+    float diff =
+        ((SPLITCOST *)a)->cost_differential -
+        ((SPLITCOST *)b)->cost_differential;
+    if (diff < 0)
+        return 1;
+    else if (diff > 0)
+        return -1;
+    else
+        return 0;
 }

 #ifdef RTDEBUG

pgsql-patches by date:

Previous
From: Marko Kreen
Date:
Subject: pgcrypto Makefile update
Next
From: yasen
Date:
Subject: troubles with setuid patch