Thread: rtree improvements

rtree improvements

From

Kenneth Been

Date:

25 September 2001, 17:16:40

I have made three changes to the rtree code: one bug fix and
two performance improvements.  I put an explanation of the
changes at

http://cs1.cs.nyu.edu/been/postgres-rtree.html

The performance improvements are quite significant.

All the changes are in the file src/backend/access/rtree/rtree.c

I was working with the 7.1.3 code.

I'm including the diff output as an attachment.

Ken

diff -Naur postgresql-7.1.3.old/src/backend/access/rtree/rtree.c postgresql-7.1.3/src/backend/access/rtree/rtree.c
--- postgresql-7.1.3.old/src/backend/access/rtree/rtree.c    Wed Mar 21 22:59:16 2001
+++ postgresql-7.1.3/src/backend/access/rtree/rtree.c    Tue Sep 25 13:56:31 2001
@@ -55,6 +55,14 @@
     Datum        spl_rdatum;
 } SPLITVEC;

+/* for sorting tuples by cost, for picking split */
+typedef struct SPLITCOST
+{
+    OffsetNumber    offset_number;
+    float            cost_differential;
+    bool            choose_left;
+} SPLITCOST;
+
 typedef struct RTSTATE
 {
     FmgrInfo    unionFn;        /* union function */
@@ -79,6 +87,7 @@
        RTSTATE *rtstate);
 static int    nospace(Page p, IndexTuple it);
 static void initRtstate(RTSTATE *rtstate, Relation index);
+static int qsort_comp_splitcost(const void *a, const void *b);


 Datum
@@ -427,7 +436,12 @@
     FunctionCall2(&rtstate->sizeFn, datum,
                   PointerGetDatum(&newd_size));

-    if (newd_size != old_size)
+    /*
+     * If newd_size == 0 we have degenerate rectangles, so we
+     * don't know if there was any change, so we have to
+     * assume there was.
+     */
+    if ((newd_size == 0) || (newd_size != old_size))
     {
         TupleDesc    td = RelationGetDescr(r);

@@ -503,6 +517,8 @@
     OffsetNumber *spl_left,
                *spl_right;
     TupleDesc    tupDesc;
+    int            n;
+    OffsetNumber newitemoff;

     p = (Page) BufferGetPage(buffer);
     opaque = (RTreePageOpaque) PageGetSpecialPointer(p);
@@ -539,56 +555,64 @@
     spl_right = v.spl_right;
     leftoff = rightoff = FirstOffsetNumber;
     maxoff = PageGetMaxOffsetNumber(p);
-    for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
-    {
-        itemid = PageGetItemId(p, i);
-        item = (IndexTuple) PageGetItem(p, itemid);
-
-        if (i == *spl_left)
-        {
-            if (PageAddItem(left, (Item) item, IndexTupleSize(item),
-                            leftoff, LP_USED) == InvalidOffsetNumber)
-                elog(ERROR, "rtdosplit: failed to copy index item in %s",
-                     RelationGetRelationName(r));
-            leftoff = OffsetNumberNext(leftoff);
-            spl_left++;            /* advance in left split vector */
-        }
-        else
-        {
-            Assert(i == *spl_right);
-            if (PageAddItem(right, (Item) item, IndexTupleSize(item),
-                            rightoff, LP_USED) == InvalidOffsetNumber)
-                elog(ERROR, "rtdosplit: failed to copy index item in %s",
-                     RelationGetRelationName(r));
-            rightoff = OffsetNumberNext(rightoff);
-            spl_right++;        /* advance in right split vector */
-        }
-    }
+    newitemoff = OffsetNumberNext(maxoff);

     /* build an InsertIndexResult for this insertion */
     res = (InsertIndexResult) palloc(sizeof(InsertIndexResultData));

-    /* now insert the new index tuple */
-    if (*spl_left == maxoff + 1)
+    /*
+     * spl_left contains a list of the offset numbers of the
+     * tuples that will go to the left page.  For each offset
+     * number, get the tuple item, then add the item to the
+     * left page.  Similarly for the right side.
+     */
+
+    /* fill left node */
+    for (n = 0; n < v.spl_nleft; n++)
     {
-        if (PageAddItem(left, (Item) itup, IndexTupleSize(itup),
+        i = *spl_left;
+        if (i == newitemoff)
+            item = itup;
+        else
+        {
+            itemid = PageGetItemId(p, i);
+            item = (IndexTuple) PageGetItem(p, itemid);
+        }
+
+        if (PageAddItem(left, (Item) item, IndexTupleSize(item),
                         leftoff, LP_USED) == InvalidOffsetNumber)
             elog(ERROR, "rtdosplit: failed to add index item to %s",
                  RelationGetRelationName(r));
         leftoff = OffsetNumberNext(leftoff);
-        ItemPointerSet(&(res->pointerData), lbknum, leftoff);
-        spl_left++;
+
+        if (i == newitemoff)
+            ItemPointerSet(&(res->pointerData), lbknum, leftoff);
+
+        spl_left++;        /* advance in left split vector */
     }
-    else
+
+    /* fill right node */
+    for (n = 0; n < v.spl_nright; n++)
     {
-        Assert(*spl_right == maxoff + 1);
-        if (PageAddItem(right, (Item) itup, IndexTupleSize(itup),
+        i = *spl_right;
+        if (i == newitemoff)
+            item = itup;
+        else
+        {
+            itemid = PageGetItemId(p, i);
+            item = (IndexTuple) PageGetItem(p, itemid);
+        }
+
+        if (PageAddItem(right, (Item) item, IndexTupleSize(item),
                         rightoff, LP_USED) == InvalidOffsetNumber)
             elog(ERROR, "rtdosplit: failed to add index item to %s",
                  RelationGetRelationName(r));
         rightoff = OffsetNumberNext(rightoff);
-        ItemPointerSet(&(res->pointerData), rbknum, rightoff);
-        spl_right++;
+
+        if (i == newitemoff)
+            ItemPointerSet(&(res->pointerData), rbknum, rightoff);
+
+        spl_right++;        /* advance in right split vector */
     }

     /* Make sure we consumed all of the split vectors, and release 'em */
@@ -741,8 +765,10 @@
  * In addition, the item to be added (itup) is listed in the appropriate
  * vector.    It is represented by item number N+1 (N = # of items on page).
  *
- * Both vectors appear in sequence order with a terminating sentinel value
- * of InvalidOffsetNumber.
+ * Both vectors have a terminating sentinel value of InvalidOffsetNumber,
+ * but the sentinal value is no longer used, because the SPLITVEC
+ * vector also contains the length of each vector, and that information
+ * is now used to iterate over them in rtdosplit(). --kbb, 21 Sept 2001
  *
  * The bounding-box datums for the two new pages are also returned in *v.
  *
@@ -797,6 +823,12 @@
                 item_2_sz,
                 left_avail_space,
                 right_avail_space;
+    int            total_num_tuples,
+                num_tuples_without_seeds,
+                max_after_split; /* in Guttman's lingo, (M - m) */
+    float        diff; /* diff between cost of putting tuple left or right */
+    SPLITCOST   *cost_vector;
+    int            n;

     /*
      * First, make sure the new item is not so large that we can't
@@ -812,6 +844,9 @@
     maxoff = PageGetMaxOffsetNumber(page);
     newitemoff = OffsetNumberNext(maxoff);        /* phony index for new
                                                  * item */
+    total_num_tuples = newitemoff;
+    num_tuples_without_seeds = total_num_tuples - 2;
+    max_after_split = total_num_tuples / 2;        /* works for m = M/2 */

     /* Make arrays big enough for worst case, including sentinel */
     nbytes = (maxoff + 2) * sizeof(OffsetNumber);
@@ -909,47 +944,111 @@
     right_avail_space = RTPageAvailSpace - IndexTupleTotalSize(item_2);

     /*
-     * Now split up the regions between the two seeds.    An important
-     * property of this split algorithm is that the split vector v has the
-     * indices of items to be split in order in its left and right
-     * vectors.  We exploit this property by doing a merge in the code
-     * that actually splits the page.
+     * Now split up the regions between the two seeds.
+     *
+     * The cost_vector array will contain hints for determining where
+     * each tuple should go.  Each record in the array will contain
+     * a boolean, choose_left, that indicates which node the tuple
+     * prefers to be on, and the absolute difference in cost between
+     * putting the tuple in its favored node and in the other node.
      *
-     * For efficiency, we also place the new index tuple in this loop. This
-     * is handled at the very end, when we have placed all the existing
-     * tuples and i == maxoff + 1.
+     * Later, we will sort the cost_vector in descending order by cost
+     * difference, and consider the tuples in that order for
+     * placement.  That way, the tuples that *really* want to be in
+     * one node or the other get to choose first, and the tuples that
+     * don't really care choose last.
+     *
+     * First, build the cost_vector array.  The new index tuple will
+     * also be handled in this loop, and represented in the array,
+     * with i==newitemoff.
+     *
+     * In the case of variable size tuples it is possible that we only
+     * have the two seeds and no other tuples, in which case we don't
+     * do any of this cost_vector stuff.
      */
+
+    /* to keep compiler quiet */
+    cost_vector = (SPLITCOST *) NULL;
+
+    if (num_tuples_without_seeds > 0)
+    {
+        cost_vector =
+            (SPLITCOST *) palloc(num_tuples_without_seeds * sizeof(SPLITCOST));
+        n = 0;
+        for (i = FirstOffsetNumber; i <= newitemoff; i = OffsetNumberNext(i))
+        {
+            /* Compute new union datums and sizes for both choices */
+
+            if ((i == seed_1) || (i == seed_2))
+                continue;
+            else if (i == newitemoff)
+                item_1 = itup;
+            else
+                item_1 = (IndexTuple) PageGetItem(page, PageGetItemId(page, i));
+
+            datum_alpha = IndexTupleGetDatum(item_1);
+            union_dl = FunctionCall2(&rtstate->unionFn, datum_l, datum_alpha);
+            union_dr = FunctionCall2(&rtstate->unionFn, datum_r, datum_alpha);
+            FunctionCall2(&rtstate->sizeFn, union_dl,
+                          PointerGetDatum(&size_alpha));
+            FunctionCall2(&rtstate->sizeFn, union_dr,
+                          PointerGetDatum(&size_beta));
+
+            diff = (size_alpha - size_l) - (size_beta - size_r);
+
+            cost_vector[n].offset_number = i;
+            cost_vector[n].cost_differential = fabs(diff);
+            cost_vector[n].choose_left = (diff < 0);
+
+            n++;
+        }
+
+        /*
+         * Sort the array.  The function qsort_comp_splitcost is
+         * set up "backwards", to provided descending order.
+         */
+        qsort(cost_vector, num_tuples_without_seeds, sizeof(SPLITCOST),
+              &qsort_comp_splitcost);
+    }
+
+    /*
+     * Now make the final decisions about where each tuple will go,
+     * and build the vectors to return in the SPLITVEC record.
+     *
+     * The cost_vector array contains (descriptions of) all the
+     * tuples, in the order that we want to consider them, so we
+     * we just iterate through it and place each tuple in left
+     * or right nodes, according to the criteria described below.
+     */
+
     left = v->spl_left;
     v->spl_nleft = 0;
     right = v->spl_right;
     v->spl_nright = 0;

-    for (i = FirstOffsetNumber; i <= newitemoff; i = OffsetNumberNext(i))
+    /* Place the seeds first.
+     * left avail space, left union, right avail space, and right
+     * union have already been adjusted for the seeds.
+     */
+
+    *left++ = seed_1;
+    v->spl_nleft++;
+
+    *right++ = seed_2;
+    v->spl_nright++;
+
+    for (n = 0; n < num_tuples_without_seeds; n++)
     {
         bool        left_feasible,
                     right_feasible,
                     choose_left;

         /*
-         * If we've already decided where to place this item, just put it
-         * on the correct list.  Otherwise, we need to figure out which
-         * page needs the least enlargement in order to store the item.
+         * We need to figure out which page needs the least
+         * enlargement in order to store the item.
          */

-        if (i == seed_1)
-        {
-            *left++ = i;
-            v->spl_nleft++;
-            /* left avail_space & union already includes this one */
-            continue;
-        }
-        if (i == seed_2)
-        {
-            *right++ = i;
-            v->spl_nright++;
-            /* right avail_space & union already includes this one */
-            continue;
-        }
+        i = cost_vector[n].offset_number;

         /* Compute new union datums and sizes for both possible additions */
         if (i == newitemoff)
@@ -979,6 +1078,24 @@
          * (We know that all the old items together can fit on one page, so
          * we need not worry about any other problem than failing to fit
          * the new item.)
+         *
+         * Guttman's algorithm actually has two factors to consider (in
+         * order):  1. if one node has so many tuples already assigned to
+         * it that the other needs all the rest in order to satisfy the
+         * condition that neither node has fewer than m tuples, then
+         * that is decisive; 2. otherwise, choose the page that shows
+         * the smaller enlargement of its union area.
+         *
+         * I have chosen m = M/2, where M is the maximum number of
+         * tuples on a page.  (Actually, this is only strictly
+         * true for fixed size tuples.  For variable size tuples,
+         * there still might have to be only one tuple on a page,
+         * if it is really big.  But even with variable size
+         * tuples we still try to get m as close as possible to M/2.)
+         *
+         * The question of which page shows the smaller enlargement of
+         * its union area has already been answered, and the answer
+         * stored in the choose_left field of the SPLITCOST record.
          */
         left_feasible = (left_avail_space >= item_1_sz &&
                          ((left_avail_space - item_1_sz) >= newitemsz ||
@@ -988,8 +1105,18 @@
                            left_avail_space >= newitemsz));
         if (left_feasible && right_feasible)
         {
-            /* Both feasible, use Guttman's algorithm */
-            choose_left = (size_alpha - size_l < size_beta - size_r);
+            /*
+             * Both feasible, use Guttman's algorithm.
+             * First check the m condition described above, and if
+             * that doesn't apply, choose the page with the smaller
+             * enlargement of its union area.
+             */
+            if (v->spl_nleft > max_after_split)
+                choose_left = false;
+            else if (v->spl_nright > max_after_split)
+                choose_left = true;
+            else
+                choose_left = cost_vector[n].choose_left;
         }
         else if (left_feasible)
             choose_left = true;
@@ -1023,6 +1150,11 @@
         }
     }

+    if (num_tuples_without_seeds > 0)
+    {
+        pfree(cost_vector);
+    }
+
     *left = *right = InvalidOffsetNumber;        /* add ending sentinels */

     v->spl_ldatum = datum_l;
@@ -1152,6 +1284,21 @@
     fmgr_info(size_proc, &rtstate->sizeFn);
     fmgr_info(inter_proc, &rtstate->interFn);
     return;
+}
+
+/* for sorting SPLITCOST records in descending order */
+static int
+qsort_comp_splitcost(const void *a, const void *b)
+{
+    float diff =
+        ((SPLITCOST *)a)->cost_differential -
+        ((SPLITCOST *)b)->cost_differential;
+    if (diff < 0)
+        return 1;
+    else if (diff > 0)
+        return -1;
+    else
+        return 0;
 }

 #ifdef RTDEBUG

Re: rtree improvements

From

Bruce Momjian

Date:

26 September 2001, 15:30:23

Your patch has been added to the PostgreSQL unapplied patches list at:

    http://candle.pha.pa.us/cgi-bin/pgpatches

I will try to apply it within the next 48 hours.


> I have made three changes to the rtree code: one bug fix and
> two performance improvements.  I put an explanation of the
> changes at
>
> http://cs1.cs.nyu.edu/been/postgres-rtree.html
>
> The performance improvements are quite significant.
>
> All the changes are in the file src/backend/access/rtree/rtree.c
>
> I was working with the 7.1.3 code.
>
> I'm including the diff output as an attachment.
>
> Ken
>

> diff -Naur postgresql-7.1.3.old/src/backend/access/rtree/rtree.c postgresql-7.1.3/src/backend/access/rtree/rtree.c
> --- postgresql-7.1.3.old/src/backend/access/rtree/rtree.c    Wed Mar 21 22:59:16 2001
> +++ postgresql-7.1.3/src/backend/access/rtree/rtree.c    Tue Sep 25 13:56:31 2001
> @@ -55,6 +55,14 @@
>      Datum        spl_rdatum;
>  } SPLITVEC;
>
> +/* for sorting tuples by cost, for picking split */
> +typedef struct SPLITCOST
> +{
> +    OffsetNumber    offset_number;
> +    float            cost_differential;
> +    bool            choose_left;
> +} SPLITCOST;
> +
>  typedef struct RTSTATE
>  {
>      FmgrInfo    unionFn;        /* union function */
> @@ -79,6 +87,7 @@
>         RTSTATE *rtstate);
>  static int    nospace(Page p, IndexTuple it);
>  static void initRtstate(RTSTATE *rtstate, Relation index);
> +static int qsort_comp_splitcost(const void *a, const void *b);
>
>
>  Datum
> @@ -427,7 +436,12 @@
>      FunctionCall2(&rtstate->sizeFn, datum,
>                    PointerGetDatum(&newd_size));
>
> -    if (newd_size != old_size)
> +    /*
> +     * If newd_size == 0 we have degenerate rectangles, so we
> +     * don't know if there was any change, so we have to
> +     * assume there was.
> +     */
> +    if ((newd_size == 0) || (newd_size != old_size))
>      {
>          TupleDesc    td = RelationGetDescr(r);
>
> @@ -503,6 +517,8 @@
>      OffsetNumber *spl_left,
>                 *spl_right;
>      TupleDesc    tupDesc;
> +    int            n;
> +    OffsetNumber newitemoff;
>
>      p = (Page) BufferGetPage(buffer);
>      opaque = (RTreePageOpaque) PageGetSpecialPointer(p);
> @@ -539,56 +555,64 @@
>      spl_right = v.spl_right;
>      leftoff = rightoff = FirstOffsetNumber;
>      maxoff = PageGetMaxOffsetNumber(p);
> -    for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
> -    {
> -        itemid = PageGetItemId(p, i);
> -        item = (IndexTuple) PageGetItem(p, itemid);
> -
> -        if (i == *spl_left)
> -        {
> -            if (PageAddItem(left, (Item) item, IndexTupleSize(item),
> -                            leftoff, LP_USED) == InvalidOffsetNumber)
> -                elog(ERROR, "rtdosplit: failed to copy index item in %s",
> -                     RelationGetRelationName(r));
> -            leftoff = OffsetNumberNext(leftoff);
> -            spl_left++;            /* advance in left split vector */
> -        }
> -        else
> -        {
> -            Assert(i == *spl_right);
> -            if (PageAddItem(right, (Item) item, IndexTupleSize(item),
> -                            rightoff, LP_USED) == InvalidOffsetNumber)
> -                elog(ERROR, "rtdosplit: failed to copy index item in %s",
> -                     RelationGetRelationName(r));
> -            rightoff = OffsetNumberNext(rightoff);
> -            spl_right++;        /* advance in right split vector */
> -        }
> -    }
> +    newitemoff = OffsetNumberNext(maxoff);
>
>      /* build an InsertIndexResult for this insertion */
>      res = (InsertIndexResult) palloc(sizeof(InsertIndexResultData));
>
> -    /* now insert the new index tuple */
> -    if (*spl_left == maxoff + 1)
> +    /*
> +     * spl_left contains a list of the offset numbers of the
> +     * tuples that will go to the left page.  For each offset
> +     * number, get the tuple item, then add the item to the
> +     * left page.  Similarly for the right side.
> +     */
> +
> +    /* fill left node */
> +    for (n = 0; n < v.spl_nleft; n++)
>      {
> -        if (PageAddItem(left, (Item) itup, IndexTupleSize(itup),
> +        i = *spl_left;
> +        if (i == newitemoff)
> +            item = itup;
> +        else
> +        {
> +            itemid = PageGetItemId(p, i);
> +            item = (IndexTuple) PageGetItem(p, itemid);
> +        }
> +
> +        if (PageAddItem(left, (Item) item, IndexTupleSize(item),
>                          leftoff, LP_USED) == InvalidOffsetNumber)
>              elog(ERROR, "rtdosplit: failed to add index item to %s",
>                   RelationGetRelationName(r));
>          leftoff = OffsetNumberNext(leftoff);
> -        ItemPointerSet(&(res->pointerData), lbknum, leftoff);
> -        spl_left++;
> +
> +        if (i == newitemoff)
> +            ItemPointerSet(&(res->pointerData), lbknum, leftoff);
> +
> +        spl_left++;        /* advance in left split vector */
>      }
> -    else
> +
> +    /* fill right node */
> +    for (n = 0; n < v.spl_nright; n++)
>      {
> -        Assert(*spl_right == maxoff + 1);
> -        if (PageAddItem(right, (Item) itup, IndexTupleSize(itup),
> +        i = *spl_right;
> +        if (i == newitemoff)
> +            item = itup;
> +        else
> +        {
> +            itemid = PageGetItemId(p, i);
> +            item = (IndexTuple) PageGetItem(p, itemid);
> +        }
> +
> +        if (PageAddItem(right, (Item) item, IndexTupleSize(item),
>                          rightoff, LP_USED) == InvalidOffsetNumber)
>              elog(ERROR, "rtdosplit: failed to add index item to %s",
>                   RelationGetRelationName(r));
>          rightoff = OffsetNumberNext(rightoff);
> -        ItemPointerSet(&(res->pointerData), rbknum, rightoff);
> -        spl_right++;
> +
> +        if (i == newitemoff)
> +            ItemPointerSet(&(res->pointerData), rbknum, rightoff);
> +
> +        spl_right++;        /* advance in right split vector */
>      }
>
>      /* Make sure we consumed all of the split vectors, and release 'em */
> @@ -741,8 +765,10 @@
>   * In addition, the item to be added (itup) is listed in the appropriate
>   * vector.    It is represented by item number N+1 (N = # of items on page).
>   *
> - * Both vectors appear in sequence order with a terminating sentinel value
> - * of InvalidOffsetNumber.
> + * Both vectors have a terminating sentinel value of InvalidOffsetNumber,
> + * but the sentinal value is no longer used, because the SPLITVEC
> + * vector also contains the length of each vector, and that information
> + * is now used to iterate over them in rtdosplit(). --kbb, 21 Sept 2001
>   *
>   * The bounding-box datums for the two new pages are also returned in *v.
>   *
> @@ -797,6 +823,12 @@
>                  item_2_sz,
>                  left_avail_space,
>                  right_avail_space;
> +    int            total_num_tuples,
> +                num_tuples_without_seeds,
> +                max_after_split; /* in Guttman's lingo, (M - m) */
> +    float        diff; /* diff between cost of putting tuple left or right */
> +    SPLITCOST   *cost_vector;
> +    int            n;
>
>      /*
>       * First, make sure the new item is not so large that we can't
> @@ -812,6 +844,9 @@
>      maxoff = PageGetMaxOffsetNumber(page);
>      newitemoff = OffsetNumberNext(maxoff);        /* phony index for new
>                                                   * item */
> +    total_num_tuples = newitemoff;
> +    num_tuples_without_seeds = total_num_tuples - 2;
> +    max_after_split = total_num_tuples / 2;        /* works for m = M/2 */
>
>      /* Make arrays big enough for worst case, including sentinel */
>      nbytes = (maxoff + 2) * sizeof(OffsetNumber);
> @@ -909,47 +944,111 @@
>      right_avail_space = RTPageAvailSpace - IndexTupleTotalSize(item_2);
>
>      /*
> -     * Now split up the regions between the two seeds.    An important
> -     * property of this split algorithm is that the split vector v has the
> -     * indices of items to be split in order in its left and right
> -     * vectors.  We exploit this property by doing a merge in the code
> -     * that actually splits the page.
> +     * Now split up the regions between the two seeds.
> +     *
> +     * The cost_vector array will contain hints for determining where
> +     * each tuple should go.  Each record in the array will contain
> +     * a boolean, choose_left, that indicates which node the tuple
> +     * prefers to be on, and the absolute difference in cost between
> +     * putting the tuple in its favored node and in the other node.
>       *
> -     * For efficiency, we also place the new index tuple in this loop. This
> -     * is handled at the very end, when we have placed all the existing
> -     * tuples and i == maxoff + 1.
> +     * Later, we will sort the cost_vector in descending order by cost
> +     * difference, and consider the tuples in that order for
> +     * placement.  That way, the tuples that *really* want to be in
> +     * one node or the other get to choose first, and the tuples that
> +     * don't really care choose last.
> +     *
> +     * First, build the cost_vector array.  The new index tuple will
> +     * also be handled in this loop, and represented in the array,
> +     * with i==newitemoff.
> +     *
> +     * In the case of variable size tuples it is possible that we only
> +     * have the two seeds and no other tuples, in which case we don't
> +     * do any of this cost_vector stuff.
>       */
> +
> +    /* to keep compiler quiet */
> +    cost_vector = (SPLITCOST *) NULL;
> +
> +    if (num_tuples_without_seeds > 0)
> +    {
> +        cost_vector =
> +            (SPLITCOST *) palloc(num_tuples_without_seeds * sizeof(SPLITCOST));
> +        n = 0;
> +        for (i = FirstOffsetNumber; i <= newitemoff; i = OffsetNumberNext(i))
> +        {
> +            /* Compute new union datums and sizes for both choices */
> +
> +            if ((i == seed_1) || (i == seed_2))
> +                continue;
> +            else if (i == newitemoff)
> +                item_1 = itup;
> +            else
> +                item_1 = (IndexTuple) PageGetItem(page, PageGetItemId(page, i));
> +
> +            datum_alpha = IndexTupleGetDatum(item_1);
> +            union_dl = FunctionCall2(&rtstate->unionFn, datum_l, datum_alpha);
> +            union_dr = FunctionCall2(&rtstate->unionFn, datum_r, datum_alpha);
> +            FunctionCall2(&rtstate->sizeFn, union_dl,
> +                          PointerGetDatum(&size_alpha));
> +            FunctionCall2(&rtstate->sizeFn, union_dr,
> +                          PointerGetDatum(&size_beta));
> +
> +            diff = (size_alpha - size_l) - (size_beta - size_r);
> +
> +            cost_vector[n].offset_number = i;
> +            cost_vector[n].cost_differential = fabs(diff);
> +            cost_vector[n].choose_left = (diff < 0);
> +
> +            n++;
> +        }
> +
> +        /*
> +         * Sort the array.  The function qsort_comp_splitcost is
> +         * set up "backwards", to provided descending order.
> +         */
> +        qsort(cost_vector, num_tuples_without_seeds, sizeof(SPLITCOST),
> +              &qsort_comp_splitcost);
> +    }
> +
> +    /*
> +     * Now make the final decisions about where each tuple will go,
> +     * and build the vectors to return in the SPLITVEC record.
> +     *
> +     * The cost_vector array contains (descriptions of) all the
> +     * tuples, in the order that we want to consider them, so we
> +     * we just iterate through it and place each tuple in left
> +     * or right nodes, according to the criteria described below.
> +     */
> +
>      left = v->spl_left;
>      v->spl_nleft = 0;
>      right = v->spl_right;
>      v->spl_nright = 0;
>
> -    for (i = FirstOffsetNumber; i <= newitemoff; i = OffsetNumberNext(i))
> +    /* Place the seeds first.
> +     * left avail space, left union, right avail space, and right
> +     * union have already been adjusted for the seeds.
> +     */
> +
> +    *left++ = seed_1;
> +    v->spl_nleft++;
> +
> +    *right++ = seed_2;
> +    v->spl_nright++;
> +
> +    for (n = 0; n < num_tuples_without_seeds; n++)
>      {
>          bool        left_feasible,
>                      right_feasible,
>                      choose_left;
>
>          /*
> -         * If we've already decided where to place this item, just put it
> -         * on the correct list.  Otherwise, we need to figure out which
> -         * page needs the least enlargement in order to store the item.
> +         * We need to figure out which page needs the least
> +         * enlargement in order to store the item.
>           */
>
> -        if (i == seed_1)
> -        {
> -            *left++ = i;
> -            v->spl_nleft++;
> -            /* left avail_space & union already includes this one */
> -            continue;
> -        }
> -        if (i == seed_2)
> -        {
> -            *right++ = i;
> -            v->spl_nright++;
> -            /* right avail_space & union already includes this one */
> -            continue;
> -        }
> +        i = cost_vector[n].offset_number;
>
>          /* Compute new union datums and sizes for both possible additions */
>          if (i == newitemoff)
> @@ -979,6 +1078,24 @@
>           * (We know that all the old items together can fit on one page, so
>           * we need not worry about any other problem than failing to fit
>           * the new item.)
> +         *
> +         * Guttman's algorithm actually has two factors to consider (in
> +         * order):  1. if one node has so many tuples already assigned to
> +         * it that the other needs all the rest in order to satisfy the
> +         * condition that neither node has fewer than m tuples, then
> +         * that is decisive; 2. otherwise, choose the page that shows
> +         * the smaller enlargement of its union area.
> +         *
> +         * I have chosen m = M/2, where M is the maximum number of
> +         * tuples on a page.  (Actually, this is only strictly
> +         * true for fixed size tuples.  For variable size tuples,
> +         * there still might have to be only one tuple on a page,
> +         * if it is really big.  But even with variable size
> +         * tuples we still try to get m as close as possible to M/2.)
> +         *
> +         * The question of which page shows the smaller enlargement of
> +         * its union area has already been answered, and the answer
> +         * stored in the choose_left field of the SPLITCOST record.
>           */
>          left_feasible = (left_avail_space >= item_1_sz &&
>                           ((left_avail_space - item_1_sz) >= newitemsz ||
> @@ -988,8 +1105,18 @@
>                             left_avail_space >= newitemsz));
>          if (left_feasible && right_feasible)
>          {
> -            /* Both feasible, use Guttman's algorithm */
> -            choose_left = (size_alpha - size_l < size_beta - size_r);
> +            /*
> +             * Both feasible, use Guttman's algorithm.
> +             * First check the m condition described above, and if
> +             * that doesn't apply, choose the page with the smaller
> +             * enlargement of its union area.
> +             */
> +            if (v->spl_nleft > max_after_split)
> +                choose_left = false;
> +            else if (v->spl_nright > max_after_split)
> +                choose_left = true;
> +            else
> +                choose_left = cost_vector[n].choose_left;
>          }
>          else if (left_feasible)
>              choose_left = true;
> @@ -1023,6 +1150,11 @@
>          }
>      }
>
> +    if (num_tuples_without_seeds > 0)
> +    {
> +        pfree(cost_vector);
> +    }
> +
>      *left = *right = InvalidOffsetNumber;        /* add ending sentinels */
>
>      v->spl_ldatum = datum_l;
> @@ -1152,6 +1284,21 @@
>      fmgr_info(size_proc, &rtstate->sizeFn);
>      fmgr_info(inter_proc, &rtstate->interFn);
>      return;
> +}
> +
> +/* for sorting SPLITCOST records in descending order */
> +static int
> +qsort_comp_splitcost(const void *a, const void *b)
> +{
> +    float diff =
> +        ((SPLITCOST *)a)->cost_differential -
> +        ((SPLITCOST *)b)->cost_differential;
> +    if (diff < 0)
> +        return 1;
> +    else if (diff > 0)
> +        return -1;
> +    else
> +        return 0;
>  }
>
>  #ifdef RTDEBUG

>
> ---------------------------(end of broadcast)---------------------------
> TIP 6: Have you searched our list archives?
>
> http://archives.postgresql.org

--
  Bruce Momjian                        |  http://candle.pha.pa.us
  pgman@candle.pha.pa.us               |  (610) 853-3000
  +  If your life is a hard drive,     |  830 Blythe Avenue
  +  Christ can be your backup.        |  Drexel Hill, Pennsylvania 19026

Re: rtree improvements

From

Bruce Momjian

Date:

29 September 2001, 04:37:59

Patch applied.  Thanks.

> I have made three changes to the rtree code: one bug fix and
> two performance improvements.  I put an explanation of the
> changes at
>
> http://cs1.cs.nyu.edu/been/postgres-rtree.html
>
> The performance improvements are quite significant.
>
> All the changes are in the file src/backend/access/rtree/rtree.c
>
> I was working with the 7.1.3 code.
>
> I'm including the diff output as an attachment.
>
> Ken
>

> diff -Naur postgresql-7.1.3.old/src/backend/access/rtree/rtree.c postgresql-7.1.3/src/backend/access/rtree/rtree.c
> --- postgresql-7.1.3.old/src/backend/access/rtree/rtree.c    Wed Mar 21 22:59:16 2001
> +++ postgresql-7.1.3/src/backend/access/rtree/rtree.c    Tue Sep 25 13:56:31 2001
> @@ -55,6 +55,14 @@
>      Datum        spl_rdatum;
>  } SPLITVEC;
>
> +/* for sorting tuples by cost, for picking split */
> +typedef struct SPLITCOST
> +{
> +    OffsetNumber    offset_number;
> +    float            cost_differential;
> +    bool            choose_left;
> +} SPLITCOST;
> +
>  typedef struct RTSTATE
>  {
>      FmgrInfo    unionFn;        /* union function */
> @@ -79,6 +87,7 @@
>         RTSTATE *rtstate);
>  static int    nospace(Page p, IndexTuple it);
>  static void initRtstate(RTSTATE *rtstate, Relation index);
> +static int qsort_comp_splitcost(const void *a, const void *b);
>
>
>  Datum
> @@ -427,7 +436,12 @@
>      FunctionCall2(&rtstate->sizeFn, datum,
>                    PointerGetDatum(&newd_size));
>
> -    if (newd_size != old_size)
> +    /*
> +     * If newd_size == 0 we have degenerate rectangles, so we
> +     * don't know if there was any change, so we have to
> +     * assume there was.
> +     */
> +    if ((newd_size == 0) || (newd_size != old_size))
>      {
>          TupleDesc    td = RelationGetDescr(r);
>
> @@ -503,6 +517,8 @@
>      OffsetNumber *spl_left,
>                 *spl_right;
>      TupleDesc    tupDesc;
> +    int            n;
> +    OffsetNumber newitemoff;
>
>      p = (Page) BufferGetPage(buffer);
>      opaque = (RTreePageOpaque) PageGetSpecialPointer(p);
> @@ -539,56 +555,64 @@
>      spl_right = v.spl_right;
>      leftoff = rightoff = FirstOffsetNumber;
>      maxoff = PageGetMaxOffsetNumber(p);
> -    for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
> -    {
> -        itemid = PageGetItemId(p, i);
> -        item = (IndexTuple) PageGetItem(p, itemid);
> -
> -        if (i == *spl_left)
> -        {
> -            if (PageAddItem(left, (Item) item, IndexTupleSize(item),
> -                            leftoff, LP_USED) == InvalidOffsetNumber)
> -                elog(ERROR, "rtdosplit: failed to copy index item in %s",
> -                     RelationGetRelationName(r));
> -            leftoff = OffsetNumberNext(leftoff);
> -            spl_left++;            /* advance in left split vector */
> -        }
> -        else
> -        {
> -            Assert(i == *spl_right);
> -            if (PageAddItem(right, (Item) item, IndexTupleSize(item),
> -                            rightoff, LP_USED) == InvalidOffsetNumber)
> -                elog(ERROR, "rtdosplit: failed to copy index item in %s",
> -                     RelationGetRelationName(r));
> -            rightoff = OffsetNumberNext(rightoff);
> -            spl_right++;        /* advance in right split vector */
> -        }
> -    }
> +    newitemoff = OffsetNumberNext(maxoff);
>
>      /* build an InsertIndexResult for this insertion */
>      res = (InsertIndexResult) palloc(sizeof(InsertIndexResultData));
>
> -    /* now insert the new index tuple */
> -    if (*spl_left == maxoff + 1)
> +    /*
> +     * spl_left contains a list of the offset numbers of the
> +     * tuples that will go to the left page.  For each offset
> +     * number, get the tuple item, then add the item to the
> +     * left page.  Similarly for the right side.
> +     */
> +
> +    /* fill left node */
> +    for (n = 0; n < v.spl_nleft; n++)
>      {
> -        if (PageAddItem(left, (Item) itup, IndexTupleSize(itup),
> +        i = *spl_left;
> +        if (i == newitemoff)
> +            item = itup;
> +        else
> +        {
> +            itemid = PageGetItemId(p, i);
> +            item = (IndexTuple) PageGetItem(p, itemid);
> +        }
> +
> +        if (PageAddItem(left, (Item) item, IndexTupleSize(item),
>                          leftoff, LP_USED) == InvalidOffsetNumber)
>              elog(ERROR, "rtdosplit: failed to add index item to %s",
>                   RelationGetRelationName(r));
>          leftoff = OffsetNumberNext(leftoff);
> -        ItemPointerSet(&(res->pointerData), lbknum, leftoff);
> -        spl_left++;
> +
> +        if (i == newitemoff)
> +            ItemPointerSet(&(res->pointerData), lbknum, leftoff);
> +
> +        spl_left++;        /* advance in left split vector */
>      }
> -    else
> +
> +    /* fill right node */
> +    for (n = 0; n < v.spl_nright; n++)
>      {
> -        Assert(*spl_right == maxoff + 1);
> -        if (PageAddItem(right, (Item) itup, IndexTupleSize(itup),
> +        i = *spl_right;
> +        if (i == newitemoff)
> +            item = itup;
> +        else
> +        {
> +            itemid = PageGetItemId(p, i);
> +            item = (IndexTuple) PageGetItem(p, itemid);
> +        }
> +
> +        if (PageAddItem(right, (Item) item, IndexTupleSize(item),
>                          rightoff, LP_USED) == InvalidOffsetNumber)
>              elog(ERROR, "rtdosplit: failed to add index item to %s",
>                   RelationGetRelationName(r));
>          rightoff = OffsetNumberNext(rightoff);
> -        ItemPointerSet(&(res->pointerData), rbknum, rightoff);
> -        spl_right++;
> +
> +        if (i == newitemoff)
> +            ItemPointerSet(&(res->pointerData), rbknum, rightoff);
> +
> +        spl_right++;        /* advance in right split vector */
>      }
>
>      /* Make sure we consumed all of the split vectors, and release 'em */
> @@ -741,8 +765,10 @@
>   * In addition, the item to be added (itup) is listed in the appropriate
>   * vector.    It is represented by item number N+1 (N = # of items on page).
>   *
> - * Both vectors appear in sequence order with a terminating sentinel value
> - * of InvalidOffsetNumber.
> + * Both vectors have a terminating sentinel value of InvalidOffsetNumber,
> + * but the sentinal value is no longer used, because the SPLITVEC
> + * vector also contains the length of each vector, and that information
> + * is now used to iterate over them in rtdosplit(). --kbb, 21 Sept 2001
>   *
>   * The bounding-box datums for the two new pages are also returned in *v.
>   *
> @@ -797,6 +823,12 @@
>                  item_2_sz,
>                  left_avail_space,
>                  right_avail_space;
> +    int            total_num_tuples,
> +                num_tuples_without_seeds,
> +                max_after_split; /* in Guttman's lingo, (M - m) */
> +    float        diff; /* diff between cost of putting tuple left or right */
> +    SPLITCOST   *cost_vector;
> +    int            n;
>
>      /*
>       * First, make sure the new item is not so large that we can't
> @@ -812,6 +844,9 @@
>      maxoff = PageGetMaxOffsetNumber(page);
>      newitemoff = OffsetNumberNext(maxoff);        /* phony index for new
>                                                   * item */
> +    total_num_tuples = newitemoff;
> +    num_tuples_without_seeds = total_num_tuples - 2;
> +    max_after_split = total_num_tuples / 2;        /* works for m = M/2 */
>
>      /* Make arrays big enough for worst case, including sentinel */
>      nbytes = (maxoff + 2) * sizeof(OffsetNumber);
> @@ -909,47 +944,111 @@
>      right_avail_space = RTPageAvailSpace - IndexTupleTotalSize(item_2);
>
>      /*
> -     * Now split up the regions between the two seeds.    An important
> -     * property of this split algorithm is that the split vector v has the
> -     * indices of items to be split in order in its left and right
> -     * vectors.  We exploit this property by doing a merge in the code
> -     * that actually splits the page.
> +     * Now split up the regions between the two seeds.
> +     *
> +     * The cost_vector array will contain hints for determining where
> +     * each tuple should go.  Each record in the array will contain
> +     * a boolean, choose_left, that indicates which node the tuple
> +     * prefers to be on, and the absolute difference in cost between
> +     * putting the tuple in its favored node and in the other node.
>       *
> -     * For efficiency, we also place the new index tuple in this loop. This
> -     * is handled at the very end, when we have placed all the existing
> -     * tuples and i == maxoff + 1.
> +     * Later, we will sort the cost_vector in descending order by cost
> +     * difference, and consider the tuples in that order for
> +     * placement.  That way, the tuples that *really* want to be in
> +     * one node or the other get to choose first, and the tuples that
> +     * don't really care choose last.
> +     *
> +     * First, build the cost_vector array.  The new index tuple will
> +     * also be handled in this loop, and represented in the array,
> +     * with i==newitemoff.
> +     *
> +     * In the case of variable size tuples it is possible that we only
> +     * have the two seeds and no other tuples, in which case we don't
> +     * do any of this cost_vector stuff.
>       */
> +
> +    /* to keep compiler quiet */
> +    cost_vector = (SPLITCOST *) NULL;
> +
> +    if (num_tuples_without_seeds > 0)
> +    {
> +        cost_vector =
> +            (SPLITCOST *) palloc(num_tuples_without_seeds * sizeof(SPLITCOST));
> +        n = 0;
> +        for (i = FirstOffsetNumber; i <= newitemoff; i = OffsetNumberNext(i))
> +        {
> +            /* Compute new union datums and sizes for both choices */
> +
> +            if ((i == seed_1) || (i == seed_2))
> +                continue;
> +            else if (i == newitemoff)
> +                item_1 = itup;
> +            else
> +                item_1 = (IndexTuple) PageGetItem(page, PageGetItemId(page, i));
> +
> +            datum_alpha = IndexTupleGetDatum(item_1);
> +            union_dl = FunctionCall2(&rtstate->unionFn, datum_l, datum_alpha);
> +            union_dr = FunctionCall2(&rtstate->unionFn, datum_r, datum_alpha);
> +            FunctionCall2(&rtstate->sizeFn, union_dl,
> +                          PointerGetDatum(&size_alpha));
> +            FunctionCall2(&rtstate->sizeFn, union_dr,
> +                          PointerGetDatum(&size_beta));
> +
> +            diff = (size_alpha - size_l) - (size_beta - size_r);
> +
> +            cost_vector[n].offset_number = i;
> +            cost_vector[n].cost_differential = fabs(diff);
> +            cost_vector[n].choose_left = (diff < 0);
> +
> +            n++;
> +        }
> +
> +        /*
> +         * Sort the array.  The function qsort_comp_splitcost is
> +         * set up "backwards", to provided descending order.
> +         */
> +        qsort(cost_vector, num_tuples_without_seeds, sizeof(SPLITCOST),
> +              &qsort_comp_splitcost);
> +    }
> +
> +    /*
> +     * Now make the final decisions about where each tuple will go,
> +     * and build the vectors to return in the SPLITVEC record.
> +     *
> +     * The cost_vector array contains (descriptions of) all the
> +     * tuples, in the order that we want to consider them, so we
> +     * we just iterate through it and place each tuple in left
> +     * or right nodes, according to the criteria described below.
> +     */
> +
>      left = v->spl_left;
>      v->spl_nleft = 0;
>      right = v->spl_right;
>      v->spl_nright = 0;
>
> -    for (i = FirstOffsetNumber; i <= newitemoff; i = OffsetNumberNext(i))
> +    /* Place the seeds first.
> +     * left avail space, left union, right avail space, and right
> +     * union have already been adjusted for the seeds.
> +     */
> +
> +    *left++ = seed_1;
> +    v->spl_nleft++;
> +
> +    *right++ = seed_2;
> +    v->spl_nright++;
> +
> +    for (n = 0; n < num_tuples_without_seeds; n++)
>      {
>          bool        left_feasible,
>                      right_feasible,
>                      choose_left;
>
>          /*
> -         * If we've already decided where to place this item, just put it
> -         * on the correct list.  Otherwise, we need to figure out which
> -         * page needs the least enlargement in order to store the item.
> +         * We need to figure out which page needs the least
> +         * enlargement in order to store the item.
>           */
>
> -        if (i == seed_1)
> -        {
> -            *left++ = i;
> -            v->spl_nleft++;
> -            /* left avail_space & union already includes this one */
> -            continue;
> -        }
> -        if (i == seed_2)
> -        {
> -            *right++ = i;
> -            v->spl_nright++;
> -            /* right avail_space & union already includes this one */
> -            continue;
> -        }
> +        i = cost_vector[n].offset_number;
>
>          /* Compute new union datums and sizes for both possible additions */
>          if (i == newitemoff)
> @@ -979,6 +1078,24 @@
>           * (We know that all the old items together can fit on one page, so
>           * we need not worry about any other problem than failing to fit
>           * the new item.)
> +         *
> +         * Guttman's algorithm actually has two factors to consider (in
> +         * order):  1. if one node has so many tuples already assigned to
> +         * it that the other needs all the rest in order to satisfy the
> +         * condition that neither node has fewer than m tuples, then
> +         * that is decisive; 2. otherwise, choose the page that shows
> +         * the smaller enlargement of its union area.
> +         *
> +         * I have chosen m = M/2, where M is the maximum number of
> +         * tuples on a page.  (Actually, this is only strictly
> +         * true for fixed size tuples.  For variable size tuples,
> +         * there still might have to be only one tuple on a page,
> +         * if it is really big.  But even with variable size
> +         * tuples we still try to get m as close as possible to M/2.)
> +         *
> +         * The question of which page shows the smaller enlargement of
> +         * its union area has already been answered, and the answer
> +         * stored in the choose_left field of the SPLITCOST record.
>           */
>          left_feasible = (left_avail_space >= item_1_sz &&
>                           ((left_avail_space - item_1_sz) >= newitemsz ||
> @@ -988,8 +1105,18 @@
>                             left_avail_space >= newitemsz));
>          if (left_feasible && right_feasible)
>          {
> -            /* Both feasible, use Guttman's algorithm */
> -            choose_left = (size_alpha - size_l < size_beta - size_r);
> +            /*
> +             * Both feasible, use Guttman's algorithm.
> +             * First check the m condition described above, and if
> +             * that doesn't apply, choose the page with the smaller
> +             * enlargement of its union area.
> +             */
> +            if (v->spl_nleft > max_after_split)
> +                choose_left = false;
> +            else if (v->spl_nright > max_after_split)
> +                choose_left = true;
> +            else
> +                choose_left = cost_vector[n].choose_left;
>          }
>          else if (left_feasible)
>              choose_left = true;
> @@ -1023,6 +1150,11 @@
>          }
>      }
>
> +    if (num_tuples_without_seeds > 0)
> +    {
> +        pfree(cost_vector);
> +    }
> +
>      *left = *right = InvalidOffsetNumber;        /* add ending sentinels */
>
>      v->spl_ldatum = datum_l;
> @@ -1152,6 +1284,21 @@
>      fmgr_info(size_proc, &rtstate->sizeFn);
>      fmgr_info(inter_proc, &rtstate->interFn);
>      return;
> +}
> +
> +/* for sorting SPLITCOST records in descending order */
> +static int
> +qsort_comp_splitcost(const void *a, const void *b)
> +{
> +    float diff =
> +        ((SPLITCOST *)a)->cost_differential -
> +        ((SPLITCOST *)b)->cost_differential;
> +    if (diff < 0)
> +        return 1;
> +    else if (diff > 0)
> +        return -1;
> +    else
> +        return 0;
>  }
>
>  #ifdef RTDEBUG

>
> ---------------------------(end of broadcast)---------------------------
> TIP 6: Have you searched our list archives?
>
> http://archives.postgresql.org

--
  Bruce Momjian                        |  http://candle.pha.pa.us
  pgman@candle.pha.pa.us               |  (610) 853-3000
  +  If your life is a hard drive,     |  830 Blythe Avenue
  +  Christ can be your backup.        |  Drexel Hill, Pennsylvania 19026