Thread: rtree improvements
I have made three changes to the rtree code: one bug fix and two performance improvements. I put an explanation of the changes at http://cs1.cs.nyu.edu/been/postgres-rtree.html The performance improvements are quite significant. All the changes are in the file src/backend/access/rtree/rtree.c I was working with the 7.1.3 code. I'm including the diff output as an attachment. Ken diff -Naur postgresql-7.1.3.old/src/backend/access/rtree/rtree.c postgresql-7.1.3/src/backend/access/rtree/rtree.c --- postgresql-7.1.3.old/src/backend/access/rtree/rtree.c Wed Mar 21 22:59:16 2001 +++ postgresql-7.1.3/src/backend/access/rtree/rtree.c Tue Sep 25 13:56:31 2001 @@ -55,6 +55,14 @@ Datum spl_rdatum; } SPLITVEC; +/* for sorting tuples by cost, for picking split */ +typedef struct SPLITCOST +{ + OffsetNumber offset_number; + float cost_differential; + bool choose_left; +} SPLITCOST; + typedef struct RTSTATE { FmgrInfo unionFn; /* union function */ @@ -79,6 +87,7 @@ RTSTATE *rtstate); static int nospace(Page p, IndexTuple it); static void initRtstate(RTSTATE *rtstate, Relation index); +static int qsort_comp_splitcost(const void *a, const void *b); Datum @@ -427,7 +436,12 @@ FunctionCall2(&rtstate->sizeFn, datum, PointerGetDatum(&newd_size)); - if (newd_size != old_size) + /* + * If newd_size == 0 we have degenerate rectangles, so we + * don't know if there was any change, so we have to + * assume there was. + */ + if ((newd_size == 0) || (newd_size != old_size)) { TupleDesc td = RelationGetDescr(r); @@ -503,6 +517,8 @@ OffsetNumber *spl_left, *spl_right; TupleDesc tupDesc; + int n; + OffsetNumber newitemoff; p = (Page) BufferGetPage(buffer); opaque = (RTreePageOpaque) PageGetSpecialPointer(p); @@ -539,56 +555,64 @@ spl_right = v.spl_right; leftoff = rightoff = FirstOffsetNumber; maxoff = PageGetMaxOffsetNumber(p); - for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) - { - itemid = PageGetItemId(p, i); - item = (IndexTuple) PageGetItem(p, itemid); - - if (i == *spl_left) - { - if (PageAddItem(left, (Item) item, IndexTupleSize(item), - leftoff, LP_USED) == InvalidOffsetNumber) - elog(ERROR, "rtdosplit: failed to copy index item in %s", - RelationGetRelationName(r)); - leftoff = OffsetNumberNext(leftoff); - spl_left++; /* advance in left split vector */ - } - else - { - Assert(i == *spl_right); - if (PageAddItem(right, (Item) item, IndexTupleSize(item), - rightoff, LP_USED) == InvalidOffsetNumber) - elog(ERROR, "rtdosplit: failed to copy index item in %s", - RelationGetRelationName(r)); - rightoff = OffsetNumberNext(rightoff); - spl_right++; /* advance in right split vector */ - } - } + newitemoff = OffsetNumberNext(maxoff); /* build an InsertIndexResult for this insertion */ res = (InsertIndexResult) palloc(sizeof(InsertIndexResultData)); - /* now insert the new index tuple */ - if (*spl_left == maxoff + 1) + /* + * spl_left contains a list of the offset numbers of the + * tuples that will go to the left page. For each offset + * number, get the tuple item, then add the item to the + * left page. Similarly for the right side. + */ + + /* fill left node */ + for (n = 0; n < v.spl_nleft; n++) { - if (PageAddItem(left, (Item) itup, IndexTupleSize(itup), + i = *spl_left; + if (i == newitemoff) + item = itup; + else + { + itemid = PageGetItemId(p, i); + item = (IndexTuple) PageGetItem(p, itemid); + } + + if (PageAddItem(left, (Item) item, IndexTupleSize(item), leftoff, LP_USED) == InvalidOffsetNumber) elog(ERROR, "rtdosplit: failed to add index item to %s", RelationGetRelationName(r)); leftoff = OffsetNumberNext(leftoff); - ItemPointerSet(&(res->pointerData), lbknum, leftoff); - spl_left++; + + if (i == newitemoff) + ItemPointerSet(&(res->pointerData), lbknum, leftoff); + + spl_left++; /* advance in left split vector */ } - else + + /* fill right node */ + for (n = 0; n < v.spl_nright; n++) { - Assert(*spl_right == maxoff + 1); - if (PageAddItem(right, (Item) itup, IndexTupleSize(itup), + i = *spl_right; + if (i == newitemoff) + item = itup; + else + { + itemid = PageGetItemId(p, i); + item = (IndexTuple) PageGetItem(p, itemid); + } + + if (PageAddItem(right, (Item) item, IndexTupleSize(item), rightoff, LP_USED) == InvalidOffsetNumber) elog(ERROR, "rtdosplit: failed to add index item to %s", RelationGetRelationName(r)); rightoff = OffsetNumberNext(rightoff); - ItemPointerSet(&(res->pointerData), rbknum, rightoff); - spl_right++; + + if (i == newitemoff) + ItemPointerSet(&(res->pointerData), rbknum, rightoff); + + spl_right++; /* advance in right split vector */ } /* Make sure we consumed all of the split vectors, and release 'em */ @@ -741,8 +765,10 @@ * In addition, the item to be added (itup) is listed in the appropriate * vector. It is represented by item number N+1 (N = # of items on page). * - * Both vectors appear in sequence order with a terminating sentinel value - * of InvalidOffsetNumber. + * Both vectors have a terminating sentinel value of InvalidOffsetNumber, + * but the sentinal value is no longer used, because the SPLITVEC + * vector also contains the length of each vector, and that information + * is now used to iterate over them in rtdosplit(). --kbb, 21 Sept 2001 * * The bounding-box datums for the two new pages are also returned in *v. * @@ -797,6 +823,12 @@ item_2_sz, left_avail_space, right_avail_space; + int total_num_tuples, + num_tuples_without_seeds, + max_after_split; /* in Guttman's lingo, (M - m) */ + float diff; /* diff between cost of putting tuple left or right */ + SPLITCOST *cost_vector; + int n; /* * First, make sure the new item is not so large that we can't @@ -812,6 +844,9 @@ maxoff = PageGetMaxOffsetNumber(page); newitemoff = OffsetNumberNext(maxoff); /* phony index for new * item */ + total_num_tuples = newitemoff; + num_tuples_without_seeds = total_num_tuples - 2; + max_after_split = total_num_tuples / 2; /* works for m = M/2 */ /* Make arrays big enough for worst case, including sentinel */ nbytes = (maxoff + 2) * sizeof(OffsetNumber); @@ -909,47 +944,111 @@ right_avail_space = RTPageAvailSpace - IndexTupleTotalSize(item_2); /* - * Now split up the regions between the two seeds. An important - * property of this split algorithm is that the split vector v has the - * indices of items to be split in order in its left and right - * vectors. We exploit this property by doing a merge in the code - * that actually splits the page. + * Now split up the regions between the two seeds. + * + * The cost_vector array will contain hints for determining where + * each tuple should go. Each record in the array will contain + * a boolean, choose_left, that indicates which node the tuple + * prefers to be on, and the absolute difference in cost between + * putting the tuple in its favored node and in the other node. * - * For efficiency, we also place the new index tuple in this loop. This - * is handled at the very end, when we have placed all the existing - * tuples and i == maxoff + 1. + * Later, we will sort the cost_vector in descending order by cost + * difference, and consider the tuples in that order for + * placement. That way, the tuples that *really* want to be in + * one node or the other get to choose first, and the tuples that + * don't really care choose last. + * + * First, build the cost_vector array. The new index tuple will + * also be handled in this loop, and represented in the array, + * with i==newitemoff. + * + * In the case of variable size tuples it is possible that we only + * have the two seeds and no other tuples, in which case we don't + * do any of this cost_vector stuff. */ + + /* to keep compiler quiet */ + cost_vector = (SPLITCOST *) NULL; + + if (num_tuples_without_seeds > 0) + { + cost_vector = + (SPLITCOST *) palloc(num_tuples_without_seeds * sizeof(SPLITCOST)); + n = 0; + for (i = FirstOffsetNumber; i <= newitemoff; i = OffsetNumberNext(i)) + { + /* Compute new union datums and sizes for both choices */ + + if ((i == seed_1) || (i == seed_2)) + continue; + else if (i == newitemoff) + item_1 = itup; + else + item_1 = (IndexTuple) PageGetItem(page, PageGetItemId(page, i)); + + datum_alpha = IndexTupleGetDatum(item_1); + union_dl = FunctionCall2(&rtstate->unionFn, datum_l, datum_alpha); + union_dr = FunctionCall2(&rtstate->unionFn, datum_r, datum_alpha); + FunctionCall2(&rtstate->sizeFn, union_dl, + PointerGetDatum(&size_alpha)); + FunctionCall2(&rtstate->sizeFn, union_dr, + PointerGetDatum(&size_beta)); + + diff = (size_alpha - size_l) - (size_beta - size_r); + + cost_vector[n].offset_number = i; + cost_vector[n].cost_differential = fabs(diff); + cost_vector[n].choose_left = (diff < 0); + + n++; + } + + /* + * Sort the array. The function qsort_comp_splitcost is + * set up "backwards", to provided descending order. + */ + qsort(cost_vector, num_tuples_without_seeds, sizeof(SPLITCOST), + &qsort_comp_splitcost); + } + + /* + * Now make the final decisions about where each tuple will go, + * and build the vectors to return in the SPLITVEC record. + * + * The cost_vector array contains (descriptions of) all the + * tuples, in the order that we want to consider them, so we + * we just iterate through it and place each tuple in left + * or right nodes, according to the criteria described below. + */ + left = v->spl_left; v->spl_nleft = 0; right = v->spl_right; v->spl_nright = 0; - for (i = FirstOffsetNumber; i <= newitemoff; i = OffsetNumberNext(i)) + /* Place the seeds first. + * left avail space, left union, right avail space, and right + * union have already been adjusted for the seeds. + */ + + *left++ = seed_1; + v->spl_nleft++; + + *right++ = seed_2; + v->spl_nright++; + + for (n = 0; n < num_tuples_without_seeds; n++) { bool left_feasible, right_feasible, choose_left; /* - * If we've already decided where to place this item, just put it - * on the correct list. Otherwise, we need to figure out which - * page needs the least enlargement in order to store the item. + * We need to figure out which page needs the least + * enlargement in order to store the item. */ - if (i == seed_1) - { - *left++ = i; - v->spl_nleft++; - /* left avail_space & union already includes this one */ - continue; - } - if (i == seed_2) - { - *right++ = i; - v->spl_nright++; - /* right avail_space & union already includes this one */ - continue; - } + i = cost_vector[n].offset_number; /* Compute new union datums and sizes for both possible additions */ if (i == newitemoff) @@ -979,6 +1078,24 @@ * (We know that all the old items together can fit on one page, so * we need not worry about any other problem than failing to fit * the new item.) + * + * Guttman's algorithm actually has two factors to consider (in + * order): 1. if one node has so many tuples already assigned to + * it that the other needs all the rest in order to satisfy the + * condition that neither node has fewer than m tuples, then + * that is decisive; 2. otherwise, choose the page that shows + * the smaller enlargement of its union area. + * + * I have chosen m = M/2, where M is the maximum number of + * tuples on a page. (Actually, this is only strictly + * true for fixed size tuples. For variable size tuples, + * there still might have to be only one tuple on a page, + * if it is really big. But even with variable size + * tuples we still try to get m as close as possible to M/2.) + * + * The question of which page shows the smaller enlargement of + * its union area has already been answered, and the answer + * stored in the choose_left field of the SPLITCOST record. */ left_feasible = (left_avail_space >= item_1_sz && ((left_avail_space - item_1_sz) >= newitemsz || @@ -988,8 +1105,18 @@ left_avail_space >= newitemsz)); if (left_feasible && right_feasible) { - /* Both feasible, use Guttman's algorithm */ - choose_left = (size_alpha - size_l < size_beta - size_r); + /* + * Both feasible, use Guttman's algorithm. + * First check the m condition described above, and if + * that doesn't apply, choose the page with the smaller + * enlargement of its union area. + */ + if (v->spl_nleft > max_after_split) + choose_left = false; + else if (v->spl_nright > max_after_split) + choose_left = true; + else + choose_left = cost_vector[n].choose_left; } else if (left_feasible) choose_left = true; @@ -1023,6 +1150,11 @@ } } + if (num_tuples_without_seeds > 0) + { + pfree(cost_vector); + } + *left = *right = InvalidOffsetNumber; /* add ending sentinels */ v->spl_ldatum = datum_l; @@ -1152,6 +1284,21 @@ fmgr_info(size_proc, &rtstate->sizeFn); fmgr_info(inter_proc, &rtstate->interFn); return; +} + +/* for sorting SPLITCOST records in descending order */ +static int +qsort_comp_splitcost(const void *a, const void *b) +{ + float diff = + ((SPLITCOST *)a)->cost_differential - + ((SPLITCOST *)b)->cost_differential; + if (diff < 0) + return 1; + else if (diff > 0) + return -1; + else + return 0; } #ifdef RTDEBUG
Your patch has been added to the PostgreSQL unapplied patches list at: http://candle.pha.pa.us/cgi-bin/pgpatches I will try to apply it within the next 48 hours. > I have made three changes to the rtree code: one bug fix and > two performance improvements. I put an explanation of the > changes at > > http://cs1.cs.nyu.edu/been/postgres-rtree.html > > The performance improvements are quite significant. > > All the changes are in the file src/backend/access/rtree/rtree.c > > I was working with the 7.1.3 code. > > I'm including the diff output as an attachment. > > Ken > > diff -Naur postgresql-7.1.3.old/src/backend/access/rtree/rtree.c postgresql-7.1.3/src/backend/access/rtree/rtree.c > --- postgresql-7.1.3.old/src/backend/access/rtree/rtree.c Wed Mar 21 22:59:16 2001 > +++ postgresql-7.1.3/src/backend/access/rtree/rtree.c Tue Sep 25 13:56:31 2001 > @@ -55,6 +55,14 @@ > Datum spl_rdatum; > } SPLITVEC; > > +/* for sorting tuples by cost, for picking split */ > +typedef struct SPLITCOST > +{ > + OffsetNumber offset_number; > + float cost_differential; > + bool choose_left; > +} SPLITCOST; > + > typedef struct RTSTATE > { > FmgrInfo unionFn; /* union function */ > @@ -79,6 +87,7 @@ > RTSTATE *rtstate); > static int nospace(Page p, IndexTuple it); > static void initRtstate(RTSTATE *rtstate, Relation index); > +static int qsort_comp_splitcost(const void *a, const void *b); > > > Datum > @@ -427,7 +436,12 @@ > FunctionCall2(&rtstate->sizeFn, datum, > PointerGetDatum(&newd_size)); > > - if (newd_size != old_size) > + /* > + * If newd_size == 0 we have degenerate rectangles, so we > + * don't know if there was any change, so we have to > + * assume there was. > + */ > + if ((newd_size == 0) || (newd_size != old_size)) > { > TupleDesc td = RelationGetDescr(r); > > @@ -503,6 +517,8 @@ > OffsetNumber *spl_left, > *spl_right; > TupleDesc tupDesc; > + int n; > + OffsetNumber newitemoff; > > p = (Page) BufferGetPage(buffer); > opaque = (RTreePageOpaque) PageGetSpecialPointer(p); > @@ -539,56 +555,64 @@ > spl_right = v.spl_right; > leftoff = rightoff = FirstOffsetNumber; > maxoff = PageGetMaxOffsetNumber(p); > - for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) > - { > - itemid = PageGetItemId(p, i); > - item = (IndexTuple) PageGetItem(p, itemid); > - > - if (i == *spl_left) > - { > - if (PageAddItem(left, (Item) item, IndexTupleSize(item), > - leftoff, LP_USED) == InvalidOffsetNumber) > - elog(ERROR, "rtdosplit: failed to copy index item in %s", > - RelationGetRelationName(r)); > - leftoff = OffsetNumberNext(leftoff); > - spl_left++; /* advance in left split vector */ > - } > - else > - { > - Assert(i == *spl_right); > - if (PageAddItem(right, (Item) item, IndexTupleSize(item), > - rightoff, LP_USED) == InvalidOffsetNumber) > - elog(ERROR, "rtdosplit: failed to copy index item in %s", > - RelationGetRelationName(r)); > - rightoff = OffsetNumberNext(rightoff); > - spl_right++; /* advance in right split vector */ > - } > - } > + newitemoff = OffsetNumberNext(maxoff); > > /* build an InsertIndexResult for this insertion */ > res = (InsertIndexResult) palloc(sizeof(InsertIndexResultData)); > > - /* now insert the new index tuple */ > - if (*spl_left == maxoff + 1) > + /* > + * spl_left contains a list of the offset numbers of the > + * tuples that will go to the left page. For each offset > + * number, get the tuple item, then add the item to the > + * left page. Similarly for the right side. > + */ > + > + /* fill left node */ > + for (n = 0; n < v.spl_nleft; n++) > { > - if (PageAddItem(left, (Item) itup, IndexTupleSize(itup), > + i = *spl_left; > + if (i == newitemoff) > + item = itup; > + else > + { > + itemid = PageGetItemId(p, i); > + item = (IndexTuple) PageGetItem(p, itemid); > + } > + > + if (PageAddItem(left, (Item) item, IndexTupleSize(item), > leftoff, LP_USED) == InvalidOffsetNumber) > elog(ERROR, "rtdosplit: failed to add index item to %s", > RelationGetRelationName(r)); > leftoff = OffsetNumberNext(leftoff); > - ItemPointerSet(&(res->pointerData), lbknum, leftoff); > - spl_left++; > + > + if (i == newitemoff) > + ItemPointerSet(&(res->pointerData), lbknum, leftoff); > + > + spl_left++; /* advance in left split vector */ > } > - else > + > + /* fill right node */ > + for (n = 0; n < v.spl_nright; n++) > { > - Assert(*spl_right == maxoff + 1); > - if (PageAddItem(right, (Item) itup, IndexTupleSize(itup), > + i = *spl_right; > + if (i == newitemoff) > + item = itup; > + else > + { > + itemid = PageGetItemId(p, i); > + item = (IndexTuple) PageGetItem(p, itemid); > + } > + > + if (PageAddItem(right, (Item) item, IndexTupleSize(item), > rightoff, LP_USED) == InvalidOffsetNumber) > elog(ERROR, "rtdosplit: failed to add index item to %s", > RelationGetRelationName(r)); > rightoff = OffsetNumberNext(rightoff); > - ItemPointerSet(&(res->pointerData), rbknum, rightoff); > - spl_right++; > + > + if (i == newitemoff) > + ItemPointerSet(&(res->pointerData), rbknum, rightoff); > + > + spl_right++; /* advance in right split vector */ > } > > /* Make sure we consumed all of the split vectors, and release 'em */ > @@ -741,8 +765,10 @@ > * In addition, the item to be added (itup) is listed in the appropriate > * vector. It is represented by item number N+1 (N = # of items on page). > * > - * Both vectors appear in sequence order with a terminating sentinel value > - * of InvalidOffsetNumber. > + * Both vectors have a terminating sentinel value of InvalidOffsetNumber, > + * but the sentinal value is no longer used, because the SPLITVEC > + * vector also contains the length of each vector, and that information > + * is now used to iterate over them in rtdosplit(). --kbb, 21 Sept 2001 > * > * The bounding-box datums for the two new pages are also returned in *v. > * > @@ -797,6 +823,12 @@ > item_2_sz, > left_avail_space, > right_avail_space; > + int total_num_tuples, > + num_tuples_without_seeds, > + max_after_split; /* in Guttman's lingo, (M - m) */ > + float diff; /* diff between cost of putting tuple left or right */ > + SPLITCOST *cost_vector; > + int n; > > /* > * First, make sure the new item is not so large that we can't > @@ -812,6 +844,9 @@ > maxoff = PageGetMaxOffsetNumber(page); > newitemoff = OffsetNumberNext(maxoff); /* phony index for new > * item */ > + total_num_tuples = newitemoff; > + num_tuples_without_seeds = total_num_tuples - 2; > + max_after_split = total_num_tuples / 2; /* works for m = M/2 */ > > /* Make arrays big enough for worst case, including sentinel */ > nbytes = (maxoff + 2) * sizeof(OffsetNumber); > @@ -909,47 +944,111 @@ > right_avail_space = RTPageAvailSpace - IndexTupleTotalSize(item_2); > > /* > - * Now split up the regions between the two seeds. An important > - * property of this split algorithm is that the split vector v has the > - * indices of items to be split in order in its left and right > - * vectors. We exploit this property by doing a merge in the code > - * that actually splits the page. > + * Now split up the regions between the two seeds. > + * > + * The cost_vector array will contain hints for determining where > + * each tuple should go. Each record in the array will contain > + * a boolean, choose_left, that indicates which node the tuple > + * prefers to be on, and the absolute difference in cost between > + * putting the tuple in its favored node and in the other node. > * > - * For efficiency, we also place the new index tuple in this loop. This > - * is handled at the very end, when we have placed all the existing > - * tuples and i == maxoff + 1. > + * Later, we will sort the cost_vector in descending order by cost > + * difference, and consider the tuples in that order for > + * placement. That way, the tuples that *really* want to be in > + * one node or the other get to choose first, and the tuples that > + * don't really care choose last. > + * > + * First, build the cost_vector array. The new index tuple will > + * also be handled in this loop, and represented in the array, > + * with i==newitemoff. > + * > + * In the case of variable size tuples it is possible that we only > + * have the two seeds and no other tuples, in which case we don't > + * do any of this cost_vector stuff. > */ > + > + /* to keep compiler quiet */ > + cost_vector = (SPLITCOST *) NULL; > + > + if (num_tuples_without_seeds > 0) > + { > + cost_vector = > + (SPLITCOST *) palloc(num_tuples_without_seeds * sizeof(SPLITCOST)); > + n = 0; > + for (i = FirstOffsetNumber; i <= newitemoff; i = OffsetNumberNext(i)) > + { > + /* Compute new union datums and sizes for both choices */ > + > + if ((i == seed_1) || (i == seed_2)) > + continue; > + else if (i == newitemoff) > + item_1 = itup; > + else > + item_1 = (IndexTuple) PageGetItem(page, PageGetItemId(page, i)); > + > + datum_alpha = IndexTupleGetDatum(item_1); > + union_dl = FunctionCall2(&rtstate->unionFn, datum_l, datum_alpha); > + union_dr = FunctionCall2(&rtstate->unionFn, datum_r, datum_alpha); > + FunctionCall2(&rtstate->sizeFn, union_dl, > + PointerGetDatum(&size_alpha)); > + FunctionCall2(&rtstate->sizeFn, union_dr, > + PointerGetDatum(&size_beta)); > + > + diff = (size_alpha - size_l) - (size_beta - size_r); > + > + cost_vector[n].offset_number = i; > + cost_vector[n].cost_differential = fabs(diff); > + cost_vector[n].choose_left = (diff < 0); > + > + n++; > + } > + > + /* > + * Sort the array. The function qsort_comp_splitcost is > + * set up "backwards", to provided descending order. > + */ > + qsort(cost_vector, num_tuples_without_seeds, sizeof(SPLITCOST), > + &qsort_comp_splitcost); > + } > + > + /* > + * Now make the final decisions about where each tuple will go, > + * and build the vectors to return in the SPLITVEC record. > + * > + * The cost_vector array contains (descriptions of) all the > + * tuples, in the order that we want to consider them, so we > + * we just iterate through it and place each tuple in left > + * or right nodes, according to the criteria described below. > + */ > + > left = v->spl_left; > v->spl_nleft = 0; > right = v->spl_right; > v->spl_nright = 0; > > - for (i = FirstOffsetNumber; i <= newitemoff; i = OffsetNumberNext(i)) > + /* Place the seeds first. > + * left avail space, left union, right avail space, and right > + * union have already been adjusted for the seeds. > + */ > + > + *left++ = seed_1; > + v->spl_nleft++; > + > + *right++ = seed_2; > + v->spl_nright++; > + > + for (n = 0; n < num_tuples_without_seeds; n++) > { > bool left_feasible, > right_feasible, > choose_left; > > /* > - * If we've already decided where to place this item, just put it > - * on the correct list. Otherwise, we need to figure out which > - * page needs the least enlargement in order to store the item. > + * We need to figure out which page needs the least > + * enlargement in order to store the item. > */ > > - if (i == seed_1) > - { > - *left++ = i; > - v->spl_nleft++; > - /* left avail_space & union already includes this one */ > - continue; > - } > - if (i == seed_2) > - { > - *right++ = i; > - v->spl_nright++; > - /* right avail_space & union already includes this one */ > - continue; > - } > + i = cost_vector[n].offset_number; > > /* Compute new union datums and sizes for both possible additions */ > if (i == newitemoff) > @@ -979,6 +1078,24 @@ > * (We know that all the old items together can fit on one page, so > * we need not worry about any other problem than failing to fit > * the new item.) > + * > + * Guttman's algorithm actually has two factors to consider (in > + * order): 1. if one node has so many tuples already assigned to > + * it that the other needs all the rest in order to satisfy the > + * condition that neither node has fewer than m tuples, then > + * that is decisive; 2. otherwise, choose the page that shows > + * the smaller enlargement of its union area. > + * > + * I have chosen m = M/2, where M is the maximum number of > + * tuples on a page. (Actually, this is only strictly > + * true for fixed size tuples. For variable size tuples, > + * there still might have to be only one tuple on a page, > + * if it is really big. But even with variable size > + * tuples we still try to get m as close as possible to M/2.) > + * > + * The question of which page shows the smaller enlargement of > + * its union area has already been answered, and the answer > + * stored in the choose_left field of the SPLITCOST record. > */ > left_feasible = (left_avail_space >= item_1_sz && > ((left_avail_space - item_1_sz) >= newitemsz || > @@ -988,8 +1105,18 @@ > left_avail_space >= newitemsz)); > if (left_feasible && right_feasible) > { > - /* Both feasible, use Guttman's algorithm */ > - choose_left = (size_alpha - size_l < size_beta - size_r); > + /* > + * Both feasible, use Guttman's algorithm. > + * First check the m condition described above, and if > + * that doesn't apply, choose the page with the smaller > + * enlargement of its union area. > + */ > + if (v->spl_nleft > max_after_split) > + choose_left = false; > + else if (v->spl_nright > max_after_split) > + choose_left = true; > + else > + choose_left = cost_vector[n].choose_left; > } > else if (left_feasible) > choose_left = true; > @@ -1023,6 +1150,11 @@ > } > } > > + if (num_tuples_without_seeds > 0) > + { > + pfree(cost_vector); > + } > + > *left = *right = InvalidOffsetNumber; /* add ending sentinels */ > > v->spl_ldatum = datum_l; > @@ -1152,6 +1284,21 @@ > fmgr_info(size_proc, &rtstate->sizeFn); > fmgr_info(inter_proc, &rtstate->interFn); > return; > +} > + > +/* for sorting SPLITCOST records in descending order */ > +static int > +qsort_comp_splitcost(const void *a, const void *b) > +{ > + float diff = > + ((SPLITCOST *)a)->cost_differential - > + ((SPLITCOST *)b)->cost_differential; > + if (diff < 0) > + return 1; > + else if (diff > 0) > + return -1; > + else > + return 0; > } > > #ifdef RTDEBUG > > ---------------------------(end of broadcast)--------------------------- > TIP 6: Have you searched our list archives? > > http://archives.postgresql.org -- Bruce Momjian | http://candle.pha.pa.us pgman@candle.pha.pa.us | (610) 853-3000 + If your life is a hard drive, | 830 Blythe Avenue + Christ can be your backup. | Drexel Hill, Pennsylvania 19026
Patch applied. Thanks. > I have made three changes to the rtree code: one bug fix and > two performance improvements. I put an explanation of the > changes at > > http://cs1.cs.nyu.edu/been/postgres-rtree.html > > The performance improvements are quite significant. > > All the changes are in the file src/backend/access/rtree/rtree.c > > I was working with the 7.1.3 code. > > I'm including the diff output as an attachment. > > Ken > > diff -Naur postgresql-7.1.3.old/src/backend/access/rtree/rtree.c postgresql-7.1.3/src/backend/access/rtree/rtree.c > --- postgresql-7.1.3.old/src/backend/access/rtree/rtree.c Wed Mar 21 22:59:16 2001 > +++ postgresql-7.1.3/src/backend/access/rtree/rtree.c Tue Sep 25 13:56:31 2001 > @@ -55,6 +55,14 @@ > Datum spl_rdatum; > } SPLITVEC; > > +/* for sorting tuples by cost, for picking split */ > +typedef struct SPLITCOST > +{ > + OffsetNumber offset_number; > + float cost_differential; > + bool choose_left; > +} SPLITCOST; > + > typedef struct RTSTATE > { > FmgrInfo unionFn; /* union function */ > @@ -79,6 +87,7 @@ > RTSTATE *rtstate); > static int nospace(Page p, IndexTuple it); > static void initRtstate(RTSTATE *rtstate, Relation index); > +static int qsort_comp_splitcost(const void *a, const void *b); > > > Datum > @@ -427,7 +436,12 @@ > FunctionCall2(&rtstate->sizeFn, datum, > PointerGetDatum(&newd_size)); > > - if (newd_size != old_size) > + /* > + * If newd_size == 0 we have degenerate rectangles, so we > + * don't know if there was any change, so we have to > + * assume there was. > + */ > + if ((newd_size == 0) || (newd_size != old_size)) > { > TupleDesc td = RelationGetDescr(r); > > @@ -503,6 +517,8 @@ > OffsetNumber *spl_left, > *spl_right; > TupleDesc tupDesc; > + int n; > + OffsetNumber newitemoff; > > p = (Page) BufferGetPage(buffer); > opaque = (RTreePageOpaque) PageGetSpecialPointer(p); > @@ -539,56 +555,64 @@ > spl_right = v.spl_right; > leftoff = rightoff = FirstOffsetNumber; > maxoff = PageGetMaxOffsetNumber(p); > - for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) > - { > - itemid = PageGetItemId(p, i); > - item = (IndexTuple) PageGetItem(p, itemid); > - > - if (i == *spl_left) > - { > - if (PageAddItem(left, (Item) item, IndexTupleSize(item), > - leftoff, LP_USED) == InvalidOffsetNumber) > - elog(ERROR, "rtdosplit: failed to copy index item in %s", > - RelationGetRelationName(r)); > - leftoff = OffsetNumberNext(leftoff); > - spl_left++; /* advance in left split vector */ > - } > - else > - { > - Assert(i == *spl_right); > - if (PageAddItem(right, (Item) item, IndexTupleSize(item), > - rightoff, LP_USED) == InvalidOffsetNumber) > - elog(ERROR, "rtdosplit: failed to copy index item in %s", > - RelationGetRelationName(r)); > - rightoff = OffsetNumberNext(rightoff); > - spl_right++; /* advance in right split vector */ > - } > - } > + newitemoff = OffsetNumberNext(maxoff); > > /* build an InsertIndexResult for this insertion */ > res = (InsertIndexResult) palloc(sizeof(InsertIndexResultData)); > > - /* now insert the new index tuple */ > - if (*spl_left == maxoff + 1) > + /* > + * spl_left contains a list of the offset numbers of the > + * tuples that will go to the left page. For each offset > + * number, get the tuple item, then add the item to the > + * left page. Similarly for the right side. > + */ > + > + /* fill left node */ > + for (n = 0; n < v.spl_nleft; n++) > { > - if (PageAddItem(left, (Item) itup, IndexTupleSize(itup), > + i = *spl_left; > + if (i == newitemoff) > + item = itup; > + else > + { > + itemid = PageGetItemId(p, i); > + item = (IndexTuple) PageGetItem(p, itemid); > + } > + > + if (PageAddItem(left, (Item) item, IndexTupleSize(item), > leftoff, LP_USED) == InvalidOffsetNumber) > elog(ERROR, "rtdosplit: failed to add index item to %s", > RelationGetRelationName(r)); > leftoff = OffsetNumberNext(leftoff); > - ItemPointerSet(&(res->pointerData), lbknum, leftoff); > - spl_left++; > + > + if (i == newitemoff) > + ItemPointerSet(&(res->pointerData), lbknum, leftoff); > + > + spl_left++; /* advance in left split vector */ > } > - else > + > + /* fill right node */ > + for (n = 0; n < v.spl_nright; n++) > { > - Assert(*spl_right == maxoff + 1); > - if (PageAddItem(right, (Item) itup, IndexTupleSize(itup), > + i = *spl_right; > + if (i == newitemoff) > + item = itup; > + else > + { > + itemid = PageGetItemId(p, i); > + item = (IndexTuple) PageGetItem(p, itemid); > + } > + > + if (PageAddItem(right, (Item) item, IndexTupleSize(item), > rightoff, LP_USED) == InvalidOffsetNumber) > elog(ERROR, "rtdosplit: failed to add index item to %s", > RelationGetRelationName(r)); > rightoff = OffsetNumberNext(rightoff); > - ItemPointerSet(&(res->pointerData), rbknum, rightoff); > - spl_right++; > + > + if (i == newitemoff) > + ItemPointerSet(&(res->pointerData), rbknum, rightoff); > + > + spl_right++; /* advance in right split vector */ > } > > /* Make sure we consumed all of the split vectors, and release 'em */ > @@ -741,8 +765,10 @@ > * In addition, the item to be added (itup) is listed in the appropriate > * vector. It is represented by item number N+1 (N = # of items on page). > * > - * Both vectors appear in sequence order with a terminating sentinel value > - * of InvalidOffsetNumber. > + * Both vectors have a terminating sentinel value of InvalidOffsetNumber, > + * but the sentinal value is no longer used, because the SPLITVEC > + * vector also contains the length of each vector, and that information > + * is now used to iterate over them in rtdosplit(). --kbb, 21 Sept 2001 > * > * The bounding-box datums for the two new pages are also returned in *v. > * > @@ -797,6 +823,12 @@ > item_2_sz, > left_avail_space, > right_avail_space; > + int total_num_tuples, > + num_tuples_without_seeds, > + max_after_split; /* in Guttman's lingo, (M - m) */ > + float diff; /* diff between cost of putting tuple left or right */ > + SPLITCOST *cost_vector; > + int n; > > /* > * First, make sure the new item is not so large that we can't > @@ -812,6 +844,9 @@ > maxoff = PageGetMaxOffsetNumber(page); > newitemoff = OffsetNumberNext(maxoff); /* phony index for new > * item */ > + total_num_tuples = newitemoff; > + num_tuples_without_seeds = total_num_tuples - 2; > + max_after_split = total_num_tuples / 2; /* works for m = M/2 */ > > /* Make arrays big enough for worst case, including sentinel */ > nbytes = (maxoff + 2) * sizeof(OffsetNumber); > @@ -909,47 +944,111 @@ > right_avail_space = RTPageAvailSpace - IndexTupleTotalSize(item_2); > > /* > - * Now split up the regions between the two seeds. An important > - * property of this split algorithm is that the split vector v has the > - * indices of items to be split in order in its left and right > - * vectors. We exploit this property by doing a merge in the code > - * that actually splits the page. > + * Now split up the regions between the two seeds. > + * > + * The cost_vector array will contain hints for determining where > + * each tuple should go. Each record in the array will contain > + * a boolean, choose_left, that indicates which node the tuple > + * prefers to be on, and the absolute difference in cost between > + * putting the tuple in its favored node and in the other node. > * > - * For efficiency, we also place the new index tuple in this loop. This > - * is handled at the very end, when we have placed all the existing > - * tuples and i == maxoff + 1. > + * Later, we will sort the cost_vector in descending order by cost > + * difference, and consider the tuples in that order for > + * placement. That way, the tuples that *really* want to be in > + * one node or the other get to choose first, and the tuples that > + * don't really care choose last. > + * > + * First, build the cost_vector array. The new index tuple will > + * also be handled in this loop, and represented in the array, > + * with i==newitemoff. > + * > + * In the case of variable size tuples it is possible that we only > + * have the two seeds and no other tuples, in which case we don't > + * do any of this cost_vector stuff. > */ > + > + /* to keep compiler quiet */ > + cost_vector = (SPLITCOST *) NULL; > + > + if (num_tuples_without_seeds > 0) > + { > + cost_vector = > + (SPLITCOST *) palloc(num_tuples_without_seeds * sizeof(SPLITCOST)); > + n = 0; > + for (i = FirstOffsetNumber; i <= newitemoff; i = OffsetNumberNext(i)) > + { > + /* Compute new union datums and sizes for both choices */ > + > + if ((i == seed_1) || (i == seed_2)) > + continue; > + else if (i == newitemoff) > + item_1 = itup; > + else > + item_1 = (IndexTuple) PageGetItem(page, PageGetItemId(page, i)); > + > + datum_alpha = IndexTupleGetDatum(item_1); > + union_dl = FunctionCall2(&rtstate->unionFn, datum_l, datum_alpha); > + union_dr = FunctionCall2(&rtstate->unionFn, datum_r, datum_alpha); > + FunctionCall2(&rtstate->sizeFn, union_dl, > + PointerGetDatum(&size_alpha)); > + FunctionCall2(&rtstate->sizeFn, union_dr, > + PointerGetDatum(&size_beta)); > + > + diff = (size_alpha - size_l) - (size_beta - size_r); > + > + cost_vector[n].offset_number = i; > + cost_vector[n].cost_differential = fabs(diff); > + cost_vector[n].choose_left = (diff < 0); > + > + n++; > + } > + > + /* > + * Sort the array. The function qsort_comp_splitcost is > + * set up "backwards", to provided descending order. > + */ > + qsort(cost_vector, num_tuples_without_seeds, sizeof(SPLITCOST), > + &qsort_comp_splitcost); > + } > + > + /* > + * Now make the final decisions about where each tuple will go, > + * and build the vectors to return in the SPLITVEC record. > + * > + * The cost_vector array contains (descriptions of) all the > + * tuples, in the order that we want to consider them, so we > + * we just iterate through it and place each tuple in left > + * or right nodes, according to the criteria described below. > + */ > + > left = v->spl_left; > v->spl_nleft = 0; > right = v->spl_right; > v->spl_nright = 0; > > - for (i = FirstOffsetNumber; i <= newitemoff; i = OffsetNumberNext(i)) > + /* Place the seeds first. > + * left avail space, left union, right avail space, and right > + * union have already been adjusted for the seeds. > + */ > + > + *left++ = seed_1; > + v->spl_nleft++; > + > + *right++ = seed_2; > + v->spl_nright++; > + > + for (n = 0; n < num_tuples_without_seeds; n++) > { > bool left_feasible, > right_feasible, > choose_left; > > /* > - * If we've already decided where to place this item, just put it > - * on the correct list. Otherwise, we need to figure out which > - * page needs the least enlargement in order to store the item. > + * We need to figure out which page needs the least > + * enlargement in order to store the item. > */ > > - if (i == seed_1) > - { > - *left++ = i; > - v->spl_nleft++; > - /* left avail_space & union already includes this one */ > - continue; > - } > - if (i == seed_2) > - { > - *right++ = i; > - v->spl_nright++; > - /* right avail_space & union already includes this one */ > - continue; > - } > + i = cost_vector[n].offset_number; > > /* Compute new union datums and sizes for both possible additions */ > if (i == newitemoff) > @@ -979,6 +1078,24 @@ > * (We know that all the old items together can fit on one page, so > * we need not worry about any other problem than failing to fit > * the new item.) > + * > + * Guttman's algorithm actually has two factors to consider (in > + * order): 1. if one node has so many tuples already assigned to > + * it that the other needs all the rest in order to satisfy the > + * condition that neither node has fewer than m tuples, then > + * that is decisive; 2. otherwise, choose the page that shows > + * the smaller enlargement of its union area. > + * > + * I have chosen m = M/2, where M is the maximum number of > + * tuples on a page. (Actually, this is only strictly > + * true for fixed size tuples. For variable size tuples, > + * there still might have to be only one tuple on a page, > + * if it is really big. But even with variable size > + * tuples we still try to get m as close as possible to M/2.) > + * > + * The question of which page shows the smaller enlargement of > + * its union area has already been answered, and the answer > + * stored in the choose_left field of the SPLITCOST record. > */ > left_feasible = (left_avail_space >= item_1_sz && > ((left_avail_space - item_1_sz) >= newitemsz || > @@ -988,8 +1105,18 @@ > left_avail_space >= newitemsz)); > if (left_feasible && right_feasible) > { > - /* Both feasible, use Guttman's algorithm */ > - choose_left = (size_alpha - size_l < size_beta - size_r); > + /* > + * Both feasible, use Guttman's algorithm. > + * First check the m condition described above, and if > + * that doesn't apply, choose the page with the smaller > + * enlargement of its union area. > + */ > + if (v->spl_nleft > max_after_split) > + choose_left = false; > + else if (v->spl_nright > max_after_split) > + choose_left = true; > + else > + choose_left = cost_vector[n].choose_left; > } > else if (left_feasible) > choose_left = true; > @@ -1023,6 +1150,11 @@ > } > } > > + if (num_tuples_without_seeds > 0) > + { > + pfree(cost_vector); > + } > + > *left = *right = InvalidOffsetNumber; /* add ending sentinels */ > > v->spl_ldatum = datum_l; > @@ -1152,6 +1284,21 @@ > fmgr_info(size_proc, &rtstate->sizeFn); > fmgr_info(inter_proc, &rtstate->interFn); > return; > +} > + > +/* for sorting SPLITCOST records in descending order */ > +static int > +qsort_comp_splitcost(const void *a, const void *b) > +{ > + float diff = > + ((SPLITCOST *)a)->cost_differential - > + ((SPLITCOST *)b)->cost_differential; > + if (diff < 0) > + return 1; > + else if (diff > 0) > + return -1; > + else > + return 0; > } > > #ifdef RTDEBUG > > ---------------------------(end of broadcast)--------------------------- > TIP 6: Have you searched our list archives? > > http://archives.postgresql.org -- Bruce Momjian | http://candle.pha.pa.us pgman@candle.pha.pa.us | (610) 853-3000 + If your life is a hard drive, | 830 Blythe Avenue + Christ can be your backup. | Drexel Hill, Pennsylvania 19026