Robert Haas <robertmhaas@gmail.com> writes:
> On Sat, Jul 31, 2010 at 12:32 PM, Tom Lane <tgl@sss.pgh.pa.us> wrote:
>> So it'd definitely be a lot better than now. Maybe there'd be some
>> remaining performance gap for non-pathological cases, but I think we
>> would be willing to pay that in order to avoid bad behavior for the
>> pathological cases. It's difficult to say for sure of course
>> without going to the trouble of coding and testing it.
> Well, it sounds like a reasonable thing to try, then. You going to
> take a crack at it?
This worked out pretty well, so I've applied the attached patch.
I observed the following timings running the index build from
Artur Dobrowski's example:
8.4 9.0b4 HEAD
maintenance_work_mem setting 100MB 60MB 100MB
actual peak process image size 118M 126M 112M
elapsed time 964s 1289s 937s
so the memory bloat problem is definitely fixed and the speed
seems satisfactory.
It might be worth commenting that after I'd rewritten the rbtree code,
I spent awhile pulling my hair out because the code *still* blew past
the expected memory usage. It turned out that there were some
significant leaks in ginEntryInsert and subsidiary routines, causing
memory usage to continue to grow even once we realized we'd hit
maintenance_work_mem and started to dump data to disk. These leaks were
there before, but were masked in 8.4 because the old ginGetEntry code
incrementally pfreed itempointer-list storage as it walked the data
structure; this caused storage to be released faster than the leaks
would consume it. The new code doesn't release any of that storage
until the MemoryContextReset after the dump loop, so any leakage in
the dump loop becomes a visible increase in the peak space consumption.
I wasn't able to remove all of the leakage --- there's some fairly ugly
coding associated with index page splits that leaks an indextuple or so
per split, and I'm not sure where the extra tuple could safely be
pfree'd. That seems to be small enough to live with though; it's less
than 0.5% of the total memory usage in the example I'm working with.
I also cleaned up some other stuff that would've bit us eventually,
like unnecessary use of recursion in the rbtree iteration code.
regards, tom lane
Index: src/backend/access/gin/ginbtree.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/access/gin/ginbtree.c,v
retrieving revision 1.15
diff -c -r1.15 ginbtree.c
*** src/backend/access/gin/ginbtree.c 2 Jan 2010 16:57:33 -0000 1.15
--- src/backend/access/gin/ginbtree.c 1 Aug 2010 01:44:08 -0000
***************
*** 267,272 ****
--- 267,274 ----
/*
* Insert value (stored in GinBtree) to tree described by stack
+ *
+ * NB: the passed-in stack is freed, as though by freeGinBtreeStack.
*/
void
ginInsertValue(GinBtree btree, GinBtreeStack *stack)
***************
*** 308,317 ****
PageSetTLI(page, ThisTimeLineID);
}
! UnlockReleaseBuffer(stack->buffer);
END_CRIT_SECTION();
! freeGinBtreeStack(stack->parent);
return;
}
else
--- 310,320 ----
PageSetTLI(page, ThisTimeLineID);
}
! LockBuffer(stack->buffer, GIN_UNLOCK);
END_CRIT_SECTION();
! freeGinBtreeStack(stack);
!
return;
}
else
***************
*** 325,331 ****
*/
newlpage = btree->splitPage(btree, stack->buffer, rbuffer, stack->off, &rdata);
-
((ginxlogSplit *) (rdata->data))->rootBlkno = rootBlkno;
parent = stack->parent;
--- 328,333 ----
***************
*** 341,347 ****
((ginxlogSplit *) (rdata->data))->isRootSplit = TRUE;
((ginxlogSplit *) (rdata->data))->rrlink = InvalidBlockNumber;
-
page = BufferGetPage(stack->buffer);
lpage = BufferGetPage(lbuffer);
rpage = BufferGetPage(rbuffer);
--- 343,348 ----
***************
*** 375,384 ****
UnlockReleaseBuffer(rbuffer);
UnlockReleaseBuffer(lbuffer);
! UnlockReleaseBuffer(stack->buffer);
!
END_CRIT_SECTION();
return;
}
else
--- 376,386 ----
UnlockReleaseBuffer(rbuffer);
UnlockReleaseBuffer(lbuffer);
! LockBuffer(stack->buffer, GIN_UNLOCK);
END_CRIT_SECTION();
+ freeGinBtreeStack(stack);
+
return;
}
else
Index: src/backend/access/gin/ginbulk.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/access/gin/ginbulk.c,v
retrieving revision 1.19
diff -c -r1.19 ginbulk.c
*** src/backend/access/gin/ginbulk.c 26 Feb 2010 02:00:33 -0000 1.19
--- src/backend/access/gin/ginbulk.c 1 Aug 2010 01:44:08 -0000
***************
*** 19,35 ****
#include "utils/memutils.h"
! #define DEF_NENTRY 2048
! #define DEF_NPTR 4
- static void *
- ginAppendData(void *old, void *new, void *arg)
- {
- EntryAccumulator *eo = (EntryAccumulator *) old,
- *en = (EntryAccumulator *) new;
BuildAccumulator *accum = (BuildAccumulator *) arg;
if (eo->number >= eo->length)
{
accum->allocatedMemory -= GetMemoryChunkSpace(eo->list);
--- 19,39 ----
#include "utils/memutils.h"
! #define DEF_NENTRY 2048 /* EntryAccumulator allocation quantum */
! #define DEF_NPTR 5 /* ItemPointer initial allocation quantum */
+ /* Combiner function for rbtree.c */
+ static void
+ ginCombineData(RBNode *existing, const RBNode *newdata, void *arg)
+ {
+ EntryAccumulator *eo = (EntryAccumulator *) existing;
+ const EntryAccumulator *en = (const EntryAccumulator *) newdata;
BuildAccumulator *accum = (BuildAccumulator *) arg;
+ /*
+ * Note this code assumes that newdata contains only one itempointer.
+ */
if (eo->number >= eo->length)
{
accum->allocatedMemory -= GetMemoryChunkSpace(eo->list);
***************
*** 53,81 ****
eo->list[eo->number] = en->list[0];
eo->number++;
-
- return old;
}
static int
! cmpEntryAccumulator(const void *a, const void *b, void *arg)
{
! EntryAccumulator *ea = (EntryAccumulator *) a;
! EntryAccumulator *eb = (EntryAccumulator *) b;
BuildAccumulator *accum = (BuildAccumulator *) arg;
return compareAttEntries(accum->ginstate, ea->attnum, ea->value,
eb->attnum, eb->value);
}
void
ginInitBA(BuildAccumulator *accum)
{
accum->allocatedMemory = 0;
accum->entryallocator = NULL;
! accum->tree = rb_create(cmpEntryAccumulator, ginAppendData, NULL, accum);
! accum->iterator = NULL;
! accum->tmpList = NULL;
}
/*
--- 57,113 ----
eo->list[eo->number] = en->list[0];
eo->number++;
}
+ /* Comparator function for rbtree.c */
static int
! cmpEntryAccumulator(const RBNode *a, const RBNode *b, void *arg)
{
! const EntryAccumulator *ea = (const EntryAccumulator *) a;
! const EntryAccumulator *eb = (const EntryAccumulator *) b;
BuildAccumulator *accum = (BuildAccumulator *) arg;
return compareAttEntries(accum->ginstate, ea->attnum, ea->value,
eb->attnum, eb->value);
}
+ /* Allocator function for rbtree.c */
+ static RBNode *
+ ginAllocEntryAccumulator(void *arg)
+ {
+ BuildAccumulator *accum = (BuildAccumulator *) arg;
+ EntryAccumulator *ea;
+
+ /*
+ * Allocate memory by rather big chunks to decrease overhead. We have
+ * no need to reclaim RBNodes individually, so this costs nothing.
+ */
+ if (accum->entryallocator == NULL || accum->length >= DEF_NENTRY)
+ {
+ accum->entryallocator = palloc(sizeof(EntryAccumulator) * DEF_NENTRY);
+ accum->allocatedMemory += GetMemoryChunkSpace(accum->entryallocator);
+ accum->length = 0;
+ }
+
+ /* Allocate new RBNode from current chunk */
+ ea = accum->entryallocator + accum->length;
+ accum->length++;
+
+ return (RBNode *) ea;
+ }
+
void
ginInitBA(BuildAccumulator *accum)
{
accum->allocatedMemory = 0;
+ accum->length = 0;
accum->entryallocator = NULL;
! accum->tree = rb_create(sizeof(EntryAccumulator),
! cmpEntryAccumulator,
! ginCombineData,
! ginAllocEntryAccumulator,
! NULL, /* no freefunc needed */
! (void *) accum);
}
/*
***************
*** 104,158 ****
static void
ginInsertEntry(BuildAccumulator *accum, ItemPointer heapptr, OffsetNumber attnum, Datum entry)
{
! EntryAccumulator *key,
! *ea;
/*
! * Allocate memory by rather big chunk to decrease overhead, we don't keep
! * pointer to previously allocated chunks because they will free by
! * MemoryContextReset() call.
*/
! if (accum->entryallocator == NULL || accum->length >= DEF_NENTRY)
! {
! accum->entryallocator = palloc(sizeof(EntryAccumulator) * DEF_NENTRY);
! accum->allocatedMemory += GetMemoryChunkSpace(accum->entryallocator);
! accum->length = 0;
! }
! /* "Allocate" new key in chunk */
! key = accum->entryallocator + accum->length;
! accum->length++;
! key->attnum = attnum;
! key->value = entry;
! /* To prevent multiple palloc/pfree cycles, we reuse array */
! if (accum->tmpList == NULL)
! accum->tmpList =
! (ItemPointerData *) palloc(sizeof(ItemPointerData) * DEF_NPTR);
! key->list = accum->tmpList;
! key->list[0] = *heapptr;
!
! ea = rb_insert(accum->tree, key);
!
! if (ea == NULL)
{
/*
! * The key has been inserted, so continue initialization.
*/
! key->value = getDatumCopy(accum, attnum, entry);
! key->length = DEF_NPTR;
! key->number = 1;
! key->shouldSort = FALSE;
! accum->allocatedMemory += GetMemoryChunkSpace(key->list);
! accum->tmpList = NULL;
}
else
{
/*
! * The key has been appended, so "free" allocated key by decrementing
! * chunk's counter.
*/
- accum->length--;
}
}
--- 136,176 ----
static void
ginInsertEntry(BuildAccumulator *accum, ItemPointer heapptr, OffsetNumber attnum, Datum entry)
{
! EntryAccumulator key;
! EntryAccumulator *ea;
! bool isNew;
/*
! * For the moment, fill only the fields of key that will be looked at
! * by cmpEntryAccumulator or ginCombineData.
*/
! key.attnum = attnum;
! key.value = entry;
! /* temporarily set up single-entry itempointer list */
! key.list = heapptr;
! ea = (EntryAccumulator *) rb_insert(accum->tree, (RBNode *) &key, &isNew);
! if (isNew)
{
/*
! * Finish initializing new tree entry, including making permanent
! * copies of the datum and itempointer.
*/
! ea->value = getDatumCopy(accum, attnum, entry);
! ea->length = DEF_NPTR;
! ea->number = 1;
! ea->shouldSort = FALSE;
! ea->list =
! (ItemPointerData *) palloc(sizeof(ItemPointerData) * DEF_NPTR);
! ea->list[0] = *heapptr;
! accum->allocatedMemory += GetMemoryChunkSpace(ea->list);
}
else
{
/*
! * ginCombineData did everything needed.
*/
}
}
***************
*** 214,229 ****
return res;
}
ItemPointerData *
ginGetEntry(BuildAccumulator *accum, OffsetNumber *attnum, Datum *value, uint32 *n)
{
EntryAccumulator *entry;
ItemPointerData *list;
! if (accum->iterator == NULL)
! accum->iterator = rb_begin_iterate(accum->tree, LeftRightWalk);
!
! entry = rb_iterate(accum->iterator);
if (entry == NULL)
return NULL;
--- 232,251 ----
return res;
}
+ /* Prepare to read out the rbtree contents using ginGetEntry */
+ void
+ ginBeginBAScan(BuildAccumulator *accum)
+ {
+ rb_begin_iterate(accum->tree, LeftRightWalk);
+ }
+
ItemPointerData *
ginGetEntry(BuildAccumulator *accum, OffsetNumber *attnum, Datum *value, uint32 *n)
{
EntryAccumulator *entry;
ItemPointerData *list;
! entry = (EntryAccumulator *) rb_iterate(accum->tree);
if (entry == NULL)
return NULL;
Index: src/backend/access/gin/ginentrypage.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/access/gin/ginentrypage.c,v
retrieving revision 1.24
diff -c -r1.24 ginentrypage.c
*** src/backend/access/gin/ginentrypage.c 26 Feb 2010 02:00:33 -0000 1.24
--- src/backend/access/gin/ginentrypage.c 1 Aug 2010 01:44:08 -0000
***************
*** 615,621 ****
}
/*
! * return newly allocate rightmost tuple
*/
IndexTuple
ginPageGetLinkItup(Buffer buf)
--- 615,621 ----
}
/*
! * return newly allocated rightmost tuple
*/
IndexTuple
ginPageGetLinkItup(Buffer buf)
***************
*** 646,655 ****
--- 646,657 ----
itup = ginPageGetLinkItup(lbuf);
if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) ==
InvalidOffsetNumber)
elog(ERROR, "failed to add item to index root page");
+ pfree(itup);
itup = ginPageGetLinkItup(rbuf);
if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) ==
InvalidOffsetNumber)
elog(ERROR, "failed to add item to index root page");
+ pfree(itup);
}
void
Index: src/backend/access/gin/ginfast.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/access/gin/ginfast.c,v
retrieving revision 1.7
diff -c -r1.7 ginfast.c
*** src/backend/access/gin/ginfast.c 11 Feb 2010 14:29:50 -0000 1.7
--- src/backend/access/gin/ginfast.c 1 Aug 2010 01:44:08 -0000
***************
*** 786,791 ****
--- 786,792 ----
* significant amount of time - so, run it without locking pending
* list.
*/
+ ginBeginBAScan(&accum);
while ((list = ginGetEntry(&accum, &attnum, &entry, &nlist)) != NULL)
{
ginEntryInsert(index, ginstate, attnum, entry, list, nlist, FALSE);
***************
*** 820,825 ****
--- 821,827 ----
ginInitBA(&accum);
processPendingPage(&accum, &datums, page, maxoff + 1);
+ ginBeginBAScan(&accum);
while ((list = ginGetEntry(&accum, &attnum, &entry, &nlist)) != NULL)
ginEntryInsert(index, ginstate, attnum, entry, list, nlist, FALSE);
}
Index: src/backend/access/gin/gininsert.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/access/gin/gininsert.c,v
retrieving revision 1.26
diff -c -r1.26 gininsert.c
*** src/backend/access/gin/gininsert.c 11 Feb 2010 14:29:50 -0000 1.26
--- src/backend/access/gin/gininsert.c 1 Aug 2010 01:44:08 -0000
***************
*** 176,181 ****
--- 176,182 ----
gdi = prepareScanPostingTree(index, rootPostingTree, FALSE);
gdi->btree.isBuild = isBuild;
insertItemPointer(gdi, items, nitem);
+ pfree(gdi);
return;
}
***************
*** 254,259 ****
--- 255,261 ----
uint32 nlist;
OffsetNumber attnum;
+ ginBeginBAScan(&buildstate->accum);
while ((list = ginGetEntry(&buildstate->accum, &attnum, &entry, &nlist)) != NULL)
{
/* there could be many entries, so be willing to abort here */
***************
*** 360,365 ****
--- 362,368 ----
/* dump remaining entries to the index */
oldCtx = MemoryContextSwitchTo(buildstate.tmpCtx);
+ ginBeginBAScan(&buildstate.accum);
while ((list = ginGetEntry(&buildstate.accum, &attnum, &entry, &nlist)) != NULL)
{
/* there could be many entries, so be willing to abort here */
Index: src/backend/utils/misc/rbtree.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/utils/misc/rbtree.c,v
retrieving revision 1.3
diff -c -r1.3 rbtree.c
*** src/backend/utils/misc/rbtree.c 26 Feb 2010 02:01:14 -0000 1.3
--- src/backend/utils/misc/rbtree.c 1 Aug 2010 01:44:08 -0000
***************
*** 17,23 ****
* longest path from root to leaf is only about twice as long as the shortest,
* so lookups are guaranteed to run in O(lg n) time.
*
! * Copyright (c) 1996-2009, PostgreSQL Global Development Group
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/utils/misc/rbtree.c,v 1.3 2010/02/26 02:01:14 momjian Exp $
--- 17,23 ----
* longest path from root to leaf is only about twice as long as the shortest,
* so lookups are guaranteed to run in O(lg n) time.
*
! * Copyright (c) 2009-2010, PostgreSQL Global Development Group
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/utils/misc/rbtree.c,v 1.3 2010/02/26 02:01:14 momjian Exp $
***************
*** 28,39 ****
#include "utils/rbtree.h"
- /**********************************************************************
- * Declarations *
- **********************************************************************/
/*
! * Values for RBNode->iteratorState
*/
#define InitialState (0)
#define FirstStepDone (1)
--- 28,39 ----
#include "utils/rbtree.h"
/*
! * Values of RBNode.iteratorState
! *
! * Note that iteratorState has an undefined value except in nodes that are
! * currently being visited by an active iteration.
*/
#define InitialState (0)
#define FirstStepDone (1)
***************
*** 41,121 ****
#define ThirdStepDone (3)
/*
! * Colors of node
*/
#define RBBLACK (0)
#define RBRED (1)
! typedef struct RBNode
! {
! uint32 iteratorState:2,
! color: 1,
! unused:29;
! struct RBNode *left;
! struct RBNode *right;
! struct RBNode *parent;
! void *data;
! } RBNode;
!
struct RBTree
{
! RBNode *root;
rb_comparator comparator;
! rb_appendator appendator;
rb_freefunc freefunc;
void *arg;
};
- struct RBTreeIterator
- {
- RBNode *node;
- void *(*iterate) (RBTreeIterator *iterator);
- };
-
/*
* all leafs are sentinels, use customized NIL name to prevent
! * collision with sytem-wide NIL which is actually NULL
*/
! #define RBNIL &sentinel
! RBNode sentinel = {InitialState, RBBLACK, 0, RBNIL, RBNIL, NULL, NULL};
- /**********************************************************************
- * Create *
- **********************************************************************/
RBTree *
! rb_create(rb_comparator comparator, rb_appendator appendator,
! rb_freefunc freefunc, void *arg)
{
! RBTree *tree = palloc(sizeof(RBTree));
tree->root = RBNIL;
tree->comparator = comparator;
! tree->appendator = appendator;
tree->freefunc = freefunc;
-
tree->arg = arg;
return tree;
}
/**********************************************************************
* Search *
**********************************************************************/
! void *
! rb_find(RBTree *rb, void *data)
{
RBNode *node = rb->root;
- int cmp;
while (node != RBNIL)
{
! cmp = rb->comparator(data, node->data, rb->arg);
if (cmp == 0)
! return node->data;
else if (cmp < 0)
node = node->left;
else
--- 41,170 ----
#define ThirdStepDone (3)
/*
! * Colors of nodes (values of RBNode.color)
*/
#define RBBLACK (0)
#define RBRED (1)
! /*
! * RBTree control structure
! */
struct RBTree
{
! RBNode *root; /* root node, or RBNIL if tree is empty */
!
! /* Iteration state */
! RBNode *cur; /* current iteration node */
! RBNode *(*iterate) (RBTree *rb);
!
! /* Remaining fields are constant after rb_create */
!
! Size node_size; /* actual size of tree nodes */
! /* The caller-supplied manipulation functions */
rb_comparator comparator;
! rb_combiner combiner;
! rb_allocfunc allocfunc;
rb_freefunc freefunc;
+ /* Passthrough arg passed to all manipulation functions */
void *arg;
};
/*
* all leafs are sentinels, use customized NIL name to prevent
! * collision with system-wide constant NIL which is actually NULL
*/
! #define RBNIL (&sentinel)
! static RBNode sentinel = {InitialState, RBBLACK, RBNIL, RBNIL, NULL};
+ /*
+ * rb_create: create an empty RBTree
+ *
+ * Arguments are:
+ * node_size: actual size of tree nodes (> sizeof(RBNode))
+ * The manipulation functions:
+ * comparator: compare two RBNodes for less/equal/greater
+ * combiner: merge an existing tree entry with a new one
+ * allocfunc: allocate a new RBNode
+ * freefunc: free an old RBNode
+ * arg: passthrough pointer that will be passed to the manipulation functions
+ *
+ * Note that the combiner's righthand argument will be a "proposed" tree node,
+ * ie the input to rb_insert, in which the RBNode fields themselves aren't
+ * valid. Similarly, either input to the comparator may be a "proposed" node.
+ * This shouldn't matter since the functions aren't supposed to look at the
+ * RBNode fields, only the extra fields of the struct the RBNode is embedded
+ * in.
+ *
+ * The freefunc should just be pfree or equivalent; it should NOT attempt
+ * to free any subsidiary data, because the node passed to it may not contain
+ * valid data! freefunc can be NULL if caller doesn't require retail
+ * space reclamation.
+ *
+ * The RBTree node is palloc'd in the caller's memory context. Note that
+ * all contents of the tree are actually allocated by the caller, not here.
+ *
+ * Since tree contents are managed by the caller, there is currently not
+ * an explicit "destroy" operation; typically a tree would be freed by
+ * resetting or deleting the memory context it's stored in. You can pfree
+ * the RBTree node if you feel the urge.
+ */
RBTree *
! rb_create(Size node_size,
! rb_comparator comparator,
! rb_combiner combiner,
! rb_allocfunc allocfunc,
! rb_freefunc freefunc,
! void *arg)
{
! RBTree *tree = (RBTree *) palloc(sizeof(RBTree));
!
! Assert(node_size > sizeof(RBNode));
tree->root = RBNIL;
+ tree->cur = RBNIL;
+ tree->iterate = NULL;
+ tree->node_size = node_size;
tree->comparator = comparator;
! tree->combiner = combiner;
! tree->allocfunc = allocfunc;
tree->freefunc = freefunc;
tree->arg = arg;
return tree;
}
+ /* Copy the additional data fields from one RBNode to another */
+ static inline void
+ rb_copy_data(RBTree *rb, RBNode *dest, const RBNode *src)
+ {
+ memcpy(dest + 1, src + 1, rb->node_size - sizeof(RBNode));
+ }
+
/**********************************************************************
* Search *
**********************************************************************/
! /*
! * rb_find: search for a value in an RBTree
! *
! * data represents the value to try to find. Its RBNode fields need not
! * be valid, it's the extra data in the larger struct that is of interest.
! *
! * Returns the matching tree entry, or NULL if no match is found.
! */
! RBNode *
! rb_find(RBTree *rb, const RBNode *data)
{
RBNode *node = rb->root;
while (node != RBNIL)
{
! int cmp = rb->comparator(data, node, rb->arg);
if (cmp == 0)
! return node;
else if (cmp < 0)
node = node->left;
else
***************
*** 125,130 ****
--- 174,205 ----
return NULL;
}
+ /*
+ * rb_leftmost: fetch the leftmost (smallest-valued) tree node.
+ * Returns NULL if tree is empty.
+ *
+ * Note: in the original implementation this included an unlink step, but
+ * that's a bit awkward. Just call rb_delete on the result if that's what
+ * you want.
+ */
+ RBNode *
+ rb_leftmost(RBTree *rb)
+ {
+ RBNode *node = rb->root;
+ RBNode *leftmost = rb->root;
+
+ while (node != RBNIL)
+ {
+ leftmost = node;
+ node = node->left;
+ }
+
+ if (leftmost != RBNIL)
+ return leftmost;
+
+ return NULL;
+ }
+
/**********************************************************************
* Insertion *
**********************************************************************/
***************
*** 309,321 ****
}
/*
! * Allocate node for data and insert in tree.
*
! * Return old data (or result of appendator method) if it exists and NULL
! * otherwise.
*/
! void *
! rb_insert(RBTree *rb, void *data)
{
RBNode *current,
*parent,
--- 384,407 ----
}
/*
! * rb_insert: insert a new value into the tree.
*
! * data represents the value to insert. Its RBNode fields need not
! * be valid, it's the extra data in the larger struct that is of interest.
! *
! * If the value represented by "data" is not present in the tree, then
! * we copy "data" into a new tree entry and return that node, setting *isNew
! * to true.
! *
! * If the value represented by "data" is already present, then we call the
! * combiner function to merge data into the existing node, and return the
! * existing node, setting *isNew to false.
! *
! * "data" is unmodified in either case; it's typically just a local
! * variable in the caller.
*/
! RBNode *
! rb_insert(RBTree *rb, const RBNode *data, bool *isNew)
{
RBNode *current,
*parent,
***************
*** 325,367 ****
/* find where node belongs */
current = rb->root;
parent = NULL;
! cmp = 0;
while (current != RBNIL)
{
! cmp = rb->comparator(data, current->data, rb->arg);
if (cmp == 0)
{
/*
! * Found node with given key. If appendator method is provided,
! * call it to join old and new data; else, new data replaces old
! * data.
*/
! if (rb->appendator)
! {
! current->data = rb->appendator(current->data, data, rb->arg);
! return current->data;
! }
! else
! {
! void *old = current->data;
!
! current->data = data;
! return old;
! }
}
parent = current;
current = (cmp < 0) ? current->left : current->right;
}
! /* setup new node in tree */
! x = palloc(sizeof(RBNode));
! x->data = data;
! x->parent = parent;
! x->left = RBNIL;
! x->right = RBNIL;
! x->color = RBRED;
x->iteratorState = InitialState;
/* insert node in tree */
if (parent)
--- 411,447 ----
/* find where node belongs */
current = rb->root;
parent = NULL;
! cmp = 0; /* just to prevent compiler warning */
!
while (current != RBNIL)
{
! cmp = rb->comparator(data, current, rb->arg);
if (cmp == 0)
{
/*
! * Found node with given key. Apply combiner.
*/
! rb->combiner(current, data, rb->arg);
! *isNew = false;
! return current;
}
parent = current;
current = (cmp < 0) ? current->left : current->right;
}
! /*
! * Value is not present, so create a new node containing data.
! */
! *isNew = true;
!
! x = rb->allocfunc(rb->arg);
x->iteratorState = InitialState;
+ x->color = RBRED;
+ x->left = RBNIL;
+ x->right = RBNIL;
+ x->parent = parent;
+ rb_copy_data(rb, x, data);
/* insert node in tree */
if (parent)
***************
*** 377,383 ****
}
rb_insert_fixup(rb, x);
! return NULL;
}
/**********************************************************************
--- 457,464 ----
}
rb_insert_fixup(rb, x);
!
! return x;
}
/**********************************************************************
***************
*** 533,543 ****
}
/*
! * If we removed the tree successor of z rather than z itself, then attach
* the data for the removed node to the one we were supposed to remove.
*/
if (y != z)
! z->data = y->data;
/*
* Removing a black node might make some paths from root to leaf contain
--- 614,624 ----
}
/*
! * If we removed the tree successor of z rather than z itself, then move
* the data for the removed node to the one we were supposed to remove.
*/
if (y != z)
! rb_copy_data(rb, z, y);
/*
* Removing a black node might make some paths from root to leaf contain
***************
*** 546,805 ****
if (y->color == RBBLACK)
rb_delete_fixup(rb, x);
! pfree(y);
! }
!
! extern void
! rb_delete(RBTree *rb, void *data)
! {
! RBNode *node = rb->root;
! int cmp;
!
! while (node != RBNIL)
! {
! cmp = rb->comparator(data, node->data, rb->arg);
!
! if (cmp == 0)
! {
! /* found node to delete */
! if (rb->freefunc)
! rb->freefunc (node->data);
!
! node->data = NULL;
! rb_delete_node(rb, node);
! return;
! }
! else if (cmp < 0)
! node = node->left;
! else
! node = node->right;
! }
}
/*
! * Return data on left most node and delete
! * that node
*/
! extern void *
! rb_leftmost(RBTree *rb)
{
! RBNode *node = rb->root;
! RBNode *leftmost = rb->root;
! void *res = NULL;
!
! while (node != RBNIL)
! {
! leftmost = node;
! node = node->left;
! }
!
! if (leftmost != RBNIL)
! {
! res = leftmost->data;
! leftmost->data = NULL;
! rb_delete_node(rb, leftmost);
! }
!
! return res;
}
/**********************************************************************
* Traverse *
**********************************************************************/
! static void *
! rb_next_node(RBTreeIterator *iterator, RBNode *node)
! {
! node->iteratorState = InitialState;
! iterator->node = node;
! return iterator->iterate(iterator);
! }
! static void *
! rb_left_right_iterator(RBTreeIterator *iterator)
{
! RBNode *node = iterator->node;
switch (node->iteratorState)
{
case InitialState:
if (node->left != RBNIL)
{
node->iteratorState = FirstStepDone;
! return rb_next_node(iterator, node->left);
}
case FirstStepDone:
node->iteratorState = SecondStepDone;
! return node->data;
case SecondStepDone:
if (node->right != RBNIL)
{
node->iteratorState = ThirdStepDone;
! return rb_next_node(iterator, node->right);
}
case ThirdStepDone:
if (node->parent)
! {
! iterator->node = node->parent;
! return iterator->iterate(iterator);
! }
break;
default:
! elog(ERROR, "Unknow node state: %d", node->iteratorState);
}
return NULL;
}
! static void *
! rb_right_left_iterator(RBTreeIterator *iterator)
{
! RBNode *node = iterator->node;
switch (node->iteratorState)
{
case InitialState:
if (node->right != RBNIL)
{
node->iteratorState = FirstStepDone;
! return rb_next_node(iterator, node->right);
}
case FirstStepDone:
node->iteratorState = SecondStepDone;
! return node->data;
case SecondStepDone:
if (node->left != RBNIL)
{
node->iteratorState = ThirdStepDone;
! return rb_next_node(iterator, node->left);
}
case ThirdStepDone:
if (node->parent)
! {
! iterator->node = node->parent;
! return iterator->iterate(iterator);
! }
break;
default:
! elog(ERROR, "Unknow node state: %d", node->iteratorState);
}
return NULL;
}
! static void *
! rb_direct_iterator(RBTreeIterator *iterator)
{
! RBNode *node = iterator->node;
switch (node->iteratorState)
{
case InitialState:
node->iteratorState = FirstStepDone;
! return node->data;
case FirstStepDone:
if (node->left != RBNIL)
{
node->iteratorState = SecondStepDone;
! return rb_next_node(iterator, node->left);
}
case SecondStepDone:
if (node->right != RBNIL)
{
node->iteratorState = ThirdStepDone;
! return rb_next_node(iterator, node->right);
}
case ThirdStepDone:
if (node->parent)
! {
! iterator->node = node->parent;
! return iterator->iterate(iterator);
! }
break;
default:
! elog(ERROR, "Unknow node state: %d", node->iteratorState);
}
return NULL;
}
! static void *
! rb_inverted_iterator(RBTreeIterator *iterator)
{
! RBNode *node = iterator->node;
switch (node->iteratorState)
{
case InitialState:
if (node->left != RBNIL)
{
node->iteratorState = FirstStepDone;
! return rb_next_node(iterator, node->left);
}
case FirstStepDone:
if (node->right != RBNIL)
{
node->iteratorState = SecondStepDone;
! return rb_next_node(iterator, node->right);
}
case SecondStepDone:
node->iteratorState = ThirdStepDone;
! return node->data;
case ThirdStepDone:
if (node->parent)
! {
! iterator->node = node->parent;
! return iterator->iterate(iterator);
! }
break;
default:
! elog(ERROR, "Unknow node state: %d", node->iteratorState);
}
return NULL;
}
! RBTreeIterator *
rb_begin_iterate(RBTree *rb, RBOrderControl ctrl)
{
! RBTreeIterator *iterator = palloc(sizeof(RBTreeIterator));
!
! iterator->node = rb->root;
! if (iterator->node != RBNIL)
! iterator->node->iteratorState = InitialState;
switch (ctrl)
{
case LeftRightWalk: /* visit left, then self, then right */
! iterator->iterate = rb_left_right_iterator;
break;
case RightLeftWalk: /* visit right, then self, then left */
! iterator->iterate = rb_right_left_iterator;
break;
case DirectWalk: /* visit self, then left, then right */
! iterator->iterate = rb_direct_iterator;
break;
case InvertedWalk: /* visit left, then right, then self */
! iterator->iterate = rb_inverted_iterator;
break;
default:
! elog(ERROR, "Unknown iterator order: %d", ctrl);
}
-
- return iterator;
}
! void *
! rb_iterate(RBTreeIterator *iterator)
{
! if (iterator->node == RBNIL)
return NULL;
! return iterator->iterate(iterator);
! }
!
! void
! rb_free_iterator(RBTreeIterator *iterator)
! {
! pfree(iterator);
}
--- 627,871 ----
if (y->color == RBBLACK)
rb_delete_fixup(rb, x);
! /* Now we can recycle the y node */
! if (rb->freefunc)
! rb->freefunc(y, rb->arg);
}
/*
! * rb_delete: remove the given tree entry
! *
! * "node" must have previously been found via rb_find or rb_leftmost.
! * It is caller's responsibility to free any subsidiary data attached
! * to the node before calling rb_delete. (Do *not* try to push that
! * responsibility off to the freefunc, as some other physical node
! * may be the one actually freed!)
*/
! void
! rb_delete(RBTree *rb, RBNode *node)
{
! rb_delete_node(rb, node);
}
/**********************************************************************
* Traverse *
**********************************************************************/
! /*
! * The iterator routines were originally coded in tail-recursion style,
! * which is nice to look at, but is trouble if your compiler isn't smart
! * enough to optimize it. Now we just use looping.
! */
! #define descend(next_node) \
! do { \
! (next_node)->iteratorState = InitialState; \
! node = rb->cur = (next_node); \
! goto restart; \
! } while (0)
!
! #define ascend(next_node) \
! do { \
! node = rb->cur = (next_node); \
! goto restart; \
! } while (0)
!
! static RBNode *
! rb_left_right_iterator(RBTree *rb)
{
! RBNode *node = rb->cur;
+ restart:
switch (node->iteratorState)
{
case InitialState:
if (node->left != RBNIL)
{
node->iteratorState = FirstStepDone;
! descend(node->left);
}
+ /* FALL THROUGH */
case FirstStepDone:
node->iteratorState = SecondStepDone;
! return node;
case SecondStepDone:
if (node->right != RBNIL)
{
node->iteratorState = ThirdStepDone;
! descend(node->right);
}
+ /* FALL THROUGH */
case ThirdStepDone:
if (node->parent)
! ascend(node->parent);
break;
default:
! elog(ERROR, "unrecognized rbtree node state: %d",
! node->iteratorState);
}
return NULL;
}
! static RBNode *
! rb_right_left_iterator(RBTree *rb)
{
! RBNode *node = rb->cur;
+ restart:
switch (node->iteratorState)
{
case InitialState:
if (node->right != RBNIL)
{
node->iteratorState = FirstStepDone;
! descend(node->right);
}
+ /* FALL THROUGH */
case FirstStepDone:
node->iteratorState = SecondStepDone;
! return node;
case SecondStepDone:
if (node->left != RBNIL)
{
node->iteratorState = ThirdStepDone;
! descend(node->left);
}
+ /* FALL THROUGH */
case ThirdStepDone:
if (node->parent)
! ascend(node->parent);
break;
default:
! elog(ERROR, "unrecognized rbtree node state: %d",
! node->iteratorState);
}
return NULL;
}
! static RBNode *
! rb_direct_iterator(RBTree *rb)
{
! RBNode *node = rb->cur;
+ restart:
switch (node->iteratorState)
{
case InitialState:
node->iteratorState = FirstStepDone;
! return node;
case FirstStepDone:
if (node->left != RBNIL)
{
node->iteratorState = SecondStepDone;
! descend(node->left);
}
+ /* FALL THROUGH */
case SecondStepDone:
if (node->right != RBNIL)
{
node->iteratorState = ThirdStepDone;
! descend(node->right);
}
+ /* FALL THROUGH */
case ThirdStepDone:
if (node->parent)
! ascend(node->parent);
break;
default:
! elog(ERROR, "unrecognized rbtree node state: %d",
! node->iteratorState);
}
return NULL;
}
! static RBNode *
! rb_inverted_iterator(RBTree *rb)
{
! RBNode *node = rb->cur;
+ restart:
switch (node->iteratorState)
{
case InitialState:
if (node->left != RBNIL)
{
node->iteratorState = FirstStepDone;
! descend(node->left);
}
+ /* FALL THROUGH */
case FirstStepDone:
if (node->right != RBNIL)
{
node->iteratorState = SecondStepDone;
! descend(node->right);
}
+ /* FALL THROUGH */
case SecondStepDone:
node->iteratorState = ThirdStepDone;
! return node;
case ThirdStepDone:
if (node->parent)
! ascend(node->parent);
break;
default:
! elog(ERROR, "unrecognized rbtree node state: %d",
! node->iteratorState);
}
return NULL;
}
! /*
! * rb_begin_iterate: prepare to traverse the tree in any of several orders
! *
! * After calling rb_begin_iterate, call rb_iterate repeatedly until it
! * returns NULL or the traversal stops being of interest.
! *
! * If the tree is changed during traversal, results of further calls to
! * rb_iterate are unspecified.
! *
! * Note: this used to return a separately palloc'd iterator control struct,
! * but that's a bit pointless since the data structure is incapable of
! * supporting multiple concurrent traversals. Now we just keep the state
! * in RBTree.
! */
! void
rb_begin_iterate(RBTree *rb, RBOrderControl ctrl)
{
! rb->cur = rb->root;
! if (rb->cur != RBNIL)
! rb->cur->iteratorState = InitialState;
switch (ctrl)
{
case LeftRightWalk: /* visit left, then self, then right */
! rb->iterate = rb_left_right_iterator;
break;
case RightLeftWalk: /* visit right, then self, then left */
! rb->iterate = rb_right_left_iterator;
break;
case DirectWalk: /* visit self, then left, then right */
! rb->iterate = rb_direct_iterator;
break;
case InvertedWalk: /* visit left, then right, then self */
! rb->iterate = rb_inverted_iterator;
break;
default:
! elog(ERROR, "unrecognized rbtree iteration order: %d", ctrl);
}
}
! /*
! * rb_iterate: return the next node in traversal order, or NULL if no more
! */
! RBNode *
! rb_iterate(RBTree *rb)
{
! if (rb->cur == RBNIL)
return NULL;
! return rb->iterate(rb);
}
Index: src/include/access/gin.h
===================================================================
RCS file: /cvsroot/pgsql/src/include/access/gin.h,v
retrieving revision 1.39
diff -c -r1.39 gin.h
*** src/include/access/gin.h 31 Jul 2010 00:30:54 -0000 1.39
--- src/include/access/gin.h 1 Aug 2010 01:44:08 -0000
***************
*** 565,570 ****
--- 565,571 ----
/* ginbulk.c */
typedef struct EntryAccumulator
{
+ RBNode rbnode;
Datum value;
uint32 length;
uint32 number;
***************
*** 579,593 ****
long allocatedMemory;
uint32 length;
EntryAccumulator *entryallocator;
- ItemPointerData *tmpList;
RBTree *tree;
- RBTreeIterator *iterator;
} BuildAccumulator;
extern void ginInitBA(BuildAccumulator *accum);
extern void ginInsertRecordBA(BuildAccumulator *accum,
ItemPointer heapptr,
OffsetNumber attnum, Datum *entries, int32 nentry);
extern ItemPointerData *ginGetEntry(BuildAccumulator *accum, OffsetNumber *attnum, Datum *entry, uint32 *n);
/* ginfast.c */
--- 580,593 ----
long allocatedMemory;
uint32 length;
EntryAccumulator *entryallocator;
RBTree *tree;
} BuildAccumulator;
extern void ginInitBA(BuildAccumulator *accum);
extern void ginInsertRecordBA(BuildAccumulator *accum,
ItemPointer heapptr,
OffsetNumber attnum, Datum *entries, int32 nentry);
+ extern void ginBeginBAScan(BuildAccumulator *accum);
extern ItemPointerData *ginGetEntry(BuildAccumulator *accum, OffsetNumber *attnum, Datum *entry, uint32 *n);
/* ginfast.c */
Index: src/include/utils/rbtree.h
===================================================================
RCS file: /cvsroot/pgsql/src/include/utils/rbtree.h,v
retrieving revision 1.3
diff -c -r1.3 rbtree.h
*** src/include/utils/rbtree.h 11 May 2010 18:14:01 -0000 1.3
--- src/include/utils/rbtree.h 1 Aug 2010 01:44:08 -0000
***************
*** 3,46 ****
* rbtree.h
* interface for PostgreSQL generic Red-Black binary tree package
*
! * Copyright (c) 1996-2009, PostgreSQL Global Development Group
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/include/utils/rbtree.h,v 1.3 2010/05/11 18:14:01 rhaas Exp $
*
*-------------------------------------------------------------------------
*/
-
#ifndef RBTREE_H
#define RBTREE_H
typedef struct RBTree RBTree;
- typedef struct RBTreeIterator RBTreeIterator;
! typedef int (*rb_comparator) (const void *a, const void *b, void *arg);
! typedef void *(*rb_appendator) (void *currentdata, void *newval, void *arg);
! typedef void (*rb_freefunc) (void *a);
! extern RBTree *rb_create(rb_comparator comparator,
! rb_appendator appendator,
rb_freefunc freefunc,
void *arg);
! extern void *rb_find(RBTree *rb, void *data);
! extern void *rb_insert(RBTree *rb, void *data);
! extern void rb_delete(RBTree *rb, void *data);
! extern void *rb_leftmost(RBTree *rb);
! typedef enum RBOrderControl
! {
! LeftRightWalk,
! RightLeftWalk,
! DirectWalk,
! InvertedWalk
! } RBOrderControl;
! extern RBTreeIterator *rb_begin_iterate(RBTree *rb, RBOrderControl ctrl);
! extern void *rb_iterate(RBTreeIterator *iterator);
! extern void rb_free_iterator(RBTreeIterator *iterator);
! #endif
--- 3,66 ----
* rbtree.h
* interface for PostgreSQL generic Red-Black binary tree package
*
! * Copyright (c) 2009-2010, PostgreSQL Global Development Group
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/include/utils/rbtree.h,v 1.3 2010/05/11 18:14:01 rhaas Exp $
*
*-------------------------------------------------------------------------
*/
#ifndef RBTREE_H
#define RBTREE_H
+ /*
+ * RBNode is intended to be used as the first field of a larger struct,
+ * whose additional fields carry whatever payload data the caller needs
+ * for a tree entry. (The total size of that larger struct is passed to
+ * rb_create.) RBNode is declared here to support this usage, but
+ * callers must treat it as an opaque struct.
+ */
+ typedef struct RBNode
+ {
+ char iteratorState; /* workspace for iterating through tree */
+ char color; /* node's current color, red or black */
+ struct RBNode *left; /* left child, or RBNIL if none */
+ struct RBNode *right; /* right child, or RBNIL if none */
+ struct RBNode *parent; /* parent, or NULL (not RBNIL!) if none */
+ } RBNode;
+
+ /* Opaque struct representing a whole tree */
typedef struct RBTree RBTree;
! /* Available tree iteration orderings */
! typedef enum RBOrderControl
! {
! LeftRightWalk, /* inorder: left child, node, right child */
! RightLeftWalk, /* reverse inorder: right, node, left */
! DirectWalk, /* preorder: node, left child, right child */
! InvertedWalk /* postorder: left child, right child, node */
! } RBOrderControl;
! /* Support functions to be provided by caller */
! typedef int (*rb_comparator) (const RBNode *a, const RBNode *b, void *arg);
! typedef void (*rb_combiner) (RBNode *existing, const RBNode *newdata, void *arg);
! typedef RBNode *(*rb_allocfunc) (void *arg);
! typedef void (*rb_freefunc) (RBNode *x, void *arg);
!
! extern RBTree *rb_create(Size node_size,
! rb_comparator comparator,
! rb_combiner combiner,
! rb_allocfunc allocfunc,
rb_freefunc freefunc,
void *arg);
! extern RBNode *rb_find(RBTree *rb, const RBNode *data);
! extern RBNode *rb_leftmost(RBTree *rb);
! extern RBNode *rb_insert(RBTree *rb, const RBNode *data, bool *isNew);
! extern void rb_delete(RBTree *rb, RBNode *node);
! extern void rb_begin_iterate(RBTree *rb, RBOrderControl ctrl);
! extern RBNode *rb_iterate(RBTree *rb);
! #endif /* RBTREE_H */