*** a/doc/src/sgml/gist.sgml
--- b/doc/src/sgml/gist.sgml
***************
*** 642,647 **** my_distance(PG_FUNCTION_ARGS)
--- 642,676 ----
+
+ GiST buffering build
+
+ Building a large GiST index by simply inserting all the tuples tends to be
+ slow if the index does not fit in cache, because when the index tuples are
+ scattered across the index, a large fraction of the insertions need to
+ perform I/O. The exception is well-ordered datasets, where the part of the
+ index that new insertions go to stays well cached.
+ Since version 9.2, PostgreSQL supports a more efficient method to build
+ GiST indexes based on buffering, which can dramatically reduce the number
+ of random I/Os needed.
+
+
+
+ However, a buffering index build needs to call the
+ <function>penalty</function> function more often, which consumes some extra
+ CPU resources. Also, it can influence the quality of the produced index, in
+ both positive and negative directions. That influence depends on various
+ factors, like the distribution of the input data and the operator class
+ implementation.
+
+
+
+ By default, the index build switches to the buffering method when the
+ index size reaches <xref linkend="guc-effective-cache-size">. Buffering can
+ be manually turned on or off with the <literal>BUFFERING</literal> parameter
+ to the <command>CREATE INDEX</command> command.
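+
+ For example, a buffering build can be forced regardless of the index size
+ (the table and column names here are purely illustrative):
+
+ <programlisting>
+ CREATE INDEX points_idx ON points USING gist (p) WITH (BUFFERING = ON);
+ </programlisting>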
+
+
+
*** a/doc/src/sgml/ref/create_index.sgml
--- b/doc/src/sgml/ref/create_index.sgml
***************
*** 341,346 **** CREATE [ UNIQUE ] INDEX [ CONCURRENTLY ] [ name
--- 341,366 ----
+
+ GiST indexes additionally accept this parameter:
+
+
+
+
+
+ <literal>BUFFERING</literal>
+
+
+ Determines whether the buffering build technique described in
+ <xref linkend="gist-buffering-build"> is used to build the index. With
+ <literal>OFF</literal> it is disabled, with <literal>ON</literal> it is
+ enabled, and with <literal>AUTO</literal> (the default) it is initially
+ disabled, but turned on on-the-fly once the index size reaches
+ <xref linkend="guc-effective-cache-size">.
+
+
+
+
+
*** a/src/backend/access/common/reloptions.c
--- b/src/backend/access/common/reloptions.c
***************
*** 30,35 ****
--- 30,38 ----
#include "utils/memutils.h"
#include "utils/rel.h"
+
+ static void validateBufferingOption(char *value);
+
/*
* Contents of pg_class.reloptions
*
***************
*** 219,224 **** static relopt_real realRelOpts[] =
--- 222,238 ----
static relopt_string stringRelOpts[] =
{
+ {
+ {
+ "buffering",
+ "Enables buffering build for this GiST index",
+ RELOPT_KIND_GIST
+ },
+ 4,
+ false,
+ validateBufferingOption,
+ "auto"
+ },
/* list terminator */
{{NULL}}
};
***************
*** 1267,1269 **** tablespace_reloptions(Datum reloptions, bool validate)
--- 1281,1304 ----
return (bytea *) tsopts;
}
+
+ /*
+ * Validator for the "buffering" option of GiST indexes. Allows only "on",
+ * "off" and "auto" values.
+ * "auto" values.
+ */
+ static void
+ validateBufferingOption(char *value)
+ {
+ 	if (!value ||
+ 		(strcmp(value, "on") != 0 &&
+ 		 strcmp(value, "off") != 0 &&
+ 		 strcmp(value, "auto") != 0))
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ 				 errmsg("invalid value for \"buffering\" option"),
+ 				 errdetail("Valid values are \"on\", \"off\", and \"auto\".")));
+ }
+ }
*** a/src/backend/access/gist/Makefile
--- b/src/backend/access/gist/Makefile
***************
*** 13,18 **** top_builddir = ../../../..
include $(top_builddir)/src/Makefile.global
OBJS = gist.o gistutil.o gistxlog.o gistvacuum.o gistget.o gistscan.o \
! gistproc.o gistsplit.o
include $(top_srcdir)/src/backend/common.mk
--- 13,18 ----
include $(top_builddir)/src/Makefile.global
OBJS = gist.o gistutil.o gistxlog.o gistvacuum.o gistget.o gistscan.o \
! gistproc.o gistsplit.o gistbuild.o gistbuildbuffers.o
include $(top_srcdir)/src/backend/common.mk
*** a/src/backend/access/gist/README
--- b/src/backend/access/gist/README
***************
*** 24,29 **** The current implementation of GiST supports:
--- 24,30 ----
* provides NULL-safe interface to GiST core
* Concurrency
* Recovery support via WAL logging
+ * Buffering build algorithm
The support for concurrency implemented in PostgreSQL was developed based on
the paper "Access Methods for Next-Generation Database Systems" by
***************
*** 31,36 **** Marcel Kornaker:
--- 32,43 ----
http://www.sai.msu.su/~megera/postgres/gist/papers/concurrency/access-methods-for-next-generation.pdf.gz
+ The buffering build algorithm for GiST was developed based on the paper "Efficient
+ Bulk Operations on Dynamic R-trees" by Lars Arge, Klaus Hinrichs, Jan Vahrenhold
+ and Jeffrey Scott Vitter.
+
+ http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.135.9894&rep=rep1&type=pdf
+
The original algorithms were modified in several ways:
* They had to be adapted to PostgreSQL conventions. For example, the SEARCH
***************
*** 278,283 **** would complicate the insertion algorithm. So when an insertion sees a page
--- 285,421 ----
with F_FOLLOW_RIGHT set, it immediately tries to bring the split that
crashed in the middle to completion by adding the downlink in the parent.
+ Buffering build algorithm
+ -------------------------
+
+ In the buffering index build algorithm, some or all internal nodes have a
+ buffer attached to them. When a tuple is inserted at the top, the descent down
+ the tree is stopped as soon as a buffer is reached, and the tuple is pushed to
+ the buffer. When a buffer gets too full, all the tuples in it are flushed to
+ the lower level, where they again hit lower level buffers or leaf pages. This
+ makes the insertions happen in more of a breadth-first than depth-first order,
+ which greatly reduces the amount of random I/O required.
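+
+ In pseudo-C, the per-tuple logic is roughly as follows (a sketch only: the
+ helper names are illustrative, the real implementation is gistProcessItup()
+ in gistbuild.c):
+
+     /* Descend from the root until a buffer or a leaf page is reached */
+     while (!is_leaf(page) && !has_buffer(page))
+         page = choose_subtree(page, itup);    /* penalty-based choice */
+
+     if (has_buffer(page))
+     {
+         push_to_buffer(page, itup);           /* cheap, buffered insertion */
+         if (buffer_half_full(page))
+             schedule_emptying(page);          /* emptied before next tuple */
+     }
+     else
+         insert_to_leaf(page, itup);           /* ordinary GiST insertion */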
+
+ In the algorithm, levels are numbered so that leaf pages have level zero,
+ and internal node levels count up from 1. This numbering ensures that a page's
+ level number never changes, even when the root page is split.
+
+ Level Tree
+
+ 3                    *
+                   /     \
+ 2             *             *
+            /  |  \       /  |  \
+ 1         *   *   *     *   *   *
+          / \ / \ / \   / \ / \ / \
+ 0        o o o o o o   o o o o o o
+
+ * - internal page
+ o - leaf page
+
+ Internal pages that belong to certain levels have buffers associated with
+ them. Leaf pages never have buffers. Which levels have buffers is controlled
+ by "level step" parameter: level numbers that are multiples of level_step
+ have buffers, while others do not. For example, if level_step = 2, then
+ pages on levels 2, 4, 6, ... have buffers. If level_step = 1 then every
+ internal page has a buffer.
+
+ Level       Tree (level_step = 1)              Tree (level_step = 2)
+
+ 3                  *(b)                                  *
+                 /       \                            /       \
+ 2           *(b)         *(b)                    *(b)         *(b)
+           /  |  \       /  |  \                /  |  \       /  |  \
+ 1      *(b) *(b) *(b) *(b) *(b) *(b)          *   *   *     *   *   *
+        / \  / \  / \  / \  / \  / \          / \ / \ / \   / \ / \ / \
+ 0      o o  o o  o o  o o  o o  o o          o o o o o o   o o o o o o
+
+ (b) - buffer
+
+ Logically, a buffer is just a bunch of tuples. Physically, it is divided into
+ pages, backed by a temporary file. Each buffer can be in one of two states:
+ a) Last page of the buffer is kept in main memory. A node buffer is
+ automatically switched to this state when a new index tuple is added to it,
+ or a tuple is removed from it.
+ b) All pages of the buffer are swapped out to disk. When a buffer becomes too
+ full, and we start to flush it, all other buffers are switched to this state.
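+
+ In the code, these two states are distinguished by whether the buffer's last
+ page is kept palloc'd in memory or stored in a block of the temporary file;
+ the transitions between the states are implemented by gistLoadNodeBuffer()
+ and gistUnloadNodeBuffer() in gistbuildbuffers.c.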
+
+ When an index tuple is inserted, its initial processing can end in one of the
+ following points:
+ 1) Leaf page, if the depth of the index is less than level_step, meaning that
+ none of the internal pages have buffers associated with them.
+ 2) Buffer of root page if root page has a buffer.
+ 3) Buffer of topmost level page that has buffers, if root page doesn't have a
+ buffer.
+
+ New index tuples are processed until the root buffer (or a buffer in the
+ topmost buffered level) becomes half-full. When a buffer becomes half-full,
+ it's added to the emptying queue, and will be emptied before a new tuple is
+ processed.
+
+ The buffer emptying process moves index tuples from the buffer
+ into buffers at a lower level, or leaf pages. First, all the other buffers are
+ swapped to disk to free up the memory. Then tuples are popped from the buffer
+ one by one, and cascaded down the tree to the next buffer or leaf page below
+ the buffered node.
+
+ Emptying a buffer has the interesting dynamic property that any intermediate
+ pages between the buffer being emptied, and the next buffered or leaf level
+ below it, become cached. If there are no more buffers below the node, the leaf
+ pages where the tuples finally land get cached too. If there are, the last
+ buffer page of each buffer below is kept in memory. This is illustrated in
+ the figures below:
+
+       Buffer being emptied to
+         lower-level buffers           Buffer being emptied to leaf pages
+
+                +(fb)                                 +(fb)
+               /     \                               /     \
+              +       +                             +       +
+             / \     / \                           / \     / \
+         *(ab) *(ab) *(ab) *(ab)                  x   x   x   x
+
+ + - cached internal page
+ x - cached leaf page
+ * - non-cached internal page
+ (fb) - buffer being emptied
+ (ab) - buffers being appended to, with last page in memory
+
+ At the beginning of the index build, the level step is chosen so that all the
+ pages involved in emptying one buffer fit in cache, so after each of those
+ pages has been accessed once and cached, emptying a buffer doesn't involve
+ any more I/O. This locality is where the speedup of the buffering algorithm
+ comes from.
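+
+ As a rough illustration (the numbers are only indicative): with 8 kB pages
+ and index tuples of about 40 bytes, roughly 200 tuples fit on a page. With
+ level_step = 1, emptying one buffer touches on the order of a couple of
+ hundred pages, a few megabytes, which is easily cached. With level_step = 2
+ the sub-tree below a buffer can already contain about 200^2 = 40000 pages,
+ some 300 MB, which is why the level step is derived from
+ effective_cache_size and maintenance_work_mem at the start of the build.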
+
+ Emptying one buffer can fill up one or more of the lower-level buffers,
+ triggering emptying of them as well. Whenever a buffer becomes too full, it's
+ added to the emptying queue, and will be emptied after the current buffer has
+ been processed.
+
+ To keep the size of each buffer limited even in the worst case, buffer emptying
+ is scheduled as soon as a buffer becomes half-full, and emptying it continues
+ until 1/2 of the nominal buffer size worth of tuples has been emptied. This
+ guarantees that when buffer emptying begins, all the lower-level buffers
+ are at most half-full. In the worst case, where all the tuples are cascaded down
+ to the same lower-level buffer, that buffer therefore has enough space to
+ accommodate all the tuples emptied from the upper-level buffer. There is no
+ hard size limit in any of the data structures used, though, so this only needs
+ to be approximate; small overfilling of some buffers doesn't matter.
+
+ If an internal page that has a buffer associated with it is split, the buffer
+ needs to be split too. All tuples in the buffer are scanned through and
+ relocated to the correct sibling buffers, using the penalty function to decide
+ which buffer each tuple should go to.
+
+ After all tuples from the heap have been processed, there are still some index
+ tuples in the buffers. At this point, final buffer emptying starts. All buffers
+ are emptied in top-down order. This is slightly complicated by the fact that
+ new buffers can be allocated during the emptying, due to page splits. However,
+ the new buffers will always be siblings of buffers that haven't been fully
+ emptied yet; tuples never move upwards in the tree. The final emptying loops
+ through buffers at a given level until all buffers at that level have been
+ emptied, and then moves down to the next level.
+
Authors:
Teodor Sigaev
*** a/src/backend/access/gist/gist.c
--- b/src/backend/access/gist/gist.c
***************
*** 24,38 ****
#include "utils/memutils.h"
#include "utils/rel.h"
- /* Working state for gistbuild and its callback */
- typedef struct
- {
- GISTSTATE giststate;
- int numindexattrs;
- double indtuples;
- MemoryContext tmpCtx;
- } GISTBuildState;
-
/* A List of these is used represent a split-in-progress. */
typedef struct
{
--- 24,29 ----
***************
*** 41,56 **** typedef struct
} GISTPageSplitInfo;
/* non-export function prototypes */
- static void gistbuildCallback(Relation index,
- HeapTuple htup,
- Datum *values,
- bool *isnull,
- bool tupleIsAlive,
- void *state);
- static void gistdoinsert(Relation r,
- IndexTuple itup,
- Size freespace,
- GISTSTATE *GISTstate);
static void gistfixsplit(GISTInsertState *state, GISTSTATE *giststate);
static bool gistinserttuples(GISTInsertState *state, GISTInsertStack *stack,
GISTSTATE *giststate,
--- 32,37 ----
***************
*** 89,226 **** createTempGistContext(void)
}
/*
- * Routine to build an index. Basically calls insert over and over.
- *
- * XXX: it would be nice to implement some sort of bulk-loading
- * algorithm, but it is not clear how to do that.
- */
- Datum
- gistbuild(PG_FUNCTION_ARGS)
- {
- Relation heap = (Relation) PG_GETARG_POINTER(0);
- Relation index = (Relation) PG_GETARG_POINTER(1);
- IndexInfo *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2);
- IndexBuildResult *result;
- double reltuples;
- GISTBuildState buildstate;
- Buffer buffer;
- Page page;
-
- /*
- * We expect to be called exactly once for any index relation. If that's
- * not the case, big trouble's what we have.
- */
- if (RelationGetNumberOfBlocks(index) != 0)
- elog(ERROR, "index \"%s\" already contains data",
- RelationGetRelationName(index));
-
- /* no locking is needed */
- initGISTstate(&buildstate.giststate, index);
-
- /* initialize the root page */
- buffer = gistNewBuffer(index);
- Assert(BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO);
- page = BufferGetPage(buffer);
-
- START_CRIT_SECTION();
-
- GISTInitBuffer(buffer, F_LEAF);
-
- MarkBufferDirty(buffer);
-
- if (RelationNeedsWAL(index))
- {
- XLogRecPtr recptr;
- XLogRecData rdata;
-
- rdata.data = (char *) &(index->rd_node);
- rdata.len = sizeof(RelFileNode);
- rdata.buffer = InvalidBuffer;
- rdata.next = NULL;
-
- recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_CREATE_INDEX, &rdata);
- PageSetLSN(page, recptr);
- PageSetTLI(page, ThisTimeLineID);
- }
- else
- PageSetLSN(page, GetXLogRecPtrForTemp());
-
- UnlockReleaseBuffer(buffer);
-
- END_CRIT_SECTION();
-
- /* build the index */
- buildstate.numindexattrs = indexInfo->ii_NumIndexAttrs;
- buildstate.indtuples = 0;
-
- /*
- * create a temporary memory context that is reset once for each tuple
- * inserted into the index
- */
- buildstate.tmpCtx = createTempGistContext();
-
- /* do the heap scan */
- reltuples = IndexBuildHeapScan(heap, index, indexInfo, true,
- gistbuildCallback, (void *) &buildstate);
-
- /* okay, all heap tuples are indexed */
- MemoryContextDelete(buildstate.tmpCtx);
-
- freeGISTstate(&buildstate.giststate);
-
- /*
- * Return statistics
- */
- result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
-
- result->heap_tuples = reltuples;
- result->index_tuples = buildstate.indtuples;
-
- PG_RETURN_POINTER(result);
- }
-
- /*
- * Per-tuple callback from IndexBuildHeapScan
- */
- static void
- gistbuildCallback(Relation index,
- HeapTuple htup,
- Datum *values,
- bool *isnull,
- bool tupleIsAlive,
- void *state)
- {
- GISTBuildState *buildstate = (GISTBuildState *) state;
- IndexTuple itup;
- MemoryContext oldCtx;
-
- oldCtx = MemoryContextSwitchTo(buildstate->tmpCtx);
-
- /* form an index tuple and point it at the heap tuple */
- itup = gistFormTuple(&buildstate->giststate, index,
- values, isnull, true /* size is currently bogus */ );
- itup->t_tid = htup->t_self;
-
- /*
- * Since we already have the index relation locked, we call gistdoinsert
- * directly. Normal access method calls dispatch through gistinsert,
- * which locks the relation for write. This is the right thing to do if
- * you're inserting single tups, but not when you're initializing the
- * whole index at once.
- *
- * In this path we respect the fillfactor setting, whereas insertions
- * after initial build do not.
- */
- gistdoinsert(index, itup,
- RelationGetTargetPageFreeSpace(index, GIST_DEFAULT_FILLFACTOR),
- &buildstate->giststate);
-
- buildstate->indtuples += 1;
- MemoryContextSwitchTo(oldCtx);
- MemoryContextReset(buildstate->tmpCtx);
- }
-
- /*
* gistbuildempty() -- build an empty gist index in the initialization fork
*/
Datum
--- 70,75 ----
***************
*** 275,281 **** gistinsert(PG_FUNCTION_ARGS)
PG_RETURN_BOOL(false);
}
-
/*
* Place tuples from 'itup' to 'buffer'. If 'oldoffnum' is valid, the tuple
* at that offset is atomically removed along with inserting the new tuples.
--- 124,129 ----
***************
*** 293,311 **** gistinsert(PG_FUNCTION_ARGS)
* In that case, we continue to hold the root page locked, and the child
* pages are released; note that new tuple(s) are *not* on the root page
* but in one of the new child pages.
*/
! static bool
gistplacetopage(GISTInsertState *state, GISTSTATE *giststate,
Buffer buffer,
IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
Buffer leftchildbuf,
! List **splitinfo)
{
Page page = BufferGetPage(buffer);
bool is_leaf = (GistPageIsLeaf(page)) ? true : false;
XLogRecPtr recptr;
int i;
bool is_split;
/*
* Refuse to modify a page that's incompletely split. This should not
--- 141,167 ----
* In that case, we continue to hold the root page locked, and the child
* pages are released; note that new tuple(s) are *not* on the root page
* but in one of the new child pages.
+ *
+  * This function also has some special behavior during a buffering build. It
+  * takes care of maintaining the data structures of the buffering build: it
+  * creates a new root path item if needed and relocates the buffer of a split
+  * node. It also doesn't return splitinfo to the caller, but instead performs
+  * the simplified downlink insertion itself, by a recursive call.
*/
! bool
gistplacetopage(GISTInsertState *state, GISTSTATE *giststate,
Buffer buffer,
IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
Buffer leftchildbuf,
! List **splitinfo,
! GISTBufferingInsertStack * path)
{
Page page = BufferGetPage(buffer);
bool is_leaf = (GistPageIsLeaf(page)) ? true : false;
XLogRecPtr recptr;
int i;
bool is_split;
+ GISTBuildBuffers *gfbb = giststate->gfbb;
/*
* Refuse to modify a page that's incompletely split. This should not
***************
*** 319,325 **** gistplacetopage(GISTInsertState *state, GISTSTATE *giststate,
if (GistFollowRight(page))
elog(ERROR, "concurrent GiST page split was incomplete");
! *splitinfo = NIL;
/*
* if isupdate, remove old key: This node's key has been modified, either
--- 175,188 ----
if (GistFollowRight(page))
elog(ERROR, "concurrent GiST page split was incomplete");
! if (!gfbb)
! {
! /*
! 		 * We don't need to return splitinfo during a buffering build.
! 		 * Otherwise, initialize splitinfo as an empty list.
! */
! *splitinfo = NIL;
! }
/*
* if isupdate, remove old key: This node's key has been modified, either
***************
*** 408,413 **** gistplacetopage(GISTInsertState *state, GISTSTATE *giststate,
--- 271,320 ----
GistTupleSetValid(ptr->itup);
}
+ /* Are we inside a buffering build? */
+ if (gfbb)
+ {
+ /*
+ 		 * Some more actions for a buffering build: take care of the root
+ 		 * path item on a root split, find the correct parent with the
+ 		 * simplified search, and relocate the buffers of split pages.
+ */
+ if (is_rootsplit)
+ {
+ /*
+ * Adjust the top element in the insert stacks for the new
+ * root page.
+ */
+ GISTBufferingInsertStack *oldroot = gfbb->rootitem;
+
+ gfbb->rootitem = (GISTBufferingInsertStack *) MemoryContextAlloc(
+ gfbb->context, sizeof(GISTBufferingInsertStack));
+ gfbb->rootitem->parent = NULL;
+ gfbb->rootitem->blkno = GIST_ROOT_BLKNO;
+ gfbb->rootitem->downlinkoffnum = InvalidOffsetNumber;
+ gfbb->rootitem->level = oldroot->level + 1;
+ gfbb->rootitem->refCount = 1;
+
+ oldroot->parent = gfbb->rootitem;
+ oldroot->blkno = dist->block.blkno;
+ oldroot->downlinkoffnum = InvalidOffsetNumber;
+ }
+
+ /*
+ 		 * The parent may have changed since we set it, so adjust it.
+ */
+ if (!is_rootsplit)
+ gistBufferingFindCorrectParent(giststate, state->r, path);
+
+ /*
+ 		 * Relocate index tuples from the buffer of the split page to the
+ 		 * buffers of the pages produced by the split.
+ */
+ gistRelocateBuildBuffersOnSplit(giststate->gfbb, giststate, state->r,
+ path, buffer, dist);
+ }
+
/*
* If this is a root split, we construct the new root page with the
* downlinks here directly, instead of requiring the caller to insert
***************
*** 439,447 **** gistplacetopage(GISTInsertState *state, GISTSTATE *giststate,
rootpg.next = dist;
dist = &rootpg;
}
! else
{
! /* Prepare split-info to be returned to caller */
for (ptr = dist; ptr; ptr = ptr->next)
{
GISTPageSplitInfo *si = palloc(sizeof(GISTPageSplitInfo));
--- 346,357 ----
rootpg.next = dist;
dist = &rootpg;
}
! else if (!gfbb)
{
! /*
! 		 * If we're not in a buffering build, prepare split-info to be
! 		 * returned to the caller.
! */
for (ptr = dist; ptr; ptr = ptr->next)
{
GISTPageSplitInfo *si = palloc(sizeof(GISTPageSplitInfo));
***************
*** 474,480 **** gistplacetopage(GISTInsertState *state, GISTSTATE *giststate,
else
GistPageGetOpaque(ptr->page)->rightlink = oldrlink;
! if (ptr->next && !is_rootsplit)
GistMarkFollowRight(ptr->page);
else
GistClearFollowRight(ptr->page);
--- 384,390 ----
else
GistPageGetOpaque(ptr->page)->rightlink = oldrlink;
! if (ptr->next && !is_rootsplit && !gfbb)
GistMarkFollowRight(ptr->page);
else
GistClearFollowRight(ptr->page);
***************
*** 508,514 **** gistplacetopage(GISTInsertState *state, GISTSTATE *giststate,
/* Write the WAL record */
if (RelationNeedsWAL(state->r))
recptr = gistXLogSplit(state->r->rd_node, blkno, is_leaf,
! dist, oldrlink, oldnsn, leftchildbuf);
else
recptr = GetXLogRecPtrForTemp();
--- 418,425 ----
/* Write the WAL record */
if (RelationNeedsWAL(state->r))
recptr = gistXLogSplit(state->r->rd_node, blkno, is_leaf,
! dist, oldrlink, oldnsn, leftchildbuf,
! gfbb ? true : false);
else
recptr = GetXLogRecPtrForTemp();
***************
*** 524,535 **** gistplacetopage(GISTInsertState *state, GISTSTATE *giststate,
* If this was a root split, we've already inserted the downlink
* pointers, in the form of a new root page. Therefore we can release
* all the new buffers, and keep just the root page locked.
*/
! if (is_rootsplit)
{
for (ptr = dist->next; ptr; ptr = ptr->next)
UnlockReleaseBuffer(ptr->buffer);
}
}
else
{
--- 435,485 ----
* If this was a root split, we've already inserted the downlink
* pointers, in the form of a new root page. Therefore we can release
* all the new buffers, and keep just the root page locked.
+ *
+ 		 * In a buffering build there is no concurrent activity, so we can
+ 		 * use the simplified downlink insertion. In that case we can also
+ 		 * release all the new buffers.
*/
! if (is_rootsplit || gfbb)
{
for (ptr = dist->next; ptr; ptr = ptr->next)
UnlockReleaseBuffer(ptr->buffer);
}
+
+ if (gfbb && !is_rootsplit)
+ {
+ /*
+ 			 * Simplified insertion of downlinks in a buffering build.
+ */
+ IndexTuple *itups;
+ int cnt = 0,
+ i;
+ Buffer parentBuffer;
+
+ /* Count number of downlinks for insert. */
+ for (ptr = dist; ptr; ptr = ptr->next)
+ {
+ cnt++;
+ }
+
+ /* Allocate array of downlinks index tuples */
+ itups = (IndexTuple *) palloc(sizeof(IndexTuple) * cnt);
+
+ /* Fill that array */
+ i = 0;
+ for (ptr = dist; ptr; ptr = ptr->next)
+ {
+ itups[i] = ptr->itup;
+ i++;
+ }
+
+ 		/* Insert downlinks into the parent. */
+ parentBuffer = ReadBuffer(state->r, path->parent->blkno);
+ LockBuffer(parentBuffer, GIST_EXCLUSIVE);
+ gistplacetopage(state, giststate, parentBuffer,
+ itups, cnt, path->downlinkoffnum, InvalidBuffer, NULL, path->parent);
+ UnlockReleaseBuffer(parentBuffer);
+ }
}
else
{
***************
*** 570,577 **** gistplacetopage(GISTInsertState *state, GISTSTATE *giststate,
recptr = GetXLogRecPtrForTemp();
PageSetLSN(page, recptr);
}
-
- *splitinfo = NIL;
}
/*
--- 520,525 ----
***************
*** 608,614 **** gistplacetopage(GISTInsertState *state, GISTSTATE *giststate,
* this routine assumes it is invoked in a short-lived memory context,
* so it does not bother releasing palloc'd allocations.
*/
! static void
gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate)
{
ItemId iid;
--- 556,562 ----
* this routine assumes it is invoked in a short-lived memory context,
* so it does not bother releasing palloc'd allocations.
*/
! void
gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate)
{
ItemId iid;
***************
*** 917,924 **** gistFindPath(Relation r, BlockNumber child, OffsetNumber *downlinkoffnum)
{
/*
* Page was split while we looked elsewhere. We didn't see the
! * downlink to the right page when we scanned the parent, so
! * add it to the queue now.
*
* Put the right page ahead of the queue, so that we visit it
* next. That's important, because if this is the lowest internal
--- 865,872 ----
{
/*
* Page was split while we looked elsewhere. We didn't see the
! * downlink to the right page when we scanned the parent, so add
! * it to the queue now.
*
* Put the right page ahead of the queue, so that we visit it
* next. That's important, because if this is the lowest internal
***************
*** 965,971 **** gistFindPath(Relation r, BlockNumber child, OffsetNumber *downlinkoffnum)
elog(ERROR, "failed to re-find parent of a page in index \"%s\", block %u",
RelationGetRelationName(r), child);
! return NULL; /* keep compiler quiet */
}
/*
--- 913,919 ----
elog(ERROR, "failed to re-find parent of a page in index \"%s\", block %u",
RelationGetRelationName(r), child);
! return NULL; /* keep compiler quiet */
}
/*
***************
*** 1195,1201 **** gistinserttuples(GISTInsertState *state, GISTInsertStack *stack,
is_split = gistplacetopage(state, giststate, stack->buffer,
tuples, ntup, oldoffnum,
leftchild,
! &splitinfo);
if (splitinfo)
gistfinishsplit(state, stack, giststate, splitinfo);
--- 1143,1149 ----
is_split = gistplacetopage(state, giststate, stack->buffer,
tuples, ntup, oldoffnum,
leftchild,
! &splitinfo, NULL);
if (splitinfo)
gistfinishsplit(state, stack, giststate, splitinfo);
***************
*** 1414,1419 **** initGISTstate(GISTSTATE *giststate, Relation index)
--- 1362,1368 ----
else
giststate->supportCollation[i] = DEFAULT_COLLATION_OID;
}
+ giststate->gfbb = NULL;
}
void
*** /dev/null
--- b/src/backend/access/gist/gistbuild.c
***************
*** 0 ****
--- 1,872 ----
+ /*-------------------------------------------------------------------------
+ *
+ * gistbuild.c
+ * build algorithm for GiST indexes implementation.
+ *
+ *
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/gist/gistbuild.c
+ *
+ *-------------------------------------------------------------------------
+ */
+ #include "postgres.h"
+
+ #include "access/genam.h"
+ #include "access/gist_private.h"
+ #include "catalog/index.h"
+ #include "catalog/pg_collation.h"
+ #include "miscadmin.h"
+ #include "optimizer/cost.h"
+ #include "storage/bufmgr.h"
+ #include "storage/indexfsm.h"
+ #include "storage/smgr.h"
+ #include "utils/memutils.h"
+ #include "utils/rel.h"
+
+ /* Interval, in index tuples, between checks whether to switch to buffering build mode */
+ #define BUFFERING_MODE_SWITCH_CHECK_STEP 256
+ #define BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET 4096
+
+ /* Working state for gistbuild and its callback */
+ typedef struct
+ {
+ GISTSTATE giststate;
+ int numindexattrs;
+ int64 indtuples;
+ int64 indtuplesSize;
+
+ 	/*
+ 	 * Buffering build mode. Possible values:
+ 	 *   'y' - we are in buffering build mode
+ 	 *   's' - we are gathering statistics of index tuple size before
+ 	 *         switching to buffering build mode
+ 	 *   'a' - we are in regular build mode, but can switch to buffering
+ 	 *         build mode when we decide to
+ 	 *   'n' - we are in regular build mode and aren't going to switch
+ 	 */
+ char bufferingMode;
+ MemoryContext tmpCtx;
+ } GISTBuildState;
+
+ static void gistFreeUnreferencedPath(GISTBufferingInsertStack * path);
+ static bool gistProcessItup(GISTSTATE *giststate, GISTInsertState *state,
+ GISTBuildBuffers * gfbb, IndexTuple itup,
+ GISTBufferingInsertStack * startparent);
+ static void gistProcessEmptyingStack(GISTSTATE *giststate, GISTInsertState *state);
+ static void gistBufferingBuildInsert(Relation index, IndexTuple itup,
+ GISTBuildState *buildstate);
+ static void gistBuildCallback(Relation index,
+ HeapTuple htup,
+ Datum *values,
+ bool *isnull,
+ bool tupleIsAlive,
+ void *state);
+ static int gistGetMaxLevel(Relation index);
+ static bool gistInitBuffering(GISTBuildState *buildstate, Relation index);
+
+ /*
+  * Free unreferenced parts of the path.
+ */
+ static void
+ gistFreeUnreferencedPath(GISTBufferingInsertStack * path)
+ {
+ while (path->refCount == 0)
+ {
+ /*
+ 		 * This path item is unreferenced. We can free it and decrease the
+ 		 * reference count of its parent. If the parent becomes unreferenced
+ 		 * too, the procedure is repeated for it.
+ */
+ GISTBufferingInsertStack *tmp = path->parent;
+
+ pfree(path);
+ path = tmp;
+ if (path)
+ path->refCount--;
+ else
+ break;
+ }
+ }
+
+ /*
+ * Decrease reference count of path part and remove unreferenced path parts if
+ * any.
+ */
+ void
+ gistDecreasePathRefcount(GISTBufferingInsertStack * path)
+ {
+ path->refCount--;
+ gistFreeUnreferencedPath(path);
+ }
+
+ /*
+  * Process an index tuple. Run the index tuple down the tree until it meets
+  * a leaf page or a node buffer. If it meets a node buffer, the tuple is just
+  * placed in it. If it meets a leaf page, the actual insert takes place.
+  * Returns true if we have to stop the buffer emptying process (one of the
+  * child buffers can't take index tuples anymore).
+ */
+ static bool
+ gistProcessItup(GISTSTATE *giststate, GISTInsertState *state,
+ GISTBuildBuffers * gfbb, IndexTuple itup,
+ GISTBufferingInsertStack * startparent)
+ {
+ GISTBufferingInsertStack *path;
+ BlockNumber childblkno;
+ Buffer buffer;
+ bool result = false;
+
+ /*
+ * NULL passed in startparent means that we start index tuple processing
+ * from the root.
+ */
+ if (!startparent)
+ path = gfbb->rootitem;
+ else
+ path = startparent;
+
+ /*
+ 	 * Loop until we reach a leaf page (level == 0) or a level with buffers
+ 	 * (unless it's the level we started at, because we must move down at
+ 	 * least one level).
+ */
+ for (;;)
+ {
+ ItemId iid;
+ IndexTuple idxtuple,
+ newtup;
+ Page page;
+ OffsetNumber childoffnum;
+ GISTBufferingInsertStack *parent;
+
+ /*
+ 		 * Have we reached a level with buffers? The buffer of the page we
+ 		 * started from doesn't count.
+ */
+ if (path != startparent && LEVEL_HAS_BUFFERS(path->level, gfbb))
+ break;
+
+ 		/* Have we reached a leaf page? */
+ if (path->level == 0)
+ break;
+
+ /* Choose child for insertion */
+ buffer = ReadBuffer(state->r, path->blkno);
+ LockBuffer(buffer, GIST_EXCLUSIVE);
+
+ page = (Page) BufferGetPage(buffer);
+ childoffnum = gistchoose(state->r, page, itup, giststate);
+ iid = PageGetItemId(page, childoffnum);
+ idxtuple = (IndexTuple) PageGetItem(page, iid);
+ childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
+
+ /* Adjust key representing child if needed */
+ newtup = gistgetadjusted(state->r, idxtuple, itup, giststate);
+
+ if (newtup)
+ {
+ /*
+ 			 * The key adjustment actually produced a new key, so we need to
+ 			 * update it on the page.
+ */
+ gistplacetopage(state, giststate, buffer, &newtup, 1, childoffnum,
+ InvalidBuffer, NULL, path);
+ }
+ UnlockReleaseBuffer(buffer);
+
+ /* Create new path item representing current page */
+ parent = path;
+ path = (GISTBufferingInsertStack *) MemoryContextAlloc(gfbb->context,
+ sizeof(GISTBufferingInsertStack));
+ path->parent = parent;
+ path->level = parent->level - 1;
+ path->blkno = childblkno;
+ path->downlinkoffnum = childoffnum;
+
+ /* It's unreferenced just now */
+ path->refCount = 0;
+
+ /* Adjust reference count of parent */
+ if (parent)
+ parent->refCount++;
+ }
+
+ if (LEVEL_HAS_BUFFERS(path->level, gfbb))
+ {
+ /*
+ 		 * We've reached a level with buffers. Place the index tuple in the
+ 		 * buffer, and add the buffer to the emptying stack if it overflows.
+ */
+ NodeBuffer *childNodeBuffer;
+
+ /* Find node buffer or create a new one */
+ childNodeBuffer = gistGetNodeBuffer(gfbb, giststate, path->blkno,
+ path->downlinkoffnum, path->parent,
+ true);
+
+ /* Add index tuple to it */
+ gistPushItupToNodeBuffer(gfbb, childNodeBuffer, itup);
+
+ if (BUFFER_HALF_FILLED(childNodeBuffer, gfbb) && !childNodeBuffer->plannedForEmptying)
+ {
+ /*
+ 			 * The node buffer just became half-full, so add it to the
+ 			 * emptying stack. The emptying stack must persist, which is why
+ 			 * we switch to the persistent memory context.
+ */
+ MemoryContext oldcxt = MemoryContextSwitchTo(gfbb->context);
+
+ childNodeBuffer->plannedForEmptying = true;
+ gfbb->bufferEmptyingStack = lcons(childNodeBuffer,
+ gfbb->bufferEmptyingStack);
+ MemoryContextSwitchTo(oldcxt);
+ }
+
+ if (BUFFER_OVERFLOWED(childNodeBuffer, gfbb))
+ result = true;
+ }
+ else
+ {
+ /*
+ 		 * We've reached the leaf level, so place the index tuple here.
+ */
+ buffer = ReadBuffer(state->r, path->blkno);
+ LockBuffer(buffer, GIST_EXCLUSIVE);
+ gistplacetopage(state, giststate, buffer, &itup, 1,
+ InvalidOffsetNumber, InvalidBuffer, NULL, path);
+ UnlockReleaseBuffer(buffer);
+ }
+
+ /*
+ * Free unreferenced path items if any. Path item may be referenced by
+ * node buffer.
+ */
+ gistFreeUnreferencedPath(path);
+
+ return result;
+ }
+
+
+ /*
+  * Find the correct parent by following rightlinks during a buffering index
+  * build. This method of searching for the parent works because there is no
+  * concurrent activity while the index is being built.
+ */
+ void
+ gistBufferingFindCorrectParent(GISTSTATE *giststate, Relation r, GISTBufferingInsertStack * child)
+ {
+ GISTBuildBuffers *gfbb = giststate->gfbb;
+ GISTBufferingInsertStack *parent = child->parent;
+ OffsetNumber i,
+ maxoff;
+ ItemId iid;
+ IndexTuple idxtuple;
+ Buffer buffer;
+ Page page;
+ bool copied = false;
+
+ buffer = ReadBuffer(r, parent->blkno);
+ page = BufferGetPage(buffer);
+ LockBuffer(buffer, GIST_EXCLUSIVE);
+ gistcheckpage(r, buffer);
+
+ 	/* Check whether the downlink is still where we remembered it */
+ if (child->downlinkoffnum != InvalidOffsetNumber)
+ {
+ iid = PageGetItemId(page, child->downlinkoffnum);
+ idxtuple = (IndexTuple) PageGetItem(page, iid);
+ if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == child->blkno)
+ {
+ /* Still there */
+ UnlockReleaseBuffer(buffer);
+ return;
+ }
+ }
+
+ 	/* The parent has changed; follow the rightlinks until the child is found */
+ while (true)
+ {
+ /* Search for relevant downlink in the current page */
+ maxoff = PageGetMaxOffsetNumber(page);
+ for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
+ {
+ iid = PageGetItemId(page, i);
+ idxtuple = (IndexTuple) PageGetItem(page, iid);
+ if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == child->blkno)
+ {
+ 				/* found it! */
+ child->downlinkoffnum = i;
+ UnlockReleaseBuffer(buffer);
+ return;
+ }
+ }
+
+ /*
+ 		 * We must copy the parent path item, because other path items may
+ 		 * still refer to it.
+ */
+ if (!copied)
+ {
+ parent = (GISTBufferingInsertStack *) MemoryContextAlloc(gfbb->context,
+ sizeof(GISTBufferingInsertStack));
+ memcpy(parent, child->parent, sizeof(GISTBufferingInsertStack));
+ if (parent->parent)
+ parent->parent->refCount++;
+ gistDecreasePathRefcount(child->parent);
+ child->parent = parent;
+ parent->refCount = 1;
+ copied = true;
+ }
+
+ /*
+ 		 * Not found on the current page. Move right along the rightlink.
+ */
+ parent->blkno = GistPageGetOpaque(page)->rightlink;
+ UnlockReleaseBuffer(buffer);
+
+ if (parent->blkno == InvalidBlockNumber)
+ {
+ /*
+ 			 * Reached the end of the chain without finding the parent. This
+ 			 * should not happen during an index build.
+ */
+ break;
+ }
+
+ /* Get the next page */
+ buffer = ReadBuffer(r, parent->blkno);
+ page = BufferGetPage(buffer);
+ LockBuffer(buffer, GIST_EXCLUSIVE);
+ gistcheckpage(r, buffer);
+ }
+
+ elog(ERROR, "failed to re-find parent for block %u", child->blkno);
+ }
+
+ /*
+  * Process the buffer emptying stack. Emptying one buffer can trigger
+  * emptying of other buffers. This function iterates until the cascading
+  * emptying process has finished, i.e. until the buffer emptying stack is
+  * empty.
+ */
+ static void
+ gistProcessEmptyingStack(GISTSTATE *giststate, GISTInsertState *state)
+ {
+ GISTBuildBuffers *gfbb = giststate->gfbb;
+
+ 	/* Iterate while there are elements on the buffer emptying stack. */
+ while (gfbb->bufferEmptyingStack != NIL)
+ {
+ NodeBuffer *emptyingNodeBuffer;
+
+ /* Get node buffer from emptying stack. */
+ emptyingNodeBuffer = (NodeBuffer *) linitial(gfbb->bufferEmptyingStack);
+ gfbb->bufferEmptyingStack = list_delete_first(gfbb->bufferEmptyingStack);
+ emptyingNodeBuffer->plannedForEmptying = false;
+
+ /*
+ 		 * We are going to load the last pages of the buffers that we will
+ 		 * empty into, so unload any previously loaded buffers first.
+ */
+ gistUnloadNodeBuffers(gfbb);
+
+ 		/* Variables for detecting a split of the buffer being emptied. */
+ gfbb->currentEmptyingBufferSplit = false;
+ gfbb->currentEmptyingBufferBlockNumber = emptyingNodeBuffer->nodeBlocknum;
+
+ while (true)
+ {
+ IndexTuple itup;
+
+ 			/* Get the next index tuple from the node buffer */
+ if (!gistPopItupFromNodeBuffer(gfbb, emptyingNodeBuffer, &itup))
+ break;
+
+ /* Run it down to the underlying node buffer or leaf page */
+ if (gistProcessItup(giststate, state, gfbb, itup, emptyingNodeBuffer->path))
+ break;
+
+ /* Free all the memory allocated during index tuple processing */
+ MemoryContextReset(CurrentMemoryContext);
+
+ /*
+ 			 * If the node buffer being emptied was split, we must stop
+ 			 * emptying it, simply because that node buffer no longer exists.
+ */
+ if (gfbb->currentEmptyingBufferSplit)
+ break;
+ }
+ }
+ }
+
+ /*
+ * Insert function for buffering index build.
+ */
+ static void
+ gistBufferingBuildInsert(Relation index, IndexTuple itup,
+ GISTBuildState *buildstate)
+ {
+ GISTBuildBuffers *gfbb = buildstate->giststate.gfbb;
+ GISTInsertState insertstate;
+
+ memset(&insertstate, 0, sizeof(GISTInsertState));
+ insertstate.freespace = RelationGetTargetPageFreeSpace(index,
+ GIST_DEFAULT_FILLFACTOR);
+ insertstate.r = index;
+
+ /* We are ready for index tuple processing */
+ gistProcessItup(&buildstate->giststate, &insertstate, gfbb, itup, NULL);
+
+ /* Process buffer emptying stack if any */
+ gistProcessEmptyingStack(&buildstate->giststate, &insertstate);
+ }
+
+ /*
+ * Per-tuple callback from IndexBuildHeapScan.
+ */
+ static void
+ gistBuildCallback(Relation index,
+ HeapTuple htup,
+ Datum *values,
+ bool *isnull,
+ bool tupleIsAlive,
+ void *state)
+ {
+ GISTBuildState *buildstate = (GISTBuildState *) state;
+ IndexTuple itup;
+ MemoryContext oldCtx;
+
+ oldCtx = MemoryContextSwitchTo(buildstate->tmpCtx);
+
+ /* form an index tuple and point it at the heap tuple */
+ itup = gistFormTuple(&buildstate->giststate, index,
+ values, isnull, true /* size is currently bogus */ );
+ itup->t_tid = htup->t_self;
+
+ if (buildstate->bufferingMode == 'y')
+ {
+ /* We've decided to use buffering. So let's use buffering insert. */
+ gistBufferingBuildInsert(index, itup, buildstate);
+ }
+ else
+ {
+ /*
+ 		 * We haven't decided to use buffering yet, or aren't going to use it at
+ * all. Since we already have the index relation locked, we call
+ * gistdoinsert directly. Normal access method calls dispatch through
+ * gistinsert, which locks the relation for write. This is the right
+ * thing to do if you're inserting single tups, but not when you're
+ * initializing the whole index at once.
+ *
+ * In this path we respect the fillfactor setting, whereas insertions
+ * after initial build do not.
+ */
+ gistdoinsert(index, itup,
+ RelationGetTargetPageFreeSpace(index, GIST_DEFAULT_FILLFACTOR),
+ &buildstate->giststate);
+ }
+
+ 	/* Update the statistics: index tuple count and total tuple size. */
+ buildstate->indtuples += 1;
+ buildstate->indtuplesSize += IndexTupleSize(itup);
+
+ MemoryContextSwitchTo(oldCtx);
+ MemoryContextReset(buildstate->tmpCtx);
+
+ if (buildstate->bufferingMode == 'y' &&
+ 		buildstate->indtuples % BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET == 0)
+ 	{
+ 		/* Time to readjust the buffer size using the gathered statistics */
+
+ int pagesPerBuffer,
+ levelStep = buildstate->giststate.gfbb->levelStep;
+ int avgIndexTuplesPerPage,
+ itupAvgSize,
+ i;
+ Size pageFreeSpace;
+
+ 		/* Calculate the per-page space available for index tuples */
+ pageFreeSpace = BLCKSZ - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData)
+ - sizeof(ItemIdData)
+ - RelationGetTargetPageFreeSpace(index, GIST_DEFAULT_FILLFACTOR);
+
+ /*
+ * Calculate average size of already inserted index tuples using
+ * gathered statistics.
+ */
+ itupAvgSize = round((double) buildstate->indtuplesSize /
+ (double) buildstate->indtuples);
+
+ avgIndexTuplesPerPage = pageFreeSpace / itupAvgSize;
+
+ /* Recalculate required size of buffers. */
+ pagesPerBuffer = 2;
+ for (i = 0; i < levelStep; i++)
+ pagesPerBuffer *= avgIndexTuplesPerPage;
+
+ buildstate->giststate.gfbb->pagesPerBuffer = pagesPerBuffer;
+ }
+
+ /*
+ 	 * For automatic switching to buffering mode, check whether the index
+ 	 * still fits in the effective cache. We call smgrnblocks only every
+ 	 * BUFFERING_MODE_SWITCH_CHECK_STEP index tuples, because frequent
+ 	 * smgrnblocks calls can be expensive.
+ */
+ if ((buildstate->bufferingMode == 'a' &&
+ buildstate->indtuples % BUFFERING_MODE_SWITCH_CHECK_STEP == 0 &&
+ effective_cache_size < smgrnblocks(index->rd_smgr, MAIN_FORKNUM)) ||
+ (buildstate->bufferingMode == 's' &&
+ buildstate->indtuples >= BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET))
+ {
+ /*
+ 		 * The index no longer fits in the effective cache. Try to switch
+ 		 * to buffering build mode.
+ */
+ if (gistInitBuffering(buildstate, index))
+ {
+ /*
+ 			 * The buffering build was successfully initialized, so set the
+ 			 * flag accordingly.
+ */
+ buildstate->bufferingMode = 'y';
+ elog(INFO, "switching to buffered mode");
+ }
+ else
+ {
+ /*
+ 			 * Failed to switch to a buffering build because the memory
+ 			 * settings are too small. Remember not to try again.
+ */
+ buildstate->bufferingMode = 'n';
+ }
+ }
+ }
+
+ /*
+  * Get the maximum level number of a GiST index. Scans the tree from the root
+  * down to a leaf page, following the first downlink on each page.
+ */
+ static int
+ gistGetMaxLevel(Relation index)
+ {
+ int maxLevel = 0;
+ BlockNumber blkno = GIST_ROOT_BLKNO;
+
+ while (true)
+ {
+ Buffer buffer;
+ Page page;
+ IndexTuple itup;
+
+ /* Read page */
+ buffer = ReadBuffer(index, blkno);
+ page = (Page) BufferGetPage(buffer);
+
+ /* Is it a leaf page? */
+ if (GistPageIsLeaf(page))
+ {
+ 			/* It's a leaf, so we have counted the height of the tree. */
+ ReleaseBuffer(buffer);
+ break;
+ }
+
+ /*
+ 		 * The page is not a leaf. Descend to the underlying page using its
+ 		 * first downlink.
+ */
+ itup = (IndexTuple) PageGetItem(page,
+ PageGetItemId(page, FirstOffsetNumber));
+ blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
+ ReleaseBuffer(buffer);
+
+ /*
+ 		 * We're descending one level, which means the tree has at least one
+ 		 * more level.
+ */
+ maxLevel++;
+ }
+ return maxLevel;
+ }
+
+ /*
+ * Initial calculations for GiST buffering build.
+ */
+ static bool
+ gistInitBuffering(GISTBuildState *buildstate, Relation index)
+ {
+ int pagesPerBuffer = -1;
+ Size pageFreeSpace;
+ Size itupAvgSize,
+ itupMinSize;
+ int i,
+ avgIndexTuplesPerPage,
+ maxIndexTuplesPerPage;
+ int effectiveMemory;
+ int levelStep;
+ GISTBuildBuffers *gfbb;
+
+ 	/* Calculate the per-page space available for index tuples */
+ pageFreeSpace = BLCKSZ - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData)
+ - sizeof(ItemIdData)
+ - RelationGetTargetPageFreeSpace(index, GIST_DEFAULT_FILLFACTOR);
+
+ /*
+ * Calculate average size of already inserted index tuples using gathered
+ * statistics.
+ */
+ itupAvgSize = round((double) buildstate->indtuplesSize /
+ (double) buildstate->indtuples);
+
+ /*
+ 	 * Calculate the minimal possible index tuple size from the index
+ 	 * metadata. The minimal possible size of a varlena is VARHDRSZ.
+ */
+ itupMinSize = (Size) MAXALIGN(sizeof(IndexTupleData));
+ for (i = 0; i < index->rd_att->natts; i++)
+ {
+ if (index->rd_att->attrs[i]->attlen < 0)
+ itupMinSize += VARHDRSZ;
+ else
+ itupMinSize += index->rd_att->attrs[i]->attlen;
+ }
+
+ 	/* Calculate the average and maximal number of index tuples that fit on a page */
+ avgIndexTuplesPerPage = pageFreeSpace / itupAvgSize;
+ maxIndexTuplesPerPage = pageFreeSpace / itupMinSize;
+
+ 	/*
+ 	 * Calculate levelStep from the available amount of memory. We must be
+ 	 * able to load into main memory one page of each node buffer on the
+ 	 * next buffered level below (levelStep levels down). That gives a
+ 	 * constraint on maintenance_work_mem. We must also be able to keep a
+ 	 * subtree of levelStep levels in cache. That gives a constraint on
+ 	 * effective_cache_size.
+ 	 *
+ 	 * The i'th level below a subtree's root can consist of at most
+ 	 * maxIndexTuplesPerPage^i pages, so a subtree of levelStep levels can't
+ 	 * be larger than 2 * maxIndexTuplesPerPage^levelStep pages. We leave
+ 	 * some slack because we probably can't use the whole effective cache,
+ 	 * and use the formula 4 * maxIndexTuplesPerPage^levelStep =
+ 	 * effectiveCache. We apply similar logic to maintenance_work_mem: we
+ 	 * must be able to hold at least the last page of every buffer that we
+ 	 * are emptying the current buffer into.
+ 	 */
+ effectiveMemory = Min(maintenance_work_mem * 1024 / BLCKSZ,
+ effective_cache_size);
+ 	levelStep = (int) (log((double) effectiveMemory / 4.0) /
+ 					   log((double) maxIndexTuplesPerPage));
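+
+ 	/*
+ 	 * For illustration (numbers are indicative only): with BLCKSZ = 8192 and
+ 	 * a minimal tuple size of about 40 bytes, maxIndexTuplesPerPage is about
+ 	 * 200. With an effective cache of 16384 pages (128 MB) this gives
+ 	 * levelStep = (int) (log(16384 / 4) / log(200)) = 1, i.e. every internal
+ 	 * level gets buffers.
+ 	 */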
+
+ if (levelStep > 0)
+ {
+ /*
+ 		 * There is enough memory for at least levelStep == 1. Choose the
+ 		 * buffer size so that emptying half a buffer fills, on average, one
+ 		 * page per underlying buffer.
+ */
+ pagesPerBuffer = 2;
+ for (i = 0; i < levelStep; i++)
+ pagesPerBuffer *= avgIndexTuplesPerPage;
+
+ gfbb = palloc(sizeof(GISTBuildBuffers));
+ gfbb->pagesPerBuffer = pagesPerBuffer;
+ gfbb->levelStep = levelStep;
+ gistInitBuildBuffers(gfbb, buildstate->indtuples > 0 ? gistGetMaxLevel(index) : 0);
+ buildstate->giststate.gfbb = gfbb;
+ elog(INFO, "Level step = %d, pagesPerBuffer = %d",
+ levelStep, pagesPerBuffer);
+ return true;
+ }
+ else
+ {
+ /* Not enough memory for buffering build. */
+ return false;
+ }
+ }
+
+ /*
+ * Routine to build an index. Basically calls insert over and over.
+ *
+ * XXX: it would be nice to implement some sort of bulk-loading
+ * algorithm, but it is not clear how to do that.
+ */
+ Datum
+ gistbuild(PG_FUNCTION_ARGS)
+ {
+ Relation heap = (Relation) PG_GETARG_POINTER(0);
+ Relation index = (Relation) PG_GETARG_POINTER(1);
+ IndexInfo *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2);
+ IndexBuildResult *result;
+ double reltuples;
+ GISTBuildState buildstate;
+ Buffer buffer;
+ Page page;
+ MemoryContext oldcxt = CurrentMemoryContext;
+
+ if (index->rd_options)
+ {
+ /* Get buffering mode from the options string */
+ GiSTOptions *options = (GiSTOptions *) index->rd_options;
+ char *bufferingMode = (char *) options + options->bufferingModeOffset;
+
+ if (!strcmp(bufferingMode, "on"))
+ buildstate.bufferingMode = 's';
+ else if (!strcmp(bufferingMode, "off"))
+ buildstate.bufferingMode = 'n';
+ else
+ buildstate.bufferingMode = 'a';
+ }
+ else
+ {
+ /* Automatic buffering mode switching by default */
+ buildstate.bufferingMode = 'a';
+ }
+
+ /*
+ * We expect to be called exactly once for any index relation. If that's
+ * not the case, big trouble's what we have.
+ */
+ if (RelationGetNumberOfBlocks(index) != 0)
+ elog(ERROR, "index \"%s\" already contains data",
+ RelationGetRelationName(index));
+
+ /* no locking is needed */
+ initGISTstate(&buildstate.giststate, index);
+
+ /* initialize the root page */
+ buffer = gistNewBuffer(index);
+ Assert(BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO);
+ page = BufferGetPage(buffer);
+
+ START_CRIT_SECTION();
+
+ GISTInitBuffer(buffer, F_LEAF);
+
+ MarkBufferDirty(buffer);
+
+ if (RelationNeedsWAL(index))
+ {
+ XLogRecPtr recptr;
+ XLogRecData rdata;
+
+ rdata.data = (char *) &(index->rd_node);
+ rdata.len = sizeof(RelFileNode);
+ rdata.buffer = InvalidBuffer;
+ rdata.next = NULL;
+
+ recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_CREATE_INDEX, &rdata);
+ PageSetLSN(page, recptr);
+ PageSetTLI(page, ThisTimeLineID);
+ }
+ else
+ PageSetLSN(page, GetXLogRecPtrForTemp());
+
+ UnlockReleaseBuffer(buffer);
+
+ END_CRIT_SECTION();
+
+ /* build the index */
+ buildstate.numindexattrs = indexInfo->ii_NumIndexAttrs;
+ buildstate.indtuples = 0;
+ buildstate.indtuplesSize = 0;
+
+ /*
+ * create a temporary memory context that is reset once for each tuple
+ * inserted into the index
+ */
+ buildstate.tmpCtx = createTempGistContext();
+
+ /*
+ * Do the heap scan.
+ */
+ reltuples = IndexBuildHeapScan(heap, index, indexInfo, true,
+ gistBuildCallback, (void *) &buildstate);
+
+ /*
+ 	 * In a buffering build, do the final emptying of the node buffers.
+ */
+ if (buildstate.bufferingMode == 'y')
+ {
+ int i;
+ GISTInsertState insertstate;
+ NodeBuffer *nodeBuffer;
+ MemoryContext oldCtx;
+ GISTBuildBuffers *gfbb = buildstate.giststate.gfbb;
+
+ oldCtx = MemoryContextSwitchTo(buildstate.tmpCtx);
+
+ memset(&insertstate, 0, sizeof(GISTInsertState));
+ insertstate.freespace = RelationGetTargetPageFreeSpace(index,
+ GIST_DEFAULT_FILLFACTOR);
+ insertstate.r = index;
+
+ /*
+ 		 * Iterate through the levels, from the highest down.
+ */
+ for (i = gfbb->buffersOnLevelsCount - 1; i >= 0; i--)
+ {
+ bool nonEmpty = true;
+
+ /*
+ 			 * While there are non-empty node buffers on this level, iterate
+ 			 * over them and initiate emptying of the non-empty ones.
+ */
+ while (nonEmpty)
+ {
+ ListCell *p;
+
+ nonEmpty = false;
+
+ for (p = list_head(gfbb->buffersOnLevels[i]); p; p = p->next)
+ {
+ bool isRoot;
+
+ /* Get next node buffer */
+ nodeBuffer = (NodeBuffer *) p->data.ptr_value;
+ isRoot = (nodeBuffer->nodeBlocknum == GIST_ROOT_BLKNO);
+
+ /* Skip empty node buffer */
+ if (nodeBuffer->blocksCount == 0)
+ continue;
+
+ 					/* Remember that we saw a non-empty buffer. */
+ nonEmpty = true;
+
+ /* Process emptying of node buffer */
+ MemoryContextSwitchTo(gfbb->context);
+ gfbb->bufferEmptyingStack = lcons(nodeBuffer, gfbb->bufferEmptyingStack);
+ MemoryContextSwitchTo(buildstate.tmpCtx);
+ gistProcessEmptyingStack(&buildstate.giststate, &insertstate);
+
+ /*
+ 					 * The root page's node buffer is the only node buffer
+ 					 * that can be deleted from the list, so be careful and
+ 					 * restart the scan.
+ */
+ if (isRoot)
+ break;
+ }
+ }
+ }
+ MemoryContextSwitchTo(oldCtx);
+ }
+
+ /* okay, all heap tuples are indexed */
+ MemoryContextSwitchTo(oldcxt);
+ MemoryContextDelete(buildstate.tmpCtx);
+
+ freeGISTstate(&buildstate.giststate);
+
+ /*
+ * Return statistics
+ */
+ result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
+
+ result->heap_tuples = reltuples;
+ result->index_tuples = (double) buildstate.indtuples;
+
+ PG_RETURN_POINTER(result);
+ }
*** /dev/null
--- b/src/backend/access/gist/gistbuildbuffers.c
***************
*** 0 ****
--- 1,911 ----
+ /*-------------------------------------------------------------------------
+ *
+ * gistbuildbuffers.c
+ * buffers management functions for GiST buffering build algorithm.
+ *
+ *
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/gist/gistbuildbuffers.c
+ *
+ *-------------------------------------------------------------------------
+ */
+ #include "postgres.h"
+
+ #include "access/genam.h"
+ #include "access/gist_private.h"
+ #include "catalog/index.h"
+ #include "catalog/pg_collation.h"
+ #include "miscadmin.h"
+ #include "storage/bufmgr.h"
+ #include "storage/indexfsm.h"
+ #include "storage/buffile.h"
+ #include "utils/memutils.h"
+ #include "utils/rel.h"
+
+ static NodeBufferPage *gistAllocateNewPageBuffer(GISTBuildBuffers * gfbb);
+ static void gistAddLoadedBuffer(GISTBuildBuffers * gfbb, BlockNumber blocknum);
+ static void gistLoadNodeBuffer(GISTBuildBuffers * gfbb, NodeBuffer * nodeBuffer);
+ static void gistUnloadNodeBuffer(GISTBuildBuffers * gfbb, NodeBuffer * nodeBuffer);
+ static void gistPlaceItupToPage(NodeBufferPage * pageBuffer, IndexTuple item);
+ static void gistGetItupFromPage(NodeBufferPage * pageBuffer, IndexTuple *item);
+ static int gistBuffersFreeBlocksCmp(const void *a, const void *b);
+ static long gistBuffersGetFreeBlock(GISTBuildBuffers * gfbb);
+ static void gistBuffersReleaseBlock(GISTBuildBuffers * gfbb, long blocknum);
+
+ /*
+ * Initialize GiST buffering build data structure.
+ */
+ void
+ gistInitBuildBuffers(GISTBuildBuffers * gfbb, int maxLevel)
+ {
+ HASHCTL hashCtl;
+
+ /*
+ 	 * Create the temporary file and initialize the data structures for free
+ 	 * page management.
+ */
+ gfbb->pfile = BufFileCreateTemp(true);
+ gfbb->nFileBlocks = 0;
+ gfbb->nFreeBlocks = 0;
+ gfbb->blocksSorted = false;
+ gfbb->freeBlocksLen = 32;
+ gfbb->freeBlocks = (long *) palloc(gfbb->freeBlocksLen * sizeof(long));
+
+ /*
+ 	 * The current memory context will be used for all in-memory data
+ 	 * structures of buffers that must persist throughout the buffering build.
+ */
+ gfbb->context = CurrentMemoryContext;
+
+ /*
+ 	 * The nodeBuffersTab hash associates index blocks with their node
+ 	 * buffers.
+ */
+ hashCtl.keysize = sizeof(BlockNumber);
+ hashCtl.entrysize = sizeof(NodeBuffer);
+ hashCtl.hcxt = CurrentMemoryContext;
+ hashCtl.hash = tag_hash;
+ hashCtl.match = memcmp;
+ gfbb->nodeBuffersTab = hash_create("gistbuildbuffers",
+ 1024,
+ &hashCtl,
+ HASH_ELEM | HASH_CONTEXT | HASH_FUNCTION
+ | HASH_COMPARE);
+
+ /*
+ 	 * Stack of node buffers that are scheduled for emptying.
+ */
+ gfbb->bufferEmptyingStack = NIL;
+
+ gfbb->currentEmptyingBufferBlockNumber = InvalidBlockNumber;
+ gfbb->currentEmptyingBufferSplit = false;
+
+ /*
+ 	 * Per-level lists of node buffers, for the final buffer emptying
+ 	 * process. A node buffer is inserted here when it is created. The root
+ 	 * node buffer is the only buffer that can be deleted from its list,
+ 	 * because after a root split the root node appears at a higher level
+ 	 * while keeping its block number.
+ */
+ gfbb->buffersOnLevelsLen = 16;
+ gfbb->buffersOnLevels = (List **) palloc(sizeof(List *) *
+ gfbb->buffersOnLevelsLen);
+ gfbb->buffersOnLevelsCount = 0;
+
+ /*
+ 	 * Block numbers of node buffers whose last pages are currently loaded
+ 	 * into main memory.
+ */
+ gfbb->loadedBuffersLen = 32;
+ gfbb->loadedBuffers = (BlockNumber *) palloc(gfbb->loadedBuffersLen *
+ sizeof(BlockNumber));
+ gfbb->loadedBuffersCount = 0;
+
+ /*
+ 	 * Root path item of the tree. Updated on each root node split.
+ */
+ gfbb->rootitem = (GISTBufferingInsertStack *) MemoryContextAlloc(
+ gfbb->context, sizeof(GISTBufferingInsertStack));
+ gfbb->rootitem->parent = NULL;
+ gfbb->rootitem->blkno = GIST_ROOT_BLKNO;
+ gfbb->rootitem->downlinkoffnum = InvalidOffsetNumber;
+ 	gfbb->rootitem->level = maxLevel;
+ 	/* the root item is referenced by gfbb itself */
+ 	gfbb->rootitem->refCount = 1;
+ }
+
+ /*
+  * Return the NodeBuffer structure for the given block number. If the
+  * createNew flag is set and no such buffer exists, a new NodeBuffer
+  * structure is created.
+ */
+ NodeBuffer *
+ gistGetNodeBuffer(GISTBuildBuffers * gfbb, GISTSTATE *giststate,
+ BlockNumber nodeBlocknum,
+ OffsetNumber downlinkoffnum,
+ GISTBufferingInsertStack * parent, bool createNew)
+ {
+ NodeBuffer *nodeBuffer;
+ bool found;
+
+ /*
+ * Find nodeBuffer in hash table
+ */
+ nodeBuffer = (NodeBuffer *) hash_search(gfbb->nodeBuffersTab,
+ (const void *) &nodeBlocknum,
+ createNew ? HASH_ENTER : HASH_FIND,
+ &found);
+ if (!found)
+ {
+ GISTBufferingInsertStack *path;
+ int levelIndex;
+ int i;
+ MemoryContext oldcxt = MemoryContextSwitchTo(gfbb->context);
+
+ /*
+ * Node buffer wasn't found. Create new if required.
+ */
+ if (!createNew)
+ return NULL;
+
+ if (nodeBlocknum != GIST_ROOT_BLKNO)
+ {
+ /*
+ 			 * For a non-root page we have to create a new path item that
+ 			 * references the given parent.
+ */
+ path = (GISTBufferingInsertStack *) palloc(
+ sizeof(GISTBufferingInsertStack));
+ path->parent = parent;
+ path->blkno = nodeBlocknum;
+ path->downlinkoffnum = downlinkoffnum;
+ path->level = parent->level - 1;
+ path->refCount = 0;
+ parent->refCount++;
+ Assert(path->level > 0);
+ }
+ else
+ {
+ path = gfbb->rootitem;
+ }
+
+ 		/* The node buffer references its path item. */
+ path->refCount++;
+
+ /*
+ * New node buffer. Fill data structure with default values.
+ */
+ nodeBuffer->pageBuffer = NULL;
+ nodeBuffer->blocksCount = 0;
+ nodeBuffer->level = path->level;
+ nodeBuffer->path = path;
+ nodeBuffer->plannedForEmptying = false;
+
+ /*
+ 		 * Put the node buffer in the appropriate list. Calculate the list
+ 		 * index from the buffer's level.
+ */
+ levelIndex = (nodeBuffer->level - gfbb->levelStep) / gfbb->levelStep;
+
+ /*
+ 		 * Enlarge the array of buffer lists if needed.
+ */
+ while (levelIndex >= gfbb->buffersOnLevelsLen)
+ {
+ gfbb->buffersOnLevelsLen *= 2;
+ gfbb->buffersOnLevels =
+ (List **) repalloc(gfbb->buffersOnLevels,
+ gfbb->buffersOnLevelsLen *
+ sizeof(List *));
+ }
+
+ /* Initialize new buffers lists as empty. */
+ if (levelIndex >= gfbb->buffersOnLevelsCount)
+ {
+ for (i = gfbb->buffersOnLevelsCount; i <= levelIndex; i++)
+ gfbb->buffersOnLevels[i] = NIL;
+ gfbb->buffersOnLevelsCount = levelIndex + 1;
+ }
+
+ /* Add node buffer to the corresponding list */
+ gfbb->buffersOnLevels[levelIndex] = lcons(
+ nodeBuffer, gfbb->buffersOnLevels[levelIndex]);
+
+ MemoryContextSwitchTo(oldcxt);
+ }
+ else
+ {
+ if (parent != nodeBuffer->path->parent)
+ {
+ /*
+ 			 * A different parent path item was provided than the one we've
+ 			 * remembered. We trust the caller to provide a more correct
+ 			 * parent than we have; the previous one may have been outdated
+ 			 * by a page split.
+ */
+ gistDecreasePathRefcount(nodeBuffer->path->parent);
+ nodeBuffer->path->parent = parent;
+ parent->refCount++;
+ }
+ }
+
+ return nodeBuffer;
+ }
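+
+ /*
+ * Typical usage (an illustrative sketch, not part of the build code
+ * proper): look up or create the buffer for a page, then push an index
+ * tuple into it:
+ *
+ * buf = gistGetNodeBuffer(gfbb, giststate, blkno, downlinkoffnum,
+ * parent, true);
+ * gistPushItupToNodeBuffer(gfbb, buf, itup);
+ */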
+
+ /*
+ * Allocate memory for buffer page.
+ */
+ static NodeBufferPage *
+ gistAllocateNewPageBuffer(GISTBuildBuffers * gfbb)
+ {
+ NodeBufferPage *pageBuffer;
+
+ /*
+ * Allocate the page in the persistent buffering build context.
+ */
+ pageBuffer = (NodeBufferPage *) MemoryContextAlloc(gfbb->context, BLCKSZ);
+
+ /*
+ * Initially the whole data area of the page is free.
+ */
+ PAGE_FREE_SPACE(pageBuffer) = BLCKSZ - BUFFER_PAGE_DATA_OFFSET;
+ return pageBuffer;
+ }
+
+ /*
+ * Add the specified block number to the loadedBuffers array.
+ */
+ static void
+ gistAddLoadedBuffer(GISTBuildBuffers * gfbb, BlockNumber blocknum)
+ {
+ if (gfbb->loadedBuffersCount >= gfbb->loadedBuffersLen)
+ {
+ /*
+ * Array is full: double its allocated size.
+ */
+ gfbb->loadedBuffersLen *= 2;
+ gfbb->loadedBuffers = (BlockNumber *) repalloc(gfbb->loadedBuffers,
+ gfbb->loadedBuffersLen *
+ sizeof(BlockNumber));
+ }
+ /* Add the block number to the array */
+ gfbb->loadedBuffers[gfbb->loadedBuffersCount] = blocknum;
+ gfbb->loadedBuffersCount++;
+ }
+
+
+ /*
+ * Load last page of node buffer into main memory.
+ */
+ static void
+ gistLoadNodeBuffer(GISTBuildBuffers * gfbb, NodeBuffer * nodeBuffer)
+ {
+ /* Check if we really should load something */
+ if (!nodeBuffer->pageBuffer && nodeBuffer->blocksCount > 0)
+ {
+ /* Allocate memory for page */
+ nodeBuffer->pageBuffer = gistAllocateNewPageBuffer(gfbb);
+
+ /* Read block from temporary file */
+ BufFileSeekBlock(gfbb->pfile, nodeBuffer->pageBlocknum);
+ BufFileRead(gfbb->pfile, nodeBuffer->pageBuffer, BLCKSZ);
+
+ /* Mark file block as free */
+ gistBuffersReleaseBlock(gfbb, nodeBuffer->pageBlocknum);
+
+ /* Mark node buffer as loaded */
+ gistAddLoadedBuffer(gfbb, nodeBuffer->nodeBlocknum);
+ nodeBuffer->pageBlocknum = InvalidBlockNumber;
+ }
+ }
+
+ /*
+ * Write last page of node buffer to the disk.
+ */
+ static void
+ gistUnloadNodeBuffer(GISTBuildBuffers * gfbb, NodeBuffer * nodeBuffer)
+ {
+ /* Check if we have something to write */
+ if (nodeBuffer->pageBuffer)
+ {
+ BlockNumber blkno;
+
+ /* Get free file block */
+ blkno = gistBuffersGetFreeBlock(gfbb);
+
+ /* Write block to the temporary file */
+ BufFileSeekBlock(gfbb->pfile, blkno);
+ BufFileWrite(gfbb->pfile, nodeBuffer->pageBuffer, BLCKSZ);
+
+ /* Free memory of that page */
+ pfree(nodeBuffer->pageBuffer);
+ nodeBuffer->pageBuffer = NULL;
+
+ /* Save block number */
+ nodeBuffer->pageBlocknum = blkno;
+ }
+ }
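+
+ /*
+ * Load and unload form a pair (a sketch of the life cycle): unloading
+ * writes the last page to a free temp-file block and records it in
+ * pageBlocknum; loading reads that block back into pageBuffer, returns
+ * the block to the freelist, and resets pageBlocknum to
+ * InvalidBlockNumber.
+ */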
+
+ /*
+ * Write last pages of all node buffers to the disk.
+ */
+ void
+ gistUnloadNodeBuffers(GISTBuildBuffers * gfbb)
+ {
+ int i;
+
+ /* Iterate over node buffers whose last page is loaded into main memory */
+ for (i = 0; i < gfbb->loadedBuffersCount; i++)
+ {
+ NodeBuffer *nodeBuffer;
+ bool found;
+
+ /* Find node buffer by its block number */
+ nodeBuffer = hash_search(gfbb->nodeBuffersTab, &gfbb->loadedBuffers[i],
+ HASH_FIND, &found);
+
+ /*
+ * The node buffer might no longer exist; it can disappear during a
+ * page split. That's OK, just skip it.
+ */
+ if (!found)
+ continue;
+
+ /* Unload last page to the disk */
+ gistUnloadNodeBuffer(gfbb, nodeBuffer);
+ }
+ /* Now there are no node buffers with loaded last page */
+ gfbb->loadedBuffersCount = 0;
+ }
+
+ /*
+ * Add index tuple to buffer page.
+ */
+ static void
+ gistPlaceItupToPage(NodeBufferPage * pageBuffer, IndexTuple itup)
+ {
+ /*
+ * Compute the new tuple's location: tuples are packed from the end of
+ * the page downwards, so it goes just past the remaining free space.
+ */
+ char *ptr = (char *) pageBuffer + BUFFER_PAGE_DATA_OFFSET
+ + PAGE_FREE_SPACE(pageBuffer) - MAXALIGN(IndexTupleSize(itup));
+
+ /*
+ * There should be enough space
+ */
+ Assert(PAGE_FREE_SPACE(pageBuffer) >= MAXALIGN(IndexTupleSize(itup)));
+
+ /*
+ * Reduce free space value of page
+ */
+ PAGE_FREE_SPACE(pageBuffer) -= MAXALIGN(IndexTupleSize(itup));
+
+ /*
+ * Copy index tuple to free space
+ */
+ memcpy(ptr, itup, IndexTupleSize(itup));
+ }
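+
+ /*
+ * Worked example (a sketch, assuming BLCKSZ = 8192 and MAXALIGN = 8):
+ * a fresh page has freespace = 8192 - 8 = 8184. Placing a 40-byte
+ * (already MAXALIGNed) tuple writes it at offset 8 + 8184 - 40 = 8152,
+ * i.e. flush against the end of the page, and leaves freespace = 8144.
+ * Tuples therefore grow from the end of the page towards its header.
+ */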
+
+ /*
+ * Get last item from buffer page and remove it from page.
+ */
+ static void
+ gistGetItupFromPage(NodeBufferPage * pageBuffer, IndexTuple *itup)
+ {
+ /*
+ * Get pointer to last index tuple
+ */
+ IndexTuple ptr = (IndexTuple) ((char *) pageBuffer
+ + BUFFER_PAGE_DATA_OFFSET
+ + PAGE_FREE_SPACE(pageBuffer));
+
+ /*
+ * Page shouldn't be empty
+ */
+ Assert(!PAGE_IS_EMPTY(pageBuffer));
+
+ /*
+ * Allocate memory for returned index tuple copy
+ */
+ *itup = (IndexTuple) palloc(IndexTupleSize(ptr));
+
+ /*
+ * Copy data
+ */
+ memcpy(*itup, ptr, IndexTupleSize(ptr));
+
+ /*
+ * Increase free space value of page
+ */
+ PAGE_FREE_SPACE(pageBuffer) += MAXALIGN(IndexTupleSize(*itup));
+ }
+
+ /*
+ * Push new index tuple to node buffer.
+ */
+ void
+ gistPushItupToNodeBuffer(GISTBuildBuffers * gfbb, NodeBuffer * nodeBuffer,
+ IndexTuple itup)
+ {
+ /*
+ * Most of the memory allocated here must live in the persistent
+ * buffering build context, so switch to it.
+ */
+ MemoryContext oldcxt = MemoryContextSwitchTo(gfbb->context);
+
+ /* Is the buffer currently empty? */
+ if (nodeBuffer->blocksCount == 0)
+ {
+ /* It's empty, let's create the first page */
+ nodeBuffer->pageBuffer = gistAllocateNewPageBuffer(gfbb);
+ nodeBuffer->pageBuffer->prev = InvalidBlockNumber;
+ nodeBuffer->blocksCount = 1;
+ gistAddLoadedBuffer(gfbb, nodeBuffer->nodeBlocknum);
+ }
+
+ /* Load last page of node buffer if it wasn't already */
+ if (!nodeBuffer->pageBuffer)
+ {
+ gistLoadNodeBuffer(gfbb, nodeBuffer);
+ }
+
+ /*
+ * Check if there is enough space on the last page for the tuple
+ */
+ if (PAGE_NO_SPACE(nodeBuffer->pageBuffer, itup))
+ {
+ /*
+ * The page is full. Write it out to the temporary file and reuse
+ * the in-memory page as a fresh one, chained to the page just written.
+ */
+ BlockNumber blkno;
+
+ /* Write filled page to the disk */
+ blkno = gistBuffersGetFreeBlock(gfbb);
+ BufFileSeekBlock(gfbb->pfile, blkno);
+ BufFileWrite(gfbb->pfile, nodeBuffer->pageBuffer, BLCKSZ);
+
+ /* Mark the in-memory page as empty again */
+ PAGE_FREE_SPACE(nodeBuffer->pageBuffer) =
+ BLCKSZ - BUFFER_PAGE_DATA_OFFSET;
+
+ /* Save block number of the previous page */
+ nodeBuffer->pageBuffer->prev = blkno;
+
+ /* We've just added one more page */
+ nodeBuffer->blocksCount++;
+ }
+
+ gistPlaceItupToPage(nodeBuffer->pageBuffer, itup);
+
+ /*
+ * Restore memory context
+ */
+ MemoryContextSwitchTo(oldcxt);
+ }
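+
+ /*
+ * The resulting on-disk layout is a backward-chained list of pages,
+ * e.g. (a sketch; actual block numbers come from the freelist):
+ *
+ * in-memory page --prev--> blk 2 --prev--> blk 1 --prev--> InvalidBlockNumber
+ *
+ * so popping tuples walks the chain from the newest page to the oldest.
+ */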
+
+ /*
+ * Remove one index tuple from the node buffer. Returns true on success,
+ * false if the node buffer is empty.
+ */
+ bool
+ gistPopItupFromNodeBuffer(GISTBuildBuffers * gfbb, NodeBuffer * nodeBuffer,
+ IndexTuple *itup)
+ {
+ /*
+ * If node buffer is empty then return false.
+ */
+ if (nodeBuffer->blocksCount <= 0)
+ return false;
+
+ /* Load last page of node buffer if needed */
+ if (!nodeBuffer->pageBuffer)
+ gistLoadNodeBuffer(gfbb, nodeBuffer);
+
+ /*
+ * Get index tuple from last non-empty page.
+ */
+ gistGetItupFromPage(nodeBuffer->pageBuffer, itup);
+
+ /*
+ * Check if the page the index tuple was taken from is now empty
+ */
+ if (PAGE_IS_EMPTY(nodeBuffer->pageBuffer))
+ {
+ BlockNumber prevblkno;
+
+ /*
+ * The page is now empty: release its buffer file block and free the
+ * in-memory page buffer.
+ */
+ nodeBuffer->blocksCount--;
+
+ /*
+ * If there are more pages, fetch the previous one.
+ */
+ prevblkno = nodeBuffer->pageBuffer->prev;
+ if (prevblkno != InvalidBlockNumber)
+ {
+ /* There actually is previous page, so read it. */
+ Assert(nodeBuffer->blocksCount > 0);
+ BufFileSeekBlock(gfbb->pfile, prevblkno);
+ BufFileRead(gfbb->pfile, nodeBuffer->pageBuffer, BLCKSZ);
+
+ /* Mark block as free */
+ gistBuffersReleaseBlock(gfbb, prevblkno);
+ }
+ else
+ {
+ /* Actually there are no more pages. Free memory. */
+ Assert(nodeBuffer->blocksCount == 0);
+ pfree(nodeBuffer->pageBuffer);
+ nodeBuffer->pageBuffer = NULL;
+ }
+ }
+ return true;
+ }
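+
+ /*
+ * A node buffer thus behaves as a LIFO stack of index tuples; draining
+ * one is simply (an illustrative sketch):
+ *
+ * IndexTuple itup;
+ *
+ * while (gistPopItupFromNodeBuffer(gfbb, nodeBuffer, &itup))
+ * {
+ * ... process itup ...
+ * pfree(itup);
+ * }
+ */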
+
+ /*
+ * qsort comparator for sorting freeBlocks[] into decreasing order.
+ */
+ static int
+ gistBuffersFreeBlocksCmp(const void *a, const void *b)
+ {
+ long ablk = *((const long *) a);
+ long bblk = *((const long *) b);
+
+ /*
+ * can't just subtract because long might be wider than int
+ */
+ if (ablk < bblk)
+ return 1;
+ if (ablk > bblk)
+ return -1;
+ return 0;
+ }
+
+ /*
+ * Select a currently unused block for writing to.
+ *
+ * NB: should only be called when writer is ready to write immediately,
+ * to ensure that first write pass is sequential.
+ */
+ static long
+ gistBuffersGetFreeBlock(GISTBuildBuffers * gfbb)
+ {
+ /*
+ * If there are multiple free blocks, we select the one appearing last in
+ * freeBlocks[] (after sorting the array if needed). If there are none,
+ * assign the next block at the end of the file.
+ */
+ if (gfbb->nFreeBlocks > 0)
+ {
+ if (!gfbb->blocksSorted)
+ {
+ qsort((void *) gfbb->freeBlocks, gfbb->nFreeBlocks,
+ sizeof(long), gistBuffersFreeBlocksCmp);
+ gfbb->blocksSorted = true;
+ }
+ return gfbb->freeBlocks[--gfbb->nFreeBlocks];
+ }
+ else
+ return gfbb->nFileBlocks++;
+ }
+
+ /*
+ * Return a block# to the freelist.
+ */
+ static void
+ gistBuffersReleaseBlock(GISTBuildBuffers * gfbb, long blocknum)
+ {
+ int ndx;
+
+ /*
+ * Enlarge freeBlocks array if full.
+ */
+ if (gfbb->nFreeBlocks >= gfbb->freeBlocksLen)
+ {
+ gfbb->freeBlocksLen *= 2;
+ gfbb->freeBlocks = (long *) repalloc(gfbb->freeBlocks,
+ gfbb->freeBlocksLen *
+ sizeof(long));
+ }
+
+ /*
+ * Add blocknum to array, and mark the array unsorted if it's no longer in
+ * decreasing order.
+ */
+ ndx = gfbb->nFreeBlocks++;
+ gfbb->freeBlocks[ndx] = blocknum;
+ if (ndx > 0 && gfbb->freeBlocks[ndx - 1] < blocknum)
+ gfbb->blocksSorted = false;
+ }
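+
+ /*
+ * Example of the freelist discipline (a sketch): after releasing blocks
+ * 7, 3 and 9, the next gistBuffersGetFreeBlock() call sorts freeBlocks[]
+ * into {9, 7, 3} and returns 3 first, so the lowest block numbers are
+ * reused first and the temporary file stays compact.
+ */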
+
+ /*
+ * Free buffering build data structure.
+ */
+ void
+ gistFreeBuildBuffers(GISTBuildBuffers * gfbb)
+ {
+ /* Close buffers file. */
+ BufFileClose(gfbb->pfile);
+
+ /* Everything else is freed when the memory context is released */
+ }
+
+ /*
+ * Data structure describing a target node buffer for relocating index
+ * tuples from the buffer of a split node.
+ */
+ typedef struct
+ {
+ GISTENTRY entry[INDEX_MAX_KEYS];
+ bool isnull[INDEX_MAX_KEYS];
+ SplitedPageLayout *dist;
+ NodeBuffer *nodeBuffer;
+ } RelocationBufferInfo;
+
+ /*
+ * Maintain the build-buffer data structures on a page split: relocate the
+ * split node's buffered index tuples to the buffers of the resulting pages.
+ */
+ void
+ gistRelocateBuildBuffersOnSplit(GISTBuildBuffers * gfbb, GISTSTATE *giststate,
+ Relation r, GISTBufferingInsertStack * path,
+ Buffer buffer, SplitedPageLayout *dist)
+ {
+ RelocationBufferInfo *relocationBuffersInfos;
+ bool found;
+ NodeBuffer *nodeBuffer;
+ BlockNumber blocknum;
+ IndexTuple itup;
+ int splitPagesCount = 0,
+ i;
+ GISTENTRY entry[INDEX_MAX_KEYS];
+ bool isnull[INDEX_MAX_KEYS];
+ SplitedPageLayout *ptr;
+ int level = path->level;
+ NodeBuffer nodebuf;
+
+ blocknum = BufferGetBlockNumber(buffer);
+
+ /*
+ * If the level of the split page has no buffers, there is nothing
+ * to do.
+ */
+ if (!LEVEL_HAS_BUFFERS(level, gfbb))
+ return;
+
+ /*
+ * Find the node buffer of the split page.
+ */
+ nodeBuffer = hash_search(gfbb->nodeBuffersTab, &blocknum,
+ HASH_FIND, &found);
+ if (!found)
+ {
+ /*
+ * The node buffer should have been created by now, either by index
+ * tuple insertion or by an earlier page split.
+ */
+ elog(ERROR,
+ "node buffer of page being split (%u) does not exist",
+ blocknum);
+ }
+
+ /*
+ * We are going to modify the node buffers hash table, so it is unsafe
+ * to keep using a hash entry that may be removed or reused underneath
+ * us. Save a local copy of the entry.
+ */
+ memcpy(&nodebuf, nodeBuffer, sizeof(NodeBuffer));
+
+ if (blocknum == GIST_ROOT_BLKNO)
+ {
+ MemoryContext oldcxt = MemoryContextSwitchTo(gfbb->context);
+ int levelIndex;
+
+ /*
+ * On a root split we don't reuse the NodeBuffer data structure, so
+ * remove it from the hash table and from the per-level buffer list.
+ */
+ levelIndex = (nodeBuffer->level - gfbb->levelStep) / gfbb->levelStep;
+ gfbb->buffersOnLevels[levelIndex] = list_delete(
+ gfbb->buffersOnLevels[levelIndex], nodeBuffer);
+ MemoryContextSwitchTo(oldcxt);
+
+ hash_search(gfbb->nodeBuffersTab, &blocknum, HASH_REMOVE, &found);
+ Assert(found);
+ }
+
+ /* Reassign pointer to the saved copy. */
+ nodeBuffer = &nodebuf;
+
+ /*
+ * Count the pages produced by the split.
+ */
+ for (ptr = dist; ptr; ptr = ptr->next)
+ {
+ splitPagesCount++;
+ }
+
+ /*
+ * Allocate memory for information about relocation buffers.
+ */
+ relocationBuffersInfos =
+ (RelocationBufferInfo *) palloc(sizeof(RelocationBufferInfo) *
+ splitPagesCount);
+
+ /*
+ * Fill in relocation information for the node buffers of the pages
+ * produced by the split.
+ */
+ i = 0;
+ for (ptr = dist; ptr; ptr = ptr->next)
+ {
+ NodeBuffer *newNodeBuffer;
+
+ /*
+ * Decompress the downlink index tuple of this page.
+ */
+ gistDeCompressAtt(giststate, r,
+ ptr->itup, NULL, (OffsetNumber) 0,
+ relocationBuffersInfos[i].entry,
+ relocationBuffersInfos[i].isnull);
+
+ newNodeBuffer = gistGetNodeBuffer(gfbb, giststate, ptr->block.blkno,
+ path->downlinkoffnum, path->parent, true);
+
+ /*
+ * Fill relocation information
+ */
+ relocationBuffersInfos[i].nodeBuffer = newNodeBuffer;
+ if (newNodeBuffer->nodeBlocknum == blocknum)
+ {
+ /*
+ * This is the reused NodeBuffer of the split node (its old
+ * contents were saved above), so reset it to an empty state.
+ */
+ newNodeBuffer->blocksCount = 0;
+ newNodeBuffer->pageBuffer = NULL;
+ newNodeBuffer->pageBlocknum = InvalidBlockNumber;
+ }
+
+ /*
+ * Remember which split page this relocation target corresponds to.
+ */
+ relocationBuffersInfos[i].dist = ptr;
+
+ i++;
+ }
+
+ /*
+ * Relocate the index tuples one by one.
+ */
+ while (gistPopItupFromNodeBuffer(gfbb, nodeBuffer, &itup))
+ {
+ float sum_grow,
+ which_grow[INDEX_MAX_KEYS];
+ int i,
+ which;
+ IndexTuple newtup;
+
+ /*
+ * Choose the node buffer to insert this index tuple into, applying
+ * the penalty function the same way gistchoose() does.
+ */
+ gistDeCompressAtt(giststate, r,
+ itup, NULL, (OffsetNumber) 0, entry, isnull);
+
+ which = -1;
+ *which_grow = -1.0f;
+ sum_grow = 1.0f;
+
+ for (i = 0; i < splitPagesCount && sum_grow; i++)
+ {
+ int j;
+ RelocationBufferInfo *splitPageInfo = &relocationBuffersInfos[i];
+
+ sum_grow = 0.0f;
+ for (j = 0; j < r->rd_att->natts; j++)
+ {
+ float usize;
+
+ usize = gistpenalty(giststate, j,
+ &splitPageInfo->entry[j],
+ splitPageInfo->isnull[j],
+ &entry[j], isnull[j]);
+
+ if (which_grow[j] < 0 || usize < which_grow[j])
+ {
+ which = i;
+ which_grow[j] = usize;
+ if (j < r->rd_att->natts - 1 && i == 0)
+ which_grow[j + 1] = -1;
+ sum_grow += which_grow[j];
+ }
+ else if (which_grow[j] == usize)
+ sum_grow += usize;
+ else
+ {
+ sum_grow = 1;
+ break;
+ }
+ }
+ }
+
+ /*
+ * push item to selected node buffer
+ */
+ gistPushItupToNodeBuffer(gfbb, relocationBuffersInfos[which].nodeBuffer,
+ itup);
+
+ /*
+ * If the node buffer is now at least half-filled, schedule it for
+ * emptying (unless it is already scheduled).
+ */
+ if (BUFFER_HALF_FILLED(relocationBuffersInfos[which].nodeBuffer, gfbb)
+ && !relocationBuffersInfos[which].nodeBuffer->plannedForEmptying)
+ {
+ MemoryContext oldcxt = MemoryContextSwitchTo(gfbb->context);
+
+ relocationBuffersInfos[which].nodeBuffer->plannedForEmptying = true;
+ gfbb->bufferEmptyingStack =
+ lcons(relocationBuffersInfos[which].nodeBuffer,
+ gfbb->bufferEmptyingStack);
+ MemoryContextSwitchTo(oldcxt);
+ }
+
+ /*
+ * adjust tuple of parent page
+ */
+ newtup = gistgetadjusted(r, relocationBuffersInfos[which].dist->itup,
+ itup, giststate);
+ if (newtup)
+ {
+ /*
+ * The downlink index tuple needs to be expanded. Replace the old
+ * tuple with the new one and re-decompress it.
+ */
+ gistDeCompressAtt(giststate, r,
+ newtup, NULL, (OffsetNumber) 0,
+ relocationBuffersInfos[which].entry,
+ relocationBuffersInfos[which].isnull);
+
+ relocationBuffersInfos[which].dist->itup = newtup;
+ }
+ }
+
+ /* If the split page was the buffer being emptied, signal the caller to stop */
+ if (blocknum == gfbb->currentEmptyingBufferBlockNumber)
+ gfbb->currentEmptyingBufferSplit = true;
+
+ /*
+ * On a root split the NodeBuffer data structure is not reused, so one
+ * reference to its path item goes away.
+ */
+ if (blocknum == GIST_ROOT_BLKNO)
+ gistDecreasePathRefcount(nodeBuffer->path);
+
+ pfree(relocationBuffersInfos);
+ }
+
+ /*
+ * Return the amount of space in the node buffer, in bytes, that is
+ * occupied by stored index tuples.
+ */
+ int
+ gistGetNodeBufferBusySize(GISTBuildBuffers * gfbb, NodeBuffer * nodeBuffer)
+ {
+ int size;
+
+ /*
+ * No occupied buffer file blocks means that node buffer is empty.
+ */
+ if (nodeBuffer->blocksCount == 0)
+ return 0;
+ if (!nodeBuffer->pageBuffer)
+ gistLoadNodeBuffer(gfbb, nodeBuffer);
+
+ /*
+ * Only the last page can be partially filled. Each page's usable area
+ * excludes the page header, hence BUFFER_PAGE_DATA_OFFSET.
+ */
+ size = (BLCKSZ - BUFFER_PAGE_DATA_OFFSET) * nodeBuffer->blocksCount;
+ size -= PAGE_FREE_SPACE(nodeBuffer->pageBuffer);
+ return size;
+ }
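+
+ /*
+ * For example (a sketch, assuming BLCKSZ = 8192 and MAXALIGN = 8, so each
+ * page holds up to 8184 bytes of tuple data): a buffer with blocksCount = 3
+ * whose last page has 4000 bytes free occupies 3 * 8184 - 4000 = 20552
+ * bytes.
+ */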
*** a/src/backend/access/gist/gistutil.c
--- b/src/backend/access/gist/gistutil.c
***************
*** 670,682 **** gistoptions(PG_FUNCTION_ARGS)
{
Datum reloptions = PG_GETARG_DATUM(0);
bool validate = PG_GETARG_BOOL(1);
! bytea *result;
! result = default_reloptions(reloptions, validate, RELOPT_KIND_GIST);
- if (result)
- PG_RETURN_BYTEA_P(result);
- PG_RETURN_NULL();
}
/*
--- 670,699 ----
{
Datum reloptions = PG_GETARG_DATUM(0);
bool validate = PG_GETARG_BOOL(1);
! relopt_value *options;
! GiSTOptions *rdopts;
! int numoptions;
! static const relopt_parse_elt tab[] = {
! {"fillfactor", RELOPT_TYPE_INT, offsetof(GiSTOptions, fillfactor)},
! {"buffering", RELOPT_TYPE_STRING, offsetof(GiSTOptions, bufferingModeOffset)}
! };
! options = parseRelOptions(reloptions, validate, RELOPT_KIND_GIST,
! &numoptions);
!
! /* if none set, we're done */
! if (numoptions == 0)
! PG_RETURN_NULL();
!
! rdopts = allocateReloptStruct(sizeof(GiSTOptions), options, numoptions);
!
! fillRelOptions((void *) rdopts, sizeof(GiSTOptions), options, numoptions,
! validate, tab, lengthof(tab));
!
! pfree(options);
!
! PG_RETURN_BYTEA_P(rdopts);
}
/*
*** a/src/backend/access/gist/gistxlog.c
--- b/src/backend/access/gist/gistxlog.c
***************
*** 266,272 **** gistRedoPageSplitRecord(XLogRecPtr lsn, XLogRecord *record)
else
GistPageGetOpaque(page)->rightlink = xldata->origrlink;
GistPageGetOpaque(page)->nsn = xldata->orignsn;
! if (i < xlrec.data->npage - 1 && !isrootsplit)
GistMarkFollowRight(page);
else
GistClearFollowRight(page);
--- 266,273 ----
else
GistPageGetOpaque(page)->rightlink = xldata->origrlink;
GistPageGetOpaque(page)->nsn = xldata->orignsn;
! if (i < xlrec.data->npage - 1 && !isrootsplit &&
! !xldata->noFollowRight)
GistMarkFollowRight(page);
else
GistClearFollowRight(page);
***************
*** 414,420 **** XLogRecPtr
gistXLogSplit(RelFileNode node, BlockNumber blkno, bool page_is_leaf,
SplitedPageLayout *dist,
BlockNumber origrlink, GistNSN orignsn,
! Buffer leftchildbuf)
{
XLogRecData *rdata;
gistxlogPageSplit xlrec;
--- 415,421 ----
gistXLogSplit(RelFileNode node, BlockNumber blkno, bool page_is_leaf,
SplitedPageLayout *dist,
BlockNumber origrlink, GistNSN orignsn,
! Buffer leftchildbuf, bool noFollowRight)
{
XLogRecData *rdata;
gistxlogPageSplit xlrec;
***************
*** 436,441 **** gistXLogSplit(RelFileNode node, BlockNumber blkno, bool page_is_leaf,
--- 437,443 ----
xlrec.npage = (uint16) npage;
xlrec.leftchild =
BufferIsValid(leftchildbuf) ? BufferGetBlockNumber(leftchildbuf) : InvalidBlockNumber;
+ xlrec.noFollowRight = noFollowRight;
rdata[0].data = (char *) &xlrec;
rdata[0].len = sizeof(gistxlogPageSplit);
*** a/src/include/access/gist_private.h
--- b/src/include/access/gist_private.h
***************
*** 17,29 ****
--- 17,78 ----
#include "access/gist.h"
#include "access/itup.h"
#include "storage/bufmgr.h"
+ #include "storage/buffile.h"
#include "utils/rbtree.h"
+ #include "utils/hsearch.h"
+
+ /* Does the specified level have buffers? */
+ #define LEVEL_HAS_BUFFERS(level,gfbb) ((level) != 0 && (level) % (gfbb)->levelStep == 0)
+ /* Is the specified buffer at least half-filled (i.e., should be scheduled for emptying)? */
+ #define BUFFER_HALF_FILLED(nodeBuffer,gfbb) ((nodeBuffer)->blocksCount > (gfbb)->pagesPerBuffer / 2)
+ /* Is the specified buffer overflowed (can't accept more index tuples)? */
+ #define BUFFER_OVERFLOWED(nodeBuffer,gfbb) ((nodeBuffer)->blocksCount > (gfbb)->pagesPerBuffer)
/* Buffer lock modes */
#define GIST_SHARE BUFFER_LOCK_SHARE
#define GIST_EXCLUSIVE BUFFER_LOCK_EXCLUSIVE
#define GIST_UNLOCK BUFFER_LOCK_UNLOCK
+ /* Single page of a node buffer as stored in the temporary file */
+ typedef struct
+ {
+ BlockNumber prev; /* previous page of this buffer, if any */
+ uint32 freespace; /* bytes still free on this page */
+ char tupledata[1]; /* index tuples, packed from the page end */
+ } NodeBufferPage;
+
+ #define BUFFER_PAGE_DATA_OFFSET MAXALIGN(offsetof(NodeBufferPage, tupledata))
+ /* Returns free space in node buffer page */
+ #define PAGE_FREE_SPACE(nbp) (nbp->freespace)
+ /* Checks if node buffer page is empty */
+ #define PAGE_IS_EMPTY(nbp) (nbp->freespace == BLCKSZ - BUFFER_PAGE_DATA_OFFSET)
+ /* Checks whether a node buffer page lacks space for the given index tuple */
+ #define PAGE_NO_SPACE(nbp, itup) (PAGE_FREE_SPACE(nbp) < \
+ MAXALIGN(IndexTupleSize(itup)))
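+
+ /*
+ * For illustration (a sketch, with BLCKSZ = 8192 and a data offset of 8):
+ * a freshly allocated page has freespace = 8184, so PAGE_IS_EMPTY() holds;
+ * once PAGE_NO_SPACE(nbp, itup) becomes true for an incoming tuple, the
+ * page is written out to the temporary file and a new in-memory page is
+ * started.
+ */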
+
+ /* Data structure describing the buffer attached to an internal tree node */
+ typedef struct NodeBuffer
+ {
+ /* block number of the page this buffer is attached to */
+ BlockNumber nodeBlocknum;
+
+ /* count of blocks occupied by buffer */
+ int blocksCount;
+
+ /* temp file block of the last page, when it is unloaded to disk */
+ BlockNumber pageBlocknum;
+ /* last page of the buffer, when it is loaded in memory */
+ NodeBufferPage *pageBuffer;
+
+ /* corresponding downlink number in parent page */
+ OffsetNumber downlinkoffnum;
+
+ /* is this buffer planned for emptying? */
+ bool plannedForEmptying;
+
+ /* level of corresponding node */
+ int level;
+
+ struct GISTBufferingInsertStack *path;
+ } NodeBuffer;
+
/*
* GISTSTATE: information needed for any GiST index operation
*
***************
*** 44,49 **** typedef struct GISTSTATE
--- 93,100 ----
/* Collations to pass to the support functions */
Oid supportCollation[INDEX_MAX_KEYS];
+ struct GISTBuildBuffers *gfbb; /* set only during buffering index build */
+
TupleDesc tupdesc;
} GISTSTATE;
***************
*** 170,175 **** typedef struct gistxlogPageSplit
--- 221,227 ----
BlockNumber leftchild; /* like in gistxlogPageUpdate */
uint16 npage; /* # of pages in the split */
+ bool noFollowRight; /* skip followRight flag setting */
/*
* follow: 1. gistxlogPage and array of IndexTupleData per page
***************
*** 225,230 **** typedef struct GISTInsertStack
--- 277,352 ----
struct GISTInsertStack *parent;
} GISTInsertStack;
+ /*
+ * Extended GISTInsertStack for the buffering GiST index build. It
+ * additionally holds the level number of the page and a reference count.
+ */
+ typedef struct GISTBufferingInsertStack
+ {
+ /* current page */
+ BlockNumber blkno;
+
+ /* offset of the downlink in the parent page, that points to this page */
+ OffsetNumber downlinkoffnum;
+
+ /* pointer to parent */
+ struct GISTBufferingInsertStack *parent;
+
+ int refCount; /* references from node buffers and child items */
+
+ /* level number */
+ int level;
+ } GISTBufferingInsertStack;
+
+ /*
+ * Data structure with general information about build buffers.
+ */
+ typedef struct GISTBuildBuffers
+ {
+ /* memory context which is persistent during buffering build */
+ MemoryContext context;
+ /* underlying files */
+ BufFile *pfile;
+ /* # of blocks used in underlying files */
+ long nFileBlocks;
+ /* is freeBlocks[] currently in order? */
+ bool blocksSorted;
+ /* resizable array of free blocks */
+ long *freeBlocks;
+ /* # of currently free blocks */
+ int nFreeBlocks;
+ /* current allocated length of freeBlocks[] */
+ int freeBlocksLen;
+
+ /* hash for buffers by block number */
+ HTAB *nodeBuffersTab;
+
+ /* stack of buffers for emptying */
+ List *bufferEmptyingStack;
+ /* block number of the buffer currently being emptied */
+ BlockNumber currentEmptyingBufferBlockNumber;
+ /* whether the buffer being emptied was split (a signal to stop emptying) */
+ bool currentEmptyingBufferSplit;
+
+ /* interval, in levels, between levels that have buffers */
+ int levelStep;
+ /* maximum number of pages a buffer may occupy */
+ int pagesPerBuffer;
+
+ /* array of lists of non-empty buffers on levels for final emptying */
+ List **buffersOnLevels;
+ int buffersOnLevelsLen;
+ int buffersOnLevelsCount;
+
+ /* dynamic array of block numbers of buffers whose last page is loaded in memory */
+ BlockNumber *loadedBuffers;
+ /* number of block numbers */
+ int loadedBuffersCount;
+ /* length of array */
+ int loadedBuffersLen;
+ GISTBufferingInsertStack *rootitem; /* path item of the tree root, replaced on root split */
+ } GISTBuildBuffers;
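+
+ /*
+ * Illustration (a sketch): with levelStep = 2, buffers exist on levels
+ * 2, 4, 6, ... (see LEVEL_HAS_BUFFERS), and buffersOnLevels[0] holds the
+ * buffers of level 2, buffersOnLevels[1] those of level 4, and so on
+ * (levelIndex = level / levelStep - 1).
+ */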
+
typedef struct GistSplitVector
{
GIST_SPLITVEC splitVector; /* to/from PickSplit method */
***************
*** 286,291 **** extern Datum gistinsert(PG_FUNCTION_ARGS);
--- 408,424 ----
extern MemoryContext createTempGistContext(void);
extern void initGISTstate(GISTSTATE *giststate, Relation index);
extern void freeGISTstate(GISTSTATE *giststate);
+ extern void gistdoinsert(Relation r,
+ IndexTuple itup,
+ Size freespace,
+ GISTSTATE *GISTstate);
+ extern bool gistplacetopage(GISTInsertState *state, GISTSTATE *giststate,
+ Buffer buffer,
+ IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
+ Buffer leftchildbuf,
+ List **splitinfo,
+ GISTBufferingInsertStack * path);
+ extern void gistBufferingFindCorrectParent(GISTSTATE *giststate, Relation r, GISTBufferingInsertStack * child);
extern SplitedPageLayout *gistSplit(Relation r, Page page, IndexTuple *itup,
int len, GISTSTATE *giststate);
***************
*** 305,311 **** extern XLogRecPtr gistXLogSplit(RelFileNode node,
BlockNumber blkno, bool page_is_leaf,
SplitedPageLayout *dist,
BlockNumber origrlink, GistNSN oldnsn,
! Buffer leftchild);
/* gistget.c */
extern Datum gistgettuple(PG_FUNCTION_ARGS);
--- 438,444 ----
BlockNumber blkno, bool page_is_leaf,
SplitedPageLayout *dist,
BlockNumber origrlink, GistNSN oldnsn,
! Buffer leftchild, bool noFollowRight);
/* gistget.c */
extern Datum gistgettuple(PG_FUNCTION_ARGS);
***************
*** 313,318 **** extern Datum gistgetbitmap(PG_FUNCTION_ARGS);
--- 446,461 ----
/* gistutil.c */
+ /*
+ * Storage type for GiST's reloptions
+ */
+ typedef struct GiSTOptions
+ {
+ int32 vl_len_; /* varlena header (do not touch directly!) */
+ int fillfactor; /* page fill factor in percent (0..100) */
+ int bufferingModeOffset; /* offset of "buffering" option string value */
+ } GiSTOptions;
+
#define GiSTPageSize \
( BLCKSZ - SizeOfPageHeaderData - MAXALIGN(sizeof(GISTPageOpaqueData)) )
***************
*** 380,383 **** extern void gistSplitByKey(Relation r, Page page, IndexTuple *itup,
--- 523,547 ----
GistSplitVector *v, GistEntryVector *entryvec,
int attno);
+ /* gistbuild.c */
+ extern void gistDecreasePathRefcount(GISTBufferingInsertStack * path);
+
+ /* gistbuildbuffers.c */
+ extern void gistInitBuildBuffers(GISTBuildBuffers * gfbb, int maxLevel);
+ extern NodeBuffer *gistGetNodeBuffer(GISTBuildBuffers * gfbb, GISTSTATE *giststate,
+ BlockNumber blkno, OffsetNumber downlinkoffnum,
+ GISTBufferingInsertStack * parent, bool createNew);
+ extern void gistPushItupToNodeBuffer(GISTBuildBuffers * gfbb,
+ NodeBuffer * nodeBuffer, IndexTuple item);
+ extern bool gistPopItupFromNodeBuffer(GISTBuildBuffers * gfbb,
+ NodeBuffer * nodeBuffer, IndexTuple *item);
+ extern void gistFreeBuildBuffers(GISTBuildBuffers * gfbb);
+ extern void gistRelocateBuildBuffersOnSplit(GISTBuildBuffers * gfbb,
+ GISTSTATE *giststate, Relation r,
+ GISTBufferingInsertStack * path, Buffer buffer,
+ SplitedPageLayout *dist);
+ extern int gistGetNodeBufferBusySize(GISTBuildBuffers * gfbb,
+ NodeBuffer * nodeBuffer);
+ extern void gistUnloadNodeBuffers(GISTBuildBuffers * gfbb);
+
#endif /* GIST_PRIVATE_H */