diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c
index fb472b38f1..34c44e4d27 100644
--- a/contrib/amcheck/verify_nbtree.c
+++ b/contrib/amcheck/verify_nbtree.c
@@ -113,6 +113,7 @@ static inline bool invariant_leq_nontarget_offset(BtreeCheckState *state,
 							   Page other,
 							   ScanKey key,
 							   OffsetNumber upperbound);
+static inline bool bt_natts_check(BtreeCheckState *state, OffsetNumber offnum);
 static Page palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum);
 
 /*
@@ -560,6 +561,38 @@ bt_target_page_check(BtreeCheckState *state)
 	elog(DEBUG2, "verifying %u items on %s block %u", max,
 		 P_ISLEAF(topaque) ? "leaf" : "internal", state->targetblock);
 
+
+	/* Check the number of attributes in the high key, if any */
+	if (!P_RIGHTMOST(topaque))
+	{
+		if (!bt_natts_check(state, P_HIKEY))
+		{
+			ItemId		itemid;
+			IndexTuple	itup;
+			char	   *itid,
+					   *htid;
+
+			itemid = PageGetItemId(state->target, P_HIKEY);
+			itup = (IndexTuple) PageGetItem(state->target, itemid);
+			itid = psprintf("(%u,%u)", state->targetblock, P_HIKEY);
+			htid = psprintf("(%u,%u)",
+							ItemPointerGetBlockNumber(&(itup->t_tid)),
+							ItemPointerGetOffsetNumber(&(itup->t_tid)));
+
+			ereport(ERROR,
+					(errcode(ERRCODE_INDEX_CORRUPTED),
+					 errmsg("wrong number of index tuple attributes for index \"%s\"",
+							RelationGetRelationName(state->rel)),
+					 errdetail_internal("Index tid=%s points to %s tid=%s page lsn=%X/%X.",
+										itid,
+										P_ISLEAF(topaque) ? "heap" : "index",
+										htid,
+										(uint32) (state->targetlsn >> 32),
+										(uint32) state->targetlsn)));
+		}
+	}
+
+
 	/*
 	 * Loop over page items, starting from first non-highkey item, not high
 	 * key (if any).  Also, immediately skip "negative infinity" real item (if
@@ -587,6 +620,29 @@ bt_target_page_check(BtreeCheckState *state)
 		itup = (IndexTuple) PageGetItem(state->target, itemid);
 		skey = _bt_mkscankey(state->rel, itup);
 
+		/* Check the number of index tuple attributes */
+		if (!bt_natts_check(state, offset))
+		{
+			char	   *itid,
+					   *htid;
+
+			itid = psprintf("(%u,%u)", state->targetblock, offset);
+			htid = psprintf("(%u,%u)",
+							ItemPointerGetBlockNumber(&(itup->t_tid)),
+							ItemPointerGetOffsetNumber(&(itup->t_tid)));
+
+			ereport(ERROR,
+					(errcode(ERRCODE_INDEX_CORRUPTED),
+					 errmsg("wrong number of index tuple attributes for index \"%s\"",
+							RelationGetRelationName(state->rel)),
+					 errdetail_internal("Index tid=%s points to %s tid=%s page lsn=%X/%X.",
+										itid,
+										P_ISLEAF(topaque) ? "heap" : "index",
+										htid,
+										(uint32) (state->targetlsn >> 32),
+										(uint32) state->targetlsn)));
+		}
+
 		/*
 		 * * High key check *
 		 *
@@ -1152,6 +1208,32 @@ invariant_leq_nontarget_offset(BtreeCheckState *state,
 	return cmp <= 0;
 }
 
+/*
+ * Check whether the index tuple has the appropriate number of attributes.
+ */
+static inline bool
+bt_natts_check(BtreeCheckState *state, OffsetNumber offnum)
+{
+	int16		natts = IndexRelationGetNumberOfAttributes(state->rel);
+	int16		nkeyatts = IndexRelationGetNumberOfKeyAttributes(state->rel);
+	ItemId		itemid;
+	IndexTuple	itup;
+	BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(state->target);
+
+	itemid = PageGetItemId(state->target, offnum);
+	itup = (IndexTuple) PageGetItem(state->target, itemid);
+
+	/*
+	 * Pivot tuples stored on non-leaf pages and high keys of leaf pages
+	 * should have nkeyatts attributes, while regular tuples on leaf pages
+	 * should have natts attributes.
+	 */
+	if (P_ISLEAF(opaque) && offnum >= P_FIRSTDATAKEY(opaque))
+		return (BtreeTupGetNAtts(itup, state->rel) == natts);
+	else
+		return (BtreeTupGetNAtts(itup, state->rel) == nkeyatts);
+}
+
 /*
  * Given a block number of a B-Tree page, return page in palloc()'d memory.
  * While at it, perform some basic checks of the page.
diff --git a/src/backend/access/common/indextuple.c b/src/backend/access/common/indextuple.c
index a58bd95620..ea6ad941ed 100644
--- a/src/backend/access/common/indextuple.c
+++ b/src/backend/access/common/indextuple.c
@@ -448,8 +448,8 @@ CopyIndexTuple(IndexTuple source)
 }
 
 /*
- * Reform index tuple. Truncate nonkey (INCLUDE) attributes.
- * Pass the number of attributes the truncated tuple must contain.
+ * Truncate trailing attributes from the given index tuple, leaving it
+ * with new_indnatts attributes.
  */
 IndexTuple
 index_truncate_tuple(Relation idxrel, IndexTuple olditup, int new_indnatts)
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index 3c73171e09..53aec4fd37 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -1194,7 +1194,7 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright,
 	 */
 	if (indnatts != indnkeyatts && P_ISLEAF(lopaque))
 	{
-		lefthikey = index_truncate_tuple(rel, item, indnkeyatts);
+		lefthikey = _bt_truncate_tuple(rel, item);
 		itemsz = IndexTupleSize(lefthikey);
 		itemsz = MAXALIGN(itemsz);
 	}
@@ -1816,7 +1816,7 @@ _bt_insert_parent(Relation rel,
 
 	/* form an index tuple that points at the new right page */
 	new_item = CopyIndexTuple(ritem);
-	ItemPointerSet(&(new_item->t_tid), rbknum, P_HIKEY);
+	ItemPointerSetBlockNumber(&(new_item->t_tid), rbknum);
 
 	/*
 	 * Find the parent buffer and get the parent page.
@@ -2081,7 +2081,8 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
 	left_item_sz = sizeof(IndexTupleData);
 	left_item = (IndexTuple) palloc(left_item_sz);
 	left_item->t_info = left_item_sz;
-	ItemPointerSet(&(left_item->t_tid), lbkno, P_HIKEY);
+	ItemPointerSetBlockNumber(&(left_item->t_tid), lbkno);
+	BTreeTupSetNAtts(left_item, 0);
 
 	/*
 	 * Create downlink item for right page.  The key for it is obtained from
@@ -2091,7 +2092,7 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
 	right_item_sz = ItemIdGetLength(itemid);
 	item = (IndexTuple) PageGetItem(lpage, itemid);
 	right_item = CopyIndexTuple(item);
-	ItemPointerSet(&(right_item->t_tid), rbkno, P_HIKEY);
+	ItemPointerSetBlockNumber(&(right_item->t_tid), rbkno);
 
 	/* NO EREPORT(ERROR) from here till newroot op is logged */
 	START_CRIT_SECTION();
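
Note on the truncation API above: index_truncate_tuple() (and the _bt_truncate_tuple() wrapper the patch switches _bt_split() to) boils down to rebuilding the tuple through a narrowed tuple descriptor, so the trailing non-key (INCLUDE) attributes fall away. The sketch below illustrates that idea using only the long-standing index_deform_tuple()/index_form_tuple() API; the name truncate_tuple_sketch and the exact sequence are illustrative assumptions, not the patch's implementation.

#include "postgres.h"

#include "access/itup.h"
#include "access/tupdesc.h"
#include "utils/rel.h"

/*
 * Illustrative sketch only: rebuild an index tuple through a descriptor
 * narrowed to "leavenatts" attributes, dropping the trailing (INCLUDE)
 * columns.  The patch's index_truncate_tuple()/_bt_truncate_tuple() are
 * expected to do essentially this; the function name is hypothetical.
 */
static IndexTuple
truncate_tuple_sketch(Relation idxrel, IndexTuple olditup, int leavenatts)
{
	TupleDesc	itupdesc = CreateTupleDescCopy(RelationGetDescr(idxrel));
	Datum		values[INDEX_MAX_KEYS];
	bool		isnull[INDEX_MAX_KEYS];
	IndexTuple	newitup;

	Assert(leavenatts <= itupdesc->natts);

	/* Pull out every attribute, then pretend the descriptor is narrower */
	index_deform_tuple(olditup, itupdesc, values, isnull);
	itupdesc->natts = leavenatts;

	newitup = index_form_tuple(itupdesc, values, isnull);

	/* The heap TID must survive truncation; only the payload shrinks */
	newitup->t_tid = olditup->t_tid;
	Assert(IndexTupleSize(newitup) <= IndexTupleSize(olditup));

	FreeTupleDesc(itupdesc);
	return newitup;
}

Re-forming through a shortened descriptor keeps the tuple header bookkeeping (size bits in t_info, nulls bitmap) consistent automatically, which is why simply chopping bytes off the end of the old tuple would not be enough.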
+ * + * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/nbtree/nbtinsert.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/heapam.h" +#include "access/nbtree.h" +#include "access/nbtxlog.h" +#include "access/transam.h" +#include "access/xloginsert.h" +#include "miscadmin.h" +#include "storage/lmgr.h" +#include "storage/predicate.h" +#include "storage/smgr.h" +#include "utils/tqual.h" + + +typedef struct +{ + /* context data for _bt_checksplitloc */ + Size newitemsz; /* size of new item to be inserted */ + int fillfactor; /* needed when splitting rightmost page */ + bool is_leaf; /* T if splitting a leaf page */ + bool is_rightmost; /* T if splitting a rightmost page */ + OffsetNumber newitemoff; /* where the new item is to be inserted */ + int leftspace; /* space available for items on left page */ + int rightspace; /* space available for items on right page */ + int olddataitemstotal; /* space taken by old items */ + + bool have_split; /* found a valid split? */ + + /* these fields valid only if have_split is true */ + bool newitemonleft; /* new item on left or right of best split */ + OffsetNumber firstright; /* best split point */ + int best_delta; /* best size delta so far */ +} FindSplitData; + + +static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf); + +static TransactionId _bt_check_unique(Relation rel, IndexTuple itup, + Relation heapRel, Buffer buf, OffsetNumber offset, + ScanKey itup_scankey, + IndexUniqueCheck checkUnique, bool *is_unique, + uint32 *speculativeToken); +static void _bt_findinsertloc(Relation rel, + Buffer *bufptr, + OffsetNumber *offsetptr, + int keysz, + ScanKey scankey, + IndexTuple newtup, + BTStack stack, + Relation heapRel); +static void _bt_insertonpg(Relation rel, Buffer buf, Buffer cbuf, + BTStack stack, + IndexTuple itup, + OffsetNumber newitemoff, + bool split_only_page); +static Buffer _bt_split(Relation rel, Buffer buf, Buffer cbuf, + OffsetNumber firstright, OffsetNumber newitemoff, Size newitemsz, + IndexTuple newitem, bool newitemonleft); +static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf, + BTStack stack, bool is_root, bool is_only); +static OffsetNumber _bt_findsplitloc(Relation rel, Page page, + OffsetNumber newitemoff, + Size newitemsz, + bool *newitemonleft); +static void _bt_checksplitloc(FindSplitData *state, + OffsetNumber firstoldonright, bool newitemonleft, + int dataitemstoleft, Size firstoldonrightsz); +static bool _bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum, + int keysz, ScanKey scankey); +static void _bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel); + +/* + * _bt_doinsert() -- Handle insertion of a single index tuple in the tree. + * + * This routine is called by the public interface routine, btinsert. + * By here, itup is filled in, including the TID. + * + * If checkUnique is UNIQUE_CHECK_NO or UNIQUE_CHECK_PARTIAL, this + * will allow duplicates. Otherwise (UNIQUE_CHECK_YES or + * UNIQUE_CHECK_EXISTING) it will throw error for a duplicate. + * For UNIQUE_CHECK_EXISTING we merely run the duplicate check, and + * don't actually insert. + * + * The result value is only significant for UNIQUE_CHECK_PARTIAL: + * it must be true if the entry is known unique, else false. 
+ * (In the current implementation we'll also return true after a + * successful UNIQUE_CHECK_YES or UNIQUE_CHECK_EXISTING call, but + * that's just a coding artifact.) + */ +bool +_bt_doinsert(Relation rel, IndexTuple itup, + IndexUniqueCheck checkUnique, Relation heapRel) +{ + bool is_unique = false; + int indnkeyatts; + ScanKey itup_scankey; + BTStack stack = NULL; + Buffer buf; + OffsetNumber offset; + bool fastpath; + + Assert(IndexRelationGetNumberOfAttributes(rel) != 0); + indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); + Assert(indnkeyatts != 0); + + /* we need an insertion scan key to do our search, so build one */ + itup_scankey = _bt_mkscankey(rel, itup); + + /* + * It's very common to have an index on an auto-incremented or + * monotonically increasing value. In such cases, every insertion happens + * towards the end of the index. We try to optimise that case by caching + * the right-most leaf of the index. If our cached block is still the + * rightmost leaf, has enough free space to accommodate a new entry and + * the insertion key is strictly greater than the first key in this page, + * then we can safely conclude that the new key will be inserted in the + * cached block. So we simply search within the cached block and insert the + * key at the appropriate location. We call it a fastpath. + * + * Testing has revealed, though, that the fastpath can result in increased + * contention on the exclusive-lock on the rightmost leaf page. So we + * conditionally check if the lock is available. If it's not available then + * we simply abandon the fastpath and take the regular path. This makes + * sense because unavailability of the lock also signals that some other + * backend might be concurrently inserting into the page, thus reducing our + * chances to finding an insertion place in this page. + */ +top: + fastpath = false; + offset = InvalidOffsetNumber; + if (RelationGetTargetBlock(rel) != InvalidBlockNumber) + { + Size itemsz; + Page page; + BTPageOpaque lpageop; + + /* + * Conditionally acquire exclusive lock on the buffer before doing any + * checks. If we don't get the lock, we simply follow slowpath. If we + * do get the lock, this ensures that the index state cannot change, as + * far as the rightmost part of the index is concerned. + */ + buf = ReadBuffer(rel, RelationGetTargetBlock(rel)); + + if (ConditionalLockBuffer(buf)) + { + _bt_checkpage(rel, buf); + + page = BufferGetPage(buf); + + lpageop = (BTPageOpaque) PageGetSpecialPointer(page); + itemsz = IndexTupleSize(itup); + itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this + * but we need to be consistent */ + + /* + * Check if the page is still the rightmost leaf page, has enough + * free space to accommodate the new tuple, no split is in progress + * and the scankey is greater than or equal to the first key on the + * page. + */ + if (P_ISLEAF(lpageop) && P_RIGHTMOST(lpageop) && + !P_INCOMPLETE_SPLIT(lpageop) && + !P_IGNORE(lpageop) && + (PageGetFreeSpace(page) > itemsz) && + PageGetMaxOffsetNumber(page) >= P_FIRSTDATAKEY(lpageop) && + _bt_compare(rel, indnkeyatts, itup_scankey, page, + P_FIRSTDATAKEY(lpageop)) > 0) + { + fastpath = true; + } + else + { + _bt_relbuf(rel, buf); + + /* + * Something did not workout. Just forget about the cached + * block and follow the normal path. It might be set again if + * the conditions are favourble. 
+ */ + RelationSetTargetBlock(rel, InvalidBlockNumber); + } + } + else + { + ReleaseBuffer(buf); + + /* + * If someone's holding a lock, it's likely to change anyway, + * so don't try again until we get an updated rightmost leaf. + */ + RelationSetTargetBlock(rel, InvalidBlockNumber); + } + } + + if (!fastpath) + { + /* find the first page containing this key */ + stack = _bt_search(rel, indnkeyatts, itup_scankey, false, &buf, BT_WRITE, + NULL); + + /* trade in our read lock for a write lock */ + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + LockBuffer(buf, BT_WRITE); + + /* + * If the page was split between the time that we surrendered our read + * lock and acquired our write lock, then this page may no longer be + * the right place for the key we want to insert. In this case, we + * need to move right in the tree. See Lehman and Yao for an + * excruciatingly precise description. + */ + buf = _bt_moveright(rel, buf, indnkeyatts, itup_scankey, false, + true, stack, BT_WRITE, NULL); + } + + /* + * If we're not allowing duplicates, make sure the key isn't already in + * the index. + * + * NOTE: obviously, _bt_check_unique can only detect keys that are already + * in the index; so it cannot defend against concurrent insertions of the + * same key. We protect against that by means of holding a write lock on + * the target page. Any other would-be inserter of the same key must + * acquire a write lock on the same target page, so only one would-be + * inserter can be making the check at one time. Furthermore, once we are + * past the check we hold write locks continuously until we have performed + * our insertion, so no later inserter can fail to see our insertion. + * (This requires some care in _bt_insertonpg.) + * + * If we must wait for another xact, we release the lock while waiting, + * and then must start over completely. + * + * For a partial uniqueness check, we don't wait for the other xact. Just + * let the tuple in and return false for possibly non-unique, or true for + * definitely unique. + */ + if (checkUnique != UNIQUE_CHECK_NO) + { + TransactionId xwait; + uint32 speculativeToken; + + offset = _bt_binsrch(rel, buf, indnkeyatts, itup_scankey, false); + xwait = _bt_check_unique(rel, itup, heapRel, buf, offset, itup_scankey, + checkUnique, &is_unique, &speculativeToken); + + if (TransactionIdIsValid(xwait)) + { + /* Have to wait for the other guy ... */ + _bt_relbuf(rel, buf); + + /* + * If it's a speculative insertion, wait for it to finish (ie. to + * go ahead with the insertion, or kill the tuple). Otherwise + * wait for the transaction to finish as usual. + */ + if (speculativeToken) + SpeculativeInsertionWait(xwait, speculativeToken); + else + XactLockTableWait(xwait, rel, &itup->t_tid, XLTW_InsertIndex); + + /* start over... */ + if (stack) + _bt_freestack(stack); + goto top; + } + } + + if (checkUnique != UNIQUE_CHECK_EXISTING) + { + /* + * The only conflict predicate locking cares about for indexes is when + * an index tuple insert conflicts with an existing lock. Since the + * actual location of the insert is hard to predict because of the + * random search used to prevent O(N^2) performance when there are + * many duplicate entries, we can just use the "first valid" page. 
+ */ + CheckForSerializableConflictIn(rel, NULL, buf); + /* do the insertion */ + _bt_findinsertloc(rel, &buf, &offset, indnkeyatts, itup_scankey, itup, + stack, heapRel); + _bt_insertonpg(rel, buf, InvalidBuffer, stack, itup, offset, false); + } + else + { + /* just release the buffer */ + _bt_relbuf(rel, buf); + } + + /* be tidy */ + if (stack) + _bt_freestack(stack); + _bt_freeskey(itup_scankey); + + return is_unique; +} + +/* + * _bt_check_unique() -- Check for violation of unique index constraint + * + * offset points to the first possible item that could conflict. It can + * also point to end-of-page, which means that the first tuple to check + * is the first tuple on the next page. + * + * Returns InvalidTransactionId if there is no conflict, else an xact ID + * we must wait for to see if it commits a conflicting tuple. If an actual + * conflict is detected, no return --- just ereport(). If an xact ID is + * returned, and the conflicting tuple still has a speculative insertion in + * progress, *speculativeToken is set to non-zero, and the caller can wait for + * the verdict on the insertion using SpeculativeInsertionWait(). + * + * However, if checkUnique == UNIQUE_CHECK_PARTIAL, we always return + * InvalidTransactionId because we don't want to wait. In this case we + * set *is_unique to false if there is a potential conflict, and the + * core code must redo the uniqueness check later. + */ +static TransactionId +_bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel, + Buffer buf, OffsetNumber offset, ScanKey itup_scankey, + IndexUniqueCheck checkUnique, bool *is_unique, + uint32 *speculativeToken) +{ + TupleDesc itupdesc = RelationGetDescr(rel); + int indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); + SnapshotData SnapshotDirty; + OffsetNumber maxoff; + Page page; + BTPageOpaque opaque; + Buffer nbuf = InvalidBuffer; + bool found = false; + + /* Assume unique until we find a duplicate */ + *is_unique = true; + + InitDirtySnapshot(SnapshotDirty); + + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + maxoff = PageGetMaxOffsetNumber(page); + + /* + * Scan over all equal tuples, looking for live conflicts. + */ + for (;;) + { + ItemId curitemid; + IndexTuple curitup; + BlockNumber nblkno; + + /* + * make sure the offset points to an actual item before trying to + * examine it... + */ + if (offset <= maxoff) + { + curitemid = PageGetItemId(page, offset); + + /* + * We can skip items that are marked killed. + * + * Formerly, we applied _bt_isequal() before checking the kill + * flag, so as to fall out of the item loop as soon as possible. + * However, in the presence of heavy update activity an index may + * contain many killed items with the same key; running + * _bt_isequal() on each killed item gets expensive. Furthermore + * it is likely that the non-killed version of each key appears + * first, so that we didn't actually get to exit any sooner + * anyway. So now we just advance over killed items as quickly as + * we can. We only apply _bt_isequal() when we get to a non-killed + * item or the end of the page. + */ + if (!ItemIdIsDead(curitemid)) + { + ItemPointerData htid; + bool all_dead; + + /* + * _bt_compare returns 0 for (1,NULL) and (1,NULL) - this's + * how we handling NULLs - and so we must not use _bt_compare + * in real comparison, but only for ordering/finding items on + * pages. 
- vadim 03/24/97 + */ + if (!_bt_isequal(itupdesc, page, offset, indnkeyatts, itup_scankey)) + break; /* we're past all the equal tuples */ + + /* okay, we gotta fetch the heap tuple ... */ + curitup = (IndexTuple) PageGetItem(page, curitemid); + htid = curitup->t_tid; + + /* + * If we are doing a recheck, we expect to find the tuple we + * are rechecking. It's not a duplicate, but we have to keep + * scanning. + */ + if (checkUnique == UNIQUE_CHECK_EXISTING && + ItemPointerCompare(&htid, &itup->t_tid) == 0) + { + found = true; + } + + /* + * We check the whole HOT-chain to see if there is any tuple + * that satisfies SnapshotDirty. This is necessary because we + * have just a single index entry for the entire chain. + */ + else if (heap_hot_search(&htid, heapRel, &SnapshotDirty, + &all_dead)) + { + TransactionId xwait; + + /* + * It is a duplicate. If we are only doing a partial + * check, then don't bother checking if the tuple is being + * updated in another transaction. Just return the fact + * that it is a potential conflict and leave the full + * check till later. + */ + if (checkUnique == UNIQUE_CHECK_PARTIAL) + { + if (nbuf != InvalidBuffer) + _bt_relbuf(rel, nbuf); + *is_unique = false; + return InvalidTransactionId; + } + + /* + * If this tuple is being updated by other transaction + * then we have to wait for its commit/abort. + */ + xwait = (TransactionIdIsValid(SnapshotDirty.xmin)) ? + SnapshotDirty.xmin : SnapshotDirty.xmax; + + if (TransactionIdIsValid(xwait)) + { + if (nbuf != InvalidBuffer) + _bt_relbuf(rel, nbuf); + /* Tell _bt_doinsert to wait... */ + *speculativeToken = SnapshotDirty.speculativeToken; + return xwait; + } + + /* + * Otherwise we have a definite conflict. But before + * complaining, look to see if the tuple we want to insert + * is itself now committed dead --- if so, don't complain. + * This is a waste of time in normal scenarios but we must + * do it to support CREATE INDEX CONCURRENTLY. + * + * We must follow HOT-chains here because during + * concurrent index build, we insert the root TID though + * the actual tuple may be somewhere in the HOT-chain. + * While following the chain we might not stop at the + * exact tuple which triggered the insert, but that's OK + * because if we find a live tuple anywhere in this chain, + * we have a unique key conflict. The other live tuple is + * not part of this chain because it had a different index + * entry. + */ + htid = itup->t_tid; + if (heap_hot_search(&htid, heapRel, SnapshotSelf, NULL)) + { + /* Normal case --- it's still live */ + } + else + { + /* + * It's been deleted, so no error, and no need to + * continue searching + */ + break; + } + + /* + * Check for a conflict-in as we would if we were going to + * write to this page. We aren't actually going to write, + * but we want a chance to report SSI conflicts that would + * otherwise be masked by this unique constraint + * violation. + */ + CheckForSerializableConflictIn(rel, NULL, buf); + + /* + * This is a definite conflict. Break the tuple down into + * datums and report the error. But first, make sure we + * release the buffer locks we're holding --- + * BuildIndexValueDescription could make catalog accesses, + * which in the worst case might touch this same index and + * cause deadlocks. 
+ */ + if (nbuf != InvalidBuffer) + _bt_relbuf(rel, nbuf); + _bt_relbuf(rel, buf); + + { + Datum values[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + char *key_desc; + + index_deform_tuple(itup, RelationGetDescr(rel), + values, isnull); + + key_desc = BuildIndexValueDescription(rel, values, + isnull); + + ereport(ERROR, + (errcode(ERRCODE_UNIQUE_VIOLATION), + errmsg("duplicate key value violates unique constraint \"%s\"", + RelationGetRelationName(rel)), + key_desc ? errdetail("Key %s already exists.", + key_desc) : 0, + errtableconstraint(heapRel, + RelationGetRelationName(rel)))); + } + } + else if (all_dead) + { + /* + * The conflicting tuple (or whole HOT chain) is dead to + * everyone, so we may as well mark the index entry + * killed. + */ + ItemIdMarkDead(curitemid); + opaque->btpo_flags |= BTP_HAS_GARBAGE; + + /* + * Mark buffer with a dirty hint, since state is not + * crucial. Be sure to mark the proper buffer dirty. + */ + if (nbuf != InvalidBuffer) + MarkBufferDirtyHint(nbuf, true); + else + MarkBufferDirtyHint(buf, true); + } + } + } + + /* + * Advance to next tuple to continue checking. + */ + if (offset < maxoff) + offset = OffsetNumberNext(offset); + else + { + /* If scankey == hikey we gotta check the next page too */ + if (P_RIGHTMOST(opaque)) + break; + if (!_bt_isequal(itupdesc, page, P_HIKEY, + indnkeyatts, itup_scankey)) + break; + /* Advance to next non-dead page --- there must be one */ + for (;;) + { + nblkno = opaque->btpo_next; + nbuf = _bt_relandgetbuf(rel, nbuf, nblkno, BT_READ); + page = BufferGetPage(nbuf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + if (!P_IGNORE(opaque)) + break; + if (P_RIGHTMOST(opaque)) + elog(ERROR, "fell off the end of index \"%s\"", + RelationGetRelationName(rel)); + } + maxoff = PageGetMaxOffsetNumber(page); + offset = P_FIRSTDATAKEY(opaque); + } + } + + /* + * If we are doing a recheck then we should have found the tuple we are + * checking. Otherwise there's something very wrong --- probably, the + * index is on a non-immutable expression. + */ + if (checkUnique == UNIQUE_CHECK_EXISTING && !found) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("failed to re-find tuple within index \"%s\"", + RelationGetRelationName(rel)), + errhint("This may be because of a non-immutable index expression."), + errtableconstraint(heapRel, + RelationGetRelationName(rel)))); + + if (nbuf != InvalidBuffer) + _bt_relbuf(rel, nbuf); + + return InvalidTransactionId; +} + + +/* + * _bt_findinsertloc() -- Finds an insert location for a tuple + * + * If the new key is equal to one or more existing keys, we can + * legitimately place it anywhere in the series of equal keys --- in fact, + * if the new key is equal to the page's "high key" we can place it on + * the next page. If it is equal to the high key, and there's not room + * to insert the new tuple on the current page without splitting, then + * we can move right hoping to find more free space and avoid a split. + * (We should not move right indefinitely, however, since that leads to + * O(N^2) insertion behavior in the presence of many equal keys.) + * Once we have chosen the page to put the key on, we'll insert it before + * any existing equal keys because of the way _bt_binsrch() works. + * + * If there's not enough room in the space, we try to make room by + * removing any LP_DEAD tuples. + * + * On entry, *bufptr and *offsetptr point to the first legal position + * where the new tuple could be inserted. The caller should hold an + * exclusive lock on *bufptr. 
*offsetptr can also be set to + * InvalidOffsetNumber, in which case the function will search for the + * right location within the page if needed. On exit, they point to the + * chosen insert location. If _bt_findinsertloc decides to move right, + * the lock and pin on the original page will be released and the new + * page returned to the caller is exclusively locked instead. + * + * newtup is the new tuple we're inserting, and scankey is an insertion + * type scan key for it. + */ +static void +_bt_findinsertloc(Relation rel, + Buffer *bufptr, + OffsetNumber *offsetptr, + int keysz, + ScanKey scankey, + IndexTuple newtup, + BTStack stack, + Relation heapRel) +{ + Buffer buf = *bufptr; + Page page = BufferGetPage(buf); + Size itemsz; + BTPageOpaque lpageop; + bool movedright, + vacuumed; + OffsetNumber newitemoff; + OffsetNumber firstlegaloff = *offsetptr; + + lpageop = (BTPageOpaque) PageGetSpecialPointer(page); + + itemsz = IndexTupleSize(newtup); + itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we + * need to be consistent */ + + /* + * Check whether the item can fit on a btree page at all. (Eventually, we + * ought to try to apply TOAST methods if not.) We actually need to be + * able to fit three items on every page, so restrict any one item to 1/3 + * the per-page available space. Note that at this point, itemsz doesn't + * include the ItemId. + * + * NOTE: if you change this, see also the similar code in _bt_buildadd(). + */ + if (itemsz > BTMaxItemSize(page)) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("index row size %zu exceeds maximum %zu for index \"%s\"", + itemsz, BTMaxItemSize(page), + RelationGetRelationName(rel)), + errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n" + "Consider a function index of an MD5 hash of the value, " + "or use full text indexing."), + errtableconstraint(heapRel, + RelationGetRelationName(rel)))); + + /*---------- + * If we will need to split the page to put the item on this page, + * check whether we can put the tuple somewhere to the right, + * instead. Keep scanning right until we + * (a) find a page with enough free space, + * (b) reach the last page where the tuple can legally go, or + * (c) get tired of searching. + * (c) is not flippant; it is important because if there are many + * pages' worth of equal keys, it's better to split one of the early + * pages than to scan all the way to the end of the run of equal keys + * on every insert. We implement "get tired" as a random choice, + * since stopping after scanning a fixed number of pages wouldn't work + * well (we'd never reach the right-hand side of previously split + * pages). Currently the probability of moving right is set at 0.99, + * which may seem too high to change the behavior much, but it does an + * excellent job of preventing O(N^2) behavior with many equal keys. 
+ *---------- + */ + movedright = false; + vacuumed = false; + while (PageGetFreeSpace(page) < itemsz) + { + Buffer rbuf; + BlockNumber rblkno; + + /* + * before considering moving right, see if we can obtain enough space + * by erasing LP_DEAD items + */ + if (P_ISLEAF(lpageop) && P_HAS_GARBAGE(lpageop)) + { + _bt_vacuum_one_page(rel, buf, heapRel); + + /* + * remember that we vacuumed this page, because that makes the + * hint supplied by the caller invalid + */ + vacuumed = true; + + if (PageGetFreeSpace(page) >= itemsz) + break; /* OK, now we have enough space */ + } + + /* + * nope, so check conditions (b) and (c) enumerated above + */ + if (P_RIGHTMOST(lpageop) || + _bt_compare(rel, keysz, scankey, page, P_HIKEY) != 0 || + random() <= (MAX_RANDOM_VALUE / 100)) + break; + + /* + * step right to next non-dead page + * + * must write-lock that page before releasing write lock on current + * page; else someone else's _bt_check_unique scan could fail to see + * our insertion. write locks on intermediate dead pages won't do + * because we don't know when they will get de-linked from the tree. + */ + rbuf = InvalidBuffer; + + rblkno = lpageop->btpo_next; + for (;;) + { + rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE); + page = BufferGetPage(rbuf); + lpageop = (BTPageOpaque) PageGetSpecialPointer(page); + + /* + * If this page was incompletely split, finish the split now. We + * do this while holding a lock on the left sibling, which is not + * good because finishing the split could be a fairly lengthy + * operation. But this should happen very seldom. + */ + if (P_INCOMPLETE_SPLIT(lpageop)) + { + _bt_finish_split(rel, rbuf, stack); + rbuf = InvalidBuffer; + continue; + } + + if (!P_IGNORE(lpageop)) + break; + if (P_RIGHTMOST(lpageop)) + elog(ERROR, "fell off the end of index \"%s\"", + RelationGetRelationName(rel)); + + rblkno = lpageop->btpo_next; + } + _bt_relbuf(rel, buf); + buf = rbuf; + movedright = true; + vacuumed = false; + } + + /* + * Now we are on the right page, so find the insert position. If we moved + * right at all, we know we should insert at the start of the page. If we + * didn't move right, we can use the firstlegaloff hint if the caller + * supplied one, unless we vacuumed the page which might have moved tuples + * around making the hint invalid. If we didn't move right or can't use + * the hint, find the position by searching. + */ + if (movedright) + newitemoff = P_FIRSTDATAKEY(lpageop); + else if (firstlegaloff != InvalidOffsetNumber && !vacuumed) + newitemoff = firstlegaloff; + else + newitemoff = _bt_binsrch(rel, buf, keysz, scankey, false); + + *bufptr = buf; + *offsetptr = newitemoff; +} + +/*---------- + * _bt_insertonpg() -- Insert a tuple on a particular page in the index. + * + * This recursive procedure does the following things: + * + * + if necessary, splits the target page (making sure that the + * split is equitable as far as post-insert free space goes). + * + inserts the tuple. + * + if the page was split, pops the parent stack, and finds the + * right place to insert the new child pointer (by walking + * right using information stored in the parent stack). + * + invokes itself with the appropriate tuple for the right + * child page on the parent. + * + updates the metapage if a true root or fast root is split. + * + * On entry, we must have the correct buffer in which to do the + * insertion, and the buffer must be pinned and write-locked. On return, + * we will have dropped both the pin and the lock on the buffer. 
+ * + * When inserting to a non-leaf page, 'cbuf' is the left-sibling of the + * page we're inserting the downlink for. This function will clear the + * INCOMPLETE_SPLIT flag on it, and release the buffer. + * + * The locking interactions in this code are critical. You should + * grok Lehman and Yao's paper before making any changes. In addition, + * you need to understand how we disambiguate duplicate keys in this + * implementation, in order to be able to find our location using + * L&Y "move right" operations. Since we may insert duplicate user + * keys, and since these dups may propagate up the tree, we use the + * 'afteritem' parameter to position ourselves correctly for the + * insertion on internal pages. + *---------- + */ +static void +_bt_insertonpg(Relation rel, + Buffer buf, + Buffer cbuf, + BTStack stack, + IndexTuple itup, + OffsetNumber newitemoff, + bool split_only_page) +{ + Page page; + BTPageOpaque lpageop; + OffsetNumber firstright = InvalidOffsetNumber; + Size itemsz; + + page = BufferGetPage(buf); + lpageop = (BTPageOpaque) PageGetSpecialPointer(page); + + /* child buffer must be given iff inserting on an internal page */ + Assert(P_ISLEAF(lpageop) == !BufferIsValid(cbuf)); + + /* The caller should've finished any incomplete splits already. */ + if (P_INCOMPLETE_SPLIT(lpageop)) + elog(ERROR, "cannot insert to incompletely split page %u", + BufferGetBlockNumber(buf)); + + itemsz = IndexTupleSize(itup); + itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we + * need to be consistent */ + + /* + * Do we need to split the page to fit the item on it? + * + * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result, + * so this comparison is correct even though we appear to be accounting + * only for the item and not for its line pointer. + */ + if (PageGetFreeSpace(page) < itemsz) + { + bool is_root = P_ISROOT(lpageop); + bool is_only = P_LEFTMOST(lpageop) && P_RIGHTMOST(lpageop); + bool newitemonleft; + Buffer rbuf; + + /* Choose the split point */ + firstright = _bt_findsplitloc(rel, page, + newitemoff, itemsz, + &newitemonleft); + + /* split the buffer into left and right halves */ + rbuf = _bt_split(rel, buf, cbuf, firstright, + newitemoff, itemsz, itup, newitemonleft); + PredicateLockPageSplit(rel, + BufferGetBlockNumber(buf), + BufferGetBlockNumber(rbuf)); + + /*---------- + * By here, + * + * + our target page has been split; + * + the original tuple has been inserted; + * + we have write locks on both the old (left half) + * and new (right half) buffers, after the split; and + * + we know the key we want to insert into the parent + * (it's the "high key" on the left child page). + * + * We're ready to do the parent insertion. We need to hold onto the + * locks for the child pages until we locate the parent, but we can + * release them before doing the actual insertion (see Lehman and Yao + * for the reasoning). + *---------- + */ + _bt_insert_parent(rel, buf, rbuf, stack, is_root, is_only); + } + else + { + Buffer metabuf = InvalidBuffer; + Page metapg = NULL; + BTMetaPageData *metad = NULL; + OffsetNumber itup_off; + BlockNumber itup_blkno; + + itup_off = newitemoff; + itup_blkno = BufferGetBlockNumber(buf); + + /* + * If we are doing this insert because we split a page that was the + * only one on its tree level, but was not the root, it may have been + * the "fast root". We need to ensure that the fast root link points + * at or above the current page. 
We can safely acquire a lock on the + * metapage here --- see comments for _bt_newroot(). + */ + if (split_only_page) + { + Assert(!P_ISLEAF(lpageop)); + + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE); + metapg = BufferGetPage(metabuf); + metad = BTPageGetMeta(metapg); + + if (metad->btm_fastlevel >= lpageop->btpo.level) + { + /* no update wanted */ + _bt_relbuf(rel, metabuf); + metabuf = InvalidBuffer; + } + } + + /* Do the update. No ereport(ERROR) until changes are logged */ + START_CRIT_SECTION(); + + if (!_bt_pgaddtup(page, itemsz, itup, newitemoff)) + elog(PANIC, "failed to add new item to block %u in index \"%s\"", + itup_blkno, RelationGetRelationName(rel)); + + MarkBufferDirty(buf); + + if (BufferIsValid(metabuf)) + { + metad->btm_fastroot = itup_blkno; + metad->btm_fastlevel = lpageop->btpo.level; + MarkBufferDirty(metabuf); + } + + /* clear INCOMPLETE_SPLIT flag on child if inserting a downlink */ + if (BufferIsValid(cbuf)) + { + Page cpage = BufferGetPage(cbuf); + BTPageOpaque cpageop = (BTPageOpaque) PageGetSpecialPointer(cpage); + + Assert(P_INCOMPLETE_SPLIT(cpageop)); + cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT; + MarkBufferDirty(cbuf); + } + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + xl_btree_insert xlrec; + xl_btree_metadata xlmeta; + uint8 xlinfo; + XLogRecPtr recptr; + IndexTupleData trunctuple; + + xlrec.offnum = itup_off; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfBtreeInsert); + + if (P_ISLEAF(lpageop)) + { + xlinfo = XLOG_BTREE_INSERT_LEAF; + + /* + * Cache the block information if we just inserted into the + * rightmost leaf page of the index. + */ + if (P_RIGHTMOST(lpageop)) + RelationSetTargetBlock(rel, BufferGetBlockNumber(buf)); + } + else + { + /* + * Register the left child whose INCOMPLETE_SPLIT flag was + * cleared. + */ + XLogRegisterBuffer(1, cbuf, REGBUF_STANDARD); + + xlinfo = XLOG_BTREE_INSERT_UPPER; + } + + if (BufferIsValid(metabuf)) + { + xlmeta.root = metad->btm_root; + xlmeta.level = metad->btm_level; + xlmeta.fastroot = metad->btm_fastroot; + xlmeta.fastlevel = metad->btm_fastlevel; + + XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD); + XLogRegisterBufData(2, (char *) &xlmeta, sizeof(xl_btree_metadata)); + + xlinfo = XLOG_BTREE_INSERT_META; + } + + /* Read comments in _bt_pgaddtup */ + XLogRegisterBuffer(0, buf, REGBUF_STANDARD); + if (!P_ISLEAF(lpageop) && newitemoff == P_FIRSTDATAKEY(lpageop)) + { + trunctuple = *itup; + trunctuple.t_info = sizeof(IndexTupleData); + XLogRegisterBufData(0, (char *) &trunctuple, + sizeof(IndexTupleData)); + } + else + XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup)); + + recptr = XLogInsert(RM_BTREE_ID, xlinfo); + + if (BufferIsValid(metabuf)) + { + PageSetLSN(metapg, recptr); + } + if (BufferIsValid(cbuf)) + { + PageSetLSN(BufferGetPage(cbuf), recptr); + } + + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); + + /* release buffers */ + if (BufferIsValid(metabuf)) + _bt_relbuf(rel, metabuf); + if (BufferIsValid(cbuf)) + _bt_relbuf(rel, cbuf); + _bt_relbuf(rel, buf); + } +} + +/* + * _bt_split() -- split a page in the btree. + * + * On entry, buf is the page to split, and is pinned and write-locked. + * firstright is the item index of the first item to be moved to the + * new right page. newitemoff etc. tell us about the new item that + * must be inserted along with the data from the old page. + * + * When splitting a non-leaf page, 'cbuf' is the left-sibling of the + * page we're inserting the downlink for. 
This function will clear the + * INCOMPLETE_SPLIT flag on it, and release the buffer. + * + * Returns the new right sibling of buf, pinned and write-locked. + * The pin and lock on buf are maintained. + */ +static Buffer +_bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright, + OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem, + bool newitemonleft) +{ + Buffer rbuf; + Page origpage; + Page leftpage, + rightpage; + BlockNumber origpagenumber, + rightpagenumber; + BTPageOpaque ropaque, + lopaque, + oopaque; + Buffer sbuf = InvalidBuffer; + Page spage = NULL; + BTPageOpaque sopaque = NULL; + Size itemsz; + ItemId itemid; + IndexTuple item; + OffsetNumber leftoff, + rightoff; + OffsetNumber maxoff; + OffsetNumber i; + bool isleaf; + IndexTuple lefthikey; + int indnatts = IndexRelationGetNumberOfAttributes(rel); + int indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); + + /* Acquire a new page to split into */ + rbuf = _bt_getbuf(rel, P_NEW, BT_WRITE); + + /* + * origpage is the original page to be split. leftpage is a temporary + * buffer that receives the left-sibling data, which will be copied back + * into origpage on success. rightpage is the new page that receives the + * right-sibling data. If we fail before reaching the critical section, + * origpage hasn't been modified and leftpage is only workspace. In + * principle we shouldn't need to worry about rightpage either, because it + * hasn't been linked into the btree page structure; but to avoid leaving + * possibly-confusing junk behind, we are careful to rewrite rightpage as + * zeroes before throwing any error. + */ + origpage = BufferGetPage(buf); + leftpage = PageGetTempPage(origpage); + rightpage = BufferGetPage(rbuf); + + origpagenumber = BufferGetBlockNumber(buf); + rightpagenumber = BufferGetBlockNumber(rbuf); + + _bt_pageinit(leftpage, BufferGetPageSize(buf)); + /* rightpage was already initialized by _bt_getbuf */ + + /* + * Copy the original page's LSN into leftpage, which will become the + * updated version of the page. We need this because XLogInsert will + * examine the LSN and possibly dump it in a page image. + */ + PageSetLSN(leftpage, PageGetLSN(origpage)); + + /* init btree private data */ + oopaque = (BTPageOpaque) PageGetSpecialPointer(origpage); + lopaque = (BTPageOpaque) PageGetSpecialPointer(leftpage); + ropaque = (BTPageOpaque) PageGetSpecialPointer(rightpage); + + isleaf = P_ISLEAF(oopaque); + + /* if we're splitting this page, it won't be the root when we're done */ + /* also, clear the SPLIT_END and HAS_GARBAGE flags in both pages */ + lopaque->btpo_flags = oopaque->btpo_flags; + lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE); + ropaque->btpo_flags = lopaque->btpo_flags; + /* set flag in left page indicating that the right page has no downlink */ + lopaque->btpo_flags |= BTP_INCOMPLETE_SPLIT; + lopaque->btpo_prev = oopaque->btpo_prev; + lopaque->btpo_next = rightpagenumber; + ropaque->btpo_prev = origpagenumber; + ropaque->btpo_next = oopaque->btpo_next; + lopaque->btpo.level = ropaque->btpo.level = oopaque->btpo.level; + /* Since we already have write-lock on both pages, ok to read cycleid */ + lopaque->btpo_cycleid = _bt_vacuum_cycleid(rel); + ropaque->btpo_cycleid = lopaque->btpo_cycleid; + + /* + * If the page we're splitting is not the rightmost page at its level in + * the tree, then the first entry on the page is the high key for the + * page. We need to copy that to the right half. 
Otherwise (meaning the + * rightmost page case), all the items on the right half will be user + * data. + */ + rightoff = P_HIKEY; + + if (!P_RIGHTMOST(oopaque)) + { + itemid = PageGetItemId(origpage, P_HIKEY); + itemsz = ItemIdGetLength(itemid); + item = (IndexTuple) PageGetItem(origpage, itemid); + if (PageAddItem(rightpage, (Item) item, itemsz, rightoff, + false, false) == InvalidOffsetNumber) + { + memset(rightpage, 0, BufferGetPageSize(rbuf)); + elog(ERROR, "failed to add hikey to the right sibling" + " while splitting block %u of index \"%s\"", + origpagenumber, RelationGetRelationName(rel)); + } + rightoff = OffsetNumberNext(rightoff); + } + + /* + * The "high key" for the new left page will be the first key that's going + * to go into the new right page. This might be either the existing data + * item at position firstright, or the incoming tuple. + */ + leftoff = P_HIKEY; + if (!newitemonleft && newitemoff == firstright) + { + /* incoming tuple will become first on right page */ + itemsz = newitemsz; + item = newitem; + } + else + { + /* existing item at firstright will become first on right page */ + itemid = PageGetItemId(origpage, firstright); + itemsz = ItemIdGetLength(itemid); + item = (IndexTuple) PageGetItem(origpage, itemid); + } + + /* + * We must truncate included attributes of the "high key" item, + * before insert it onto the leaf page. It's the only point in insertion + * process, where we perform truncation. All other functions work with + * this high key and do not change it. + */ + if (indnatts != indnkeyatts && P_ISLEAF(lopaque)) + { + lefthikey = _bt_truncate_tuple(rel, item); + itemsz = IndexTupleSize(lefthikey); + itemsz = MAXALIGN(itemsz); + } + else + lefthikey = item; + + if (PageAddItem(leftpage, (Item) lefthikey, itemsz, leftoff, + false, false) == InvalidOffsetNumber) + { + memset(rightpage, 0, BufferGetPageSize(rbuf)); + elog(ERROR, "failed to add hikey to the left sibling" + " while splitting block %u of index \"%s\"", + origpagenumber, RelationGetRelationName(rel)); + } + leftoff = OffsetNumberNext(leftoff); + + /* + * Now transfer all the data items to the appropriate page. + * + * Note: we *must* insert at least the right page's items in item-number + * order, for the benefit of _bt_restore_page(). + */ + maxoff = PageGetMaxOffsetNumber(origpage); + + for (i = P_FIRSTDATAKEY(oopaque); i <= maxoff; i = OffsetNumberNext(i)) + { + itemid = PageGetItemId(origpage, i); + itemsz = ItemIdGetLength(itemid); + item = (IndexTuple) PageGetItem(origpage, itemid); + + /* does new item belong before this one? 
*/ + if (i == newitemoff) + { + if (newitemonleft) + { + if (!_bt_pgaddtup(leftpage, newitemsz, newitem, leftoff)) + { + memset(rightpage, 0, BufferGetPageSize(rbuf)); + elog(ERROR, "failed to add new item to the left sibling" + " while splitting block %u of index \"%s\"", + origpagenumber, RelationGetRelationName(rel)); + } + leftoff = OffsetNumberNext(leftoff); + } + else + { + if (!_bt_pgaddtup(rightpage, newitemsz, newitem, rightoff)) + { + memset(rightpage, 0, BufferGetPageSize(rbuf)); + elog(ERROR, "failed to add new item to the right sibling" + " while splitting block %u of index \"%s\"", + origpagenumber, RelationGetRelationName(rel)); + } + rightoff = OffsetNumberNext(rightoff); + } + } + + /* decide which page to put it on */ + if (i < firstright) + { + if (!_bt_pgaddtup(leftpage, itemsz, item, leftoff)) + { + memset(rightpage, 0, BufferGetPageSize(rbuf)); + elog(ERROR, "failed to add old item to the left sibling" + " while splitting block %u of index \"%s\"", + origpagenumber, RelationGetRelationName(rel)); + } + leftoff = OffsetNumberNext(leftoff); + } + else + { + if (!_bt_pgaddtup(rightpage, itemsz, item, rightoff)) + { + memset(rightpage, 0, BufferGetPageSize(rbuf)); + elog(ERROR, "failed to add old item to the right sibling" + " while splitting block %u of index \"%s\"", + origpagenumber, RelationGetRelationName(rel)); + } + rightoff = OffsetNumberNext(rightoff); + } + } + + /* cope with possibility that newitem goes at the end */ + if (i <= newitemoff) + { + /* + * Can't have newitemonleft here; that would imply we were told to put + * *everything* on the left page, which cannot fit (if it could, we'd + * not be splitting the page). + */ + Assert(!newitemonleft); + if (!_bt_pgaddtup(rightpage, newitemsz, newitem, rightoff)) + { + memset(rightpage, 0, BufferGetPageSize(rbuf)); + elog(ERROR, "failed to add new item to the right sibling" + " while splitting block %u of index \"%s\"", + origpagenumber, RelationGetRelationName(rel)); + } + rightoff = OffsetNumberNext(rightoff); + } + + /* + * We have to grab the right sibling (if any) and fix the prev pointer + * there. We are guaranteed that this is deadlock-free since no other + * writer will be holding a lock on that page and trying to move left, and + * all readers release locks on a page before trying to fetch its + * neighbors. + */ + + if (!P_RIGHTMOST(oopaque)) + { + sbuf = _bt_getbuf(rel, oopaque->btpo_next, BT_WRITE); + spage = BufferGetPage(sbuf); + sopaque = (BTPageOpaque) PageGetSpecialPointer(spage); + if (sopaque->btpo_prev != origpagenumber) + { + memset(rightpage, 0, BufferGetPageSize(rbuf)); + elog(ERROR, "right sibling's left-link doesn't match: " + "block %u links to %u instead of expected %u in index \"%s\"", + oopaque->btpo_next, sopaque->btpo_prev, origpagenumber, + RelationGetRelationName(rel)); + } + + /* + * Check to see if we can set the SPLIT_END flag in the right-hand + * split page; this can save some I/O for vacuum since it need not + * proceed to the right sibling. We can set the flag if the right + * sibling has a different cycleid: that means it could not be part of + * a group of pages that were all split off from the same ancestor + * page. If you're confused, imagine that page A splits to A B and + * then again, yielding A C B, while vacuum is in progress. Tuples + * originally in A could now be in either B or C, hence vacuum must + * examine both pages. But if D, our right sibling, has a different + * cycleid then it could not contain any tuples that were in A when + * the vacuum started. 
+ */ + if (sopaque->btpo_cycleid != ropaque->btpo_cycleid) + ropaque->btpo_flags |= BTP_SPLIT_END; + } + + /* + * Right sibling is locked, new siblings are prepared, but original page + * is not updated yet. + * + * NO EREPORT(ERROR) till right sibling is updated. We can get away with + * not starting the critical section till here because we haven't been + * scribbling on the original page yet; see comments above. + */ + START_CRIT_SECTION(); + + /* + * By here, the original data page has been split into two new halves, and + * these are correct. The algorithm requires that the left page never + * move during a split, so we copy the new left page back on top of the + * original. Note that this is not a waste of time, since we also require + * (in the page management code) that the center of a page always be + * clean, and the most efficient way to guarantee this is just to compact + * the data by reinserting it into a new left page. (XXX the latter + * comment is probably obsolete; but in any case it's good to not scribble + * on the original page until we enter the critical section.) + * + * We need to do this before writing the WAL record, so that XLogInsert + * can WAL log an image of the page if necessary. + */ + PageRestoreTempPage(leftpage, origpage); + /* leftpage, lopaque must not be used below here */ + + MarkBufferDirty(buf); + MarkBufferDirty(rbuf); + + if (!P_RIGHTMOST(ropaque)) + { + sopaque->btpo_prev = rightpagenumber; + MarkBufferDirty(sbuf); + } + + /* + * Clear INCOMPLETE_SPLIT flag on child if inserting the new item finishes + * a split. + */ + if (!isleaf) + { + Page cpage = BufferGetPage(cbuf); + BTPageOpaque cpageop = (BTPageOpaque) PageGetSpecialPointer(cpage); + + cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT; + MarkBufferDirty(cbuf); + } + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + xl_btree_split xlrec; + uint8 xlinfo; + XLogRecPtr recptr; + + xlrec.level = ropaque->btpo.level; + xlrec.firstright = firstright; + xlrec.newitemoff = newitemoff; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfBtreeSplit); + + XLogRegisterBuffer(0, buf, REGBUF_STANDARD); + XLogRegisterBuffer(1, rbuf, REGBUF_WILL_INIT); + /* Log the right sibling, because we've changed its prev-pointer. */ + if (!P_RIGHTMOST(ropaque)) + XLogRegisterBuffer(2, sbuf, REGBUF_STANDARD); + if (BufferIsValid(cbuf)) + XLogRegisterBuffer(3, cbuf, REGBUF_STANDARD); + + /* + * Log the new item, if it was inserted on the left page. (If it was + * put on the right page, we don't need to explicitly WAL log it + * because it's included with all the other items on the right page.) + * Show the new item as belonging to the left page buffer, so that it + * is not stored if XLogInsert decides it needs a full-page image of + * the left page. We store the offset anyway, though, to support + * archive compression of these records. + */ + if (newitemonleft) + XLogRegisterBufData(0, (char *) newitem, MAXALIGN(newitemsz)); + + /* + * We must also log the left page's high key. There are two reasons + * for that: right page's leftmost key is suppressed on non-leaf levels, + * in covering indexes, included columns are truncated from high keys. + * For simplicity, we don't distinguish these cases, but log the high + * key every time. Show it as belonging to the left page buffer, so + * that it is not stored if XLogInsert decides it needs a full-page + * image of the left page. 
+ */ + itemid = PageGetItemId(origpage, P_HIKEY); + item = (IndexTuple) PageGetItem(origpage, itemid); + XLogRegisterBufData(0, (char *) item, MAXALIGN(IndexTupleSize(item))); + + /* + * Log the contents of the right page in the format understood by + * _bt_restore_page(). We set lastrdata->buffer to InvalidBuffer, + * because we're going to recreate the whole page anyway, so it should + * never be stored by XLogInsert. + * + * Direct access to page is not good but faster - we should implement + * some new func in page API. Note we only store the tuples + * themselves, knowing that they were inserted in item-number order + * and so the item pointers can be reconstructed. See comments for + * _bt_restore_page(). + */ + XLogRegisterBufData(1, + (char *) rightpage + ((PageHeader) rightpage)->pd_upper, + ((PageHeader) rightpage)->pd_special - ((PageHeader) rightpage)->pd_upper); + + xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L : XLOG_BTREE_SPLIT_R; + recptr = XLogInsert(RM_BTREE_ID, xlinfo); + + PageSetLSN(origpage, recptr); + PageSetLSN(rightpage, recptr); + if (!P_RIGHTMOST(ropaque)) + { + PageSetLSN(spage, recptr); + } + if (!isleaf) + { + PageSetLSN(BufferGetPage(cbuf), recptr); + } + } + + END_CRIT_SECTION(); + + /* release the old right sibling */ + if (!P_RIGHTMOST(ropaque)) + _bt_relbuf(rel, sbuf); + + /* release the child */ + if (!isleaf) + _bt_relbuf(rel, cbuf); + + /* split's done */ + return rbuf; +} + +/* + * _bt_findsplitloc() -- find an appropriate place to split a page. + * + * The idea here is to equalize the free space that will be on each split + * page, *after accounting for the inserted tuple*. (If we fail to account + * for it, we might find ourselves with too little room on the page that + * it needs to go into!) + * + * If the page is the rightmost page on its level, we instead try to arrange + * to leave the left split page fillfactor% full. In this way, when we are + * inserting successively increasing keys (consider sequences, timestamps, + * etc) we will end up with a tree whose pages are about fillfactor% full, + * instead of the 50% full result that we'd get without this special case. + * This is the same as nbtsort.c produces for a newly-created tree. Note + * that leaf and nonleaf pages use different fillfactors. + * + * We are passed the intended insert position of the new tuple, expressed as + * the offsetnumber of the tuple it must go in front of. (This could be + * maxoff+1 if the tuple is to go at the end.) + * + * We return the index of the first existing tuple that should go on the + * righthand page, plus a boolean indicating whether the new tuple goes on + * the left or right page. The bool is necessary to disambiguate the case + * where firstright == newitemoff. 
+ */ +static OffsetNumber +_bt_findsplitloc(Relation rel, + Page page, + OffsetNumber newitemoff, + Size newitemsz, + bool *newitemonleft) +{ + BTPageOpaque opaque; + OffsetNumber offnum; + OffsetNumber maxoff; + ItemId itemid; + FindSplitData state; + int leftspace, + rightspace, + goodenough, + olddataitemstotal, + olddataitemstoleft; + bool goodenoughfound; + + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */ + newitemsz += sizeof(ItemIdData); + + /* Total free space available on a btree page, after fixed overhead */ + leftspace = rightspace = + PageGetPageSize(page) - SizeOfPageHeaderData - + MAXALIGN(sizeof(BTPageOpaqueData)); + + /* The right page will have the same high key as the old page */ + if (!P_RIGHTMOST(opaque)) + { + itemid = PageGetItemId(page, P_HIKEY); + rightspace -= (int) (MAXALIGN(ItemIdGetLength(itemid)) + + sizeof(ItemIdData)); + } + + /* Count up total space in data items without actually scanning 'em */ + olddataitemstotal = rightspace - (int) PageGetExactFreeSpace(page); + + state.newitemsz = newitemsz; + state.is_leaf = P_ISLEAF(opaque); + state.is_rightmost = P_RIGHTMOST(opaque); + state.have_split = false; + if (state.is_leaf) + state.fillfactor = RelationGetFillFactor(rel, + BTREE_DEFAULT_FILLFACTOR); + else + state.fillfactor = BTREE_NONLEAF_FILLFACTOR; + state.newitemonleft = false; /* these just to keep compiler quiet */ + state.firstright = 0; + state.best_delta = 0; + state.leftspace = leftspace; + state.rightspace = rightspace; + state.olddataitemstotal = olddataitemstotal; + state.newitemoff = newitemoff; + + /* + * Finding the best possible split would require checking all the possible + * split points, because of the high-key and left-key special cases. + * That's probably more work than it's worth; instead, stop as soon as we + * find a "good-enough" split, where good-enough is defined as an + * imbalance in free space of no more than pagesize/16 (arbitrary...) This + * should let us stop near the middle on most pages, instead of plowing to + * the end. + */ + goodenough = leftspace / 16; + + /* + * Scan through the data items and calculate space usage for a split at + * each possible position. + */ + olddataitemstoleft = 0; + goodenoughfound = false; + maxoff = PageGetMaxOffsetNumber(page); + + for (offnum = P_FIRSTDATAKEY(opaque); + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + Size itemsz; + + itemid = PageGetItemId(page, offnum); + itemsz = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData); + + /* + * Will the new item go to left or right of split? + */ + if (offnum > newitemoff) + _bt_checksplitloc(&state, offnum, true, + olddataitemstoleft, itemsz); + + else if (offnum < newitemoff) + _bt_checksplitloc(&state, offnum, false, + olddataitemstoleft, itemsz); + else + { + /* need to try it both ways! */ + _bt_checksplitloc(&state, offnum, true, + olddataitemstoleft, itemsz); + + _bt_checksplitloc(&state, offnum, false, + olddataitemstoleft, itemsz); + } + + /* Abort scan once we find a good-enough choice */ + if (state.have_split && state.best_delta <= goodenough) + { + goodenoughfound = true; + break; + } + + olddataitemstoleft += itemsz; + } + + /* + * If the new item goes as the last item, check for splitting so that all + * the old items go to the left page and the new item goes to the right + * page. 
+ */ + if (newitemoff > maxoff && !goodenoughfound) + _bt_checksplitloc(&state, newitemoff, false, olddataitemstotal, 0); + + /* + * I believe it is not possible to fail to find a feasible split, but just + * in case ... + */ + if (!state.have_split) + elog(ERROR, "could not find a feasible split point for index \"%s\"", + RelationGetRelationName(rel)); + + *newitemonleft = state.newitemonleft; + return state.firstright; +} + +/* + * Subroutine to analyze a particular possible split choice (ie, firstright + * and newitemonleft settings), and record the best split so far in *state. + * + * firstoldonright is the offset of the first item on the original page + * that goes to the right page, and firstoldonrightsz is the size of that + * tuple. firstoldonright can be > max offset, which means that all the old + * items go to the left page and only the new item goes to the right page. + * In that case, firstoldonrightsz is not used. + * + * olddataitemstoleft is the total size of all old items to the left of + * firstoldonright. + */ +static void +_bt_checksplitloc(FindSplitData *state, + OffsetNumber firstoldonright, + bool newitemonleft, + int olddataitemstoleft, + Size firstoldonrightsz) +{ + int leftfree, + rightfree; + Size firstrightitemsz; + bool newitemisfirstonright; + + /* Is the new item going to be the first item on the right page? */ + newitemisfirstonright = (firstoldonright == state->newitemoff + && !newitemonleft); + + if (newitemisfirstonright) + firstrightitemsz = state->newitemsz; + else + firstrightitemsz = firstoldonrightsz; + + /* Account for all the old tuples */ + leftfree = state->leftspace - olddataitemstoleft; + rightfree = state->rightspace - + (state->olddataitemstotal - olddataitemstoleft); + + /* + * The first item on the right page becomes the high key of the left page; + * therefore it counts against left space as well as right space. + */ + leftfree -= firstrightitemsz; + + /* account for the new item */ + if (newitemonleft) + leftfree -= (int) state->newitemsz; + else + rightfree -= (int) state->newitemsz; + + /* + * If we are not on the leaf level, we will be able to discard the key + * data from the first item that winds up on the right page. + */ + if (!state->is_leaf) + rightfree += (int) firstrightitemsz - + (int) (MAXALIGN(sizeof(IndexTupleData)) + sizeof(ItemIdData)); + + /* + * If feasible split point, remember best delta. + */ + if (leftfree >= 0 && rightfree >= 0) + { + int delta; + + if (state->is_rightmost) + { + /* + * If splitting a rightmost page, try to put (100-fillfactor)% of + * free space on left page. See comments for _bt_findsplitloc. + */ + delta = (state->fillfactor * leftfree) + - ((100 - state->fillfactor) * rightfree); + } + else + { + /* Otherwise, aim for equal free space on both sides */ + delta = leftfree - rightfree; + } + + if (delta < 0) + delta = -delta; + if (!state->have_split || delta < state->best_delta) + { + state->have_split = true; + state->newitemonleft = newitemonleft; + state->firstright = firstoldonright; + state->best_delta = delta; + } + } +} + +/* + * _bt_insert_parent() -- Insert downlink into parent after a page split. + * + * On entry, buf and rbuf are the left and right split pages, which we + * still hold write locks on per the L&Y algorithm. We release the + * write locks once we have write lock on the parent page. (Any sooner, + * and it'd be possible for some other process to try to split or delete + * one of these pages, and get confused because it cannot find the downlink.) 
+ *
+ * stack - stack showing how we got here.  May be NULL in cases that don't
+ *		have to be efficient (concurrent ROOT split, WAL recovery)
+ * is_root - we split the true root
+ * is_only - we split a page alone on its level (might have been fast root)
+ */
+static void
+_bt_insert_parent(Relation rel,
+				  Buffer buf,
+				  Buffer rbuf,
+				  BTStack stack,
+				  bool is_root,
+				  bool is_only)
+{
+	/*
+	 * Here we have to do something Lehman and Yao don't talk about: deal with
+	 * a root split and construction of a new root.  If our stack is empty
+	 * then we have just split a node on what had been the root level when we
+	 * descended the tree.  If it was still the root then we perform a
+	 * new-root construction.  If it *wasn't* the root anymore, search to find
+	 * the next higher level that someone constructed meanwhile, and find the
+	 * right place to insert as for the normal case.
+	 *
+	 * If we have to search for the parent level, we do so by re-descending
+	 * from the root.  This is not super-efficient, but it's rare enough not
+	 * to matter.
+	 */
+	if (is_root)
+	{
+		Buffer		rootbuf;
+
+		Assert(stack == NULL);
+		Assert(is_only);
+		/* create a new root node and update the metapage */
+		rootbuf = _bt_newroot(rel, buf, rbuf);
+		/* release the split buffers */
+		_bt_relbuf(rel, rootbuf);
+		_bt_relbuf(rel, rbuf);
+		_bt_relbuf(rel, buf);
+	}
+	else
+	{
+		BlockNumber bknum = BufferGetBlockNumber(buf);
+		BlockNumber rbknum = BufferGetBlockNumber(rbuf);
+		Page		page = BufferGetPage(buf);
+		IndexTuple	new_item;
+		BTStackData fakestack;
+		IndexTuple	ritem;
+		Buffer		pbuf;
+
+		if (stack == NULL)
+		{
+			BTPageOpaque lpageop;
+
+			elog(DEBUG2, "concurrent ROOT page split");
+			lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
+			/* Find the leftmost page at the next level up */
+			pbuf = _bt_get_endpoint(rel, lpageop->btpo.level + 1, false,
+									NULL);
+			/* Set up a phony stack entry pointing there */
+			stack = &fakestack;
+			stack->bts_blkno = BufferGetBlockNumber(pbuf);
+			stack->bts_offset = InvalidOffsetNumber;
+			/* bts_btentry will be initialized below */
+			stack->bts_parent = NULL;
+			_bt_relbuf(rel, pbuf);
+		}
+
+		/* get high key from left page == lowest key on new right page */
+		ritem = (IndexTuple) PageGetItem(page,
+										 PageGetItemId(page, P_HIKEY));
+
+		/* form an index tuple that points at the new right page */
+		new_item = CopyIndexTuple(ritem);
+		ItemPointerSetBlockNumber(&(new_item->t_tid), rbknum);
+
+		/*
+		 * Find the parent buffer and get the parent page.
+		 *
+		 * Oops - if we were moved right then we need to change stack item! We
+		 * want to find parent pointing to where we are, right ? - vadim
+		 * 05/27/97
+		 */
+		ItemPointerSet(&(stack->bts_btentry.t_tid), bknum, P_HIKEY);
+		pbuf = _bt_getstackbuf(rel, stack, BT_WRITE);
+
+		/*
+		 * Now we can unlock the right child.  The left child will be unlocked
+		 * by _bt_insertonpg().
+		 */
+		_bt_relbuf(rel, rbuf);
+
+		/* Check for error only after writing children */
+		if (pbuf == InvalidBuffer)
+			elog(ERROR, "failed to re-find parent key in index \"%s\" for split pages %u/%u",
+				 RelationGetRelationName(rel), bknum, rbknum);
+
+		/* Recursively update the parent */
+		_bt_insertonpg(rel, pbuf, buf, stack->bts_parent,
+					   new_item, stack->bts_offset + 1,
+					   is_only);
+
+		/* be tidy */
+		pfree(new_item);
+	}
+}
+
+/*
+ * _bt_finish_split() -- Finish an incomplete split
+ *
+ * A crash or other failure can leave a split incomplete.  The insertion
+ * routines won't allow insertion on a page that is incompletely split.
+ * Before inserting on such a page, call _bt_finish_split(). + * + * On entry, 'lbuf' must be locked in write-mode. On exit, it is unlocked + * and unpinned. + */ +void +_bt_finish_split(Relation rel, Buffer lbuf, BTStack stack) +{ + Page lpage = BufferGetPage(lbuf); + BTPageOpaque lpageop = (BTPageOpaque) PageGetSpecialPointer(lpage); + Buffer rbuf; + Page rpage; + BTPageOpaque rpageop; + bool was_root; + bool was_only; + + Assert(P_INCOMPLETE_SPLIT(lpageop)); + + /* Lock right sibling, the one missing the downlink */ + rbuf = _bt_getbuf(rel, lpageop->btpo_next, BT_WRITE); + rpage = BufferGetPage(rbuf); + rpageop = (BTPageOpaque) PageGetSpecialPointer(rpage); + + /* Could this be a root split? */ + if (!stack) + { + Buffer metabuf; + Page metapg; + BTMetaPageData *metad; + + /* acquire lock on the metapage */ + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE); + metapg = BufferGetPage(metabuf); + metad = BTPageGetMeta(metapg); + + was_root = (metad->btm_root == BufferGetBlockNumber(lbuf)); + + _bt_relbuf(rel, metabuf); + } + else + was_root = false; + + /* Was this the only page on the level before split? */ + was_only = (P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop)); + + elog(DEBUG1, "finishing incomplete split of %u/%u", + BufferGetBlockNumber(lbuf), BufferGetBlockNumber(rbuf)); + + _bt_insert_parent(rel, lbuf, rbuf, stack, was_root, was_only); +} + +/* + * _bt_getstackbuf() -- Walk back up the tree one step, and find the item + * we last looked at in the parent. + * + * This is possible because we save the downlink from the parent item, + * which is enough to uniquely identify it. Insertions into the parent + * level could cause the item to move right; deletions could cause it + * to move left, but not left of the page we previously found it in. + * + * Adjusts bts_blkno & bts_offset if changed. + * + * Returns InvalidBuffer if item not found (should not happen). + */ +Buffer +_bt_getstackbuf(Relation rel, BTStack stack, int access) +{ + BlockNumber blkno; + OffsetNumber start; + + blkno = stack->bts_blkno; + start = stack->bts_offset; + + for (;;) + { + Buffer buf; + Page page; + BTPageOpaque opaque; + + buf = _bt_getbuf(rel, blkno, access); + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + if (access == BT_WRITE && P_INCOMPLETE_SPLIT(opaque)) + { + _bt_finish_split(rel, buf, stack->bts_parent); + continue; + } + + if (!P_IGNORE(opaque)) + { + OffsetNumber offnum, + minoff, + maxoff; + ItemId itemid; + IndexTuple item; + + minoff = P_FIRSTDATAKEY(opaque); + maxoff = PageGetMaxOffsetNumber(page); + + /* + * start = InvalidOffsetNumber means "search the whole page". We + * need this test anyway due to possibility that page has a high + * key now when it didn't before. + */ + if (start < minoff) + start = minoff; + + /* + * Need this check too, to guard against possibility that page + * split since we visited it originally. + */ + if (start > maxoff) + start = OffsetNumberNext(maxoff); + + /* + * These loops will check every item on the page --- but in an + * order that's attuned to the probability of where it actually + * is. Scan to the right first, then to the left. 
+ */ + for (offnum = start; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + itemid = PageGetItemId(page, offnum); + item = (IndexTuple) PageGetItem(page, itemid); + if (BTEntrySame(item, &stack->bts_btentry)) + { + /* Return accurate pointer to where link is now */ + stack->bts_blkno = blkno; + stack->bts_offset = offnum; + return buf; + } + } + + for (offnum = OffsetNumberPrev(start); + offnum >= minoff; + offnum = OffsetNumberPrev(offnum)) + { + itemid = PageGetItemId(page, offnum); + item = (IndexTuple) PageGetItem(page, itemid); + if (BTEntrySame(item, &stack->bts_btentry)) + { + /* Return accurate pointer to where link is now */ + stack->bts_blkno = blkno; + stack->bts_offset = offnum; + return buf; + } + } + } + + /* + * The item we're looking for moved right at least one page. + */ + if (P_RIGHTMOST(opaque)) + { + _bt_relbuf(rel, buf); + return InvalidBuffer; + } + blkno = opaque->btpo_next; + start = InvalidOffsetNumber; + _bt_relbuf(rel, buf); + } +} + +/* + * _bt_newroot() -- Create a new root page for the index. + * + * We've just split the old root page and need to create a new one. + * In order to do this, we add a new root page to the file, then lock + * the metadata page and update it. This is guaranteed to be deadlock- + * free, because all readers release their locks on the metadata page + * before trying to lock the root, and all writers lock the root before + * trying to lock the metadata page. We have a write lock on the old + * root page, so we have not introduced any cycles into the waits-for + * graph. + * + * On entry, lbuf (the old root) and rbuf (its new peer) are write- + * locked. On exit, a new root page exists with entries for the + * two new children, metapage is updated and unlocked/unpinned. + * The new root buffer is returned to caller which has to unlock/unpin + * lbuf, rbuf & rootbuf. + */ +static Buffer +_bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) +{ + Buffer rootbuf; + Page lpage, + rootpage; + BlockNumber lbkno, + rbkno; + BlockNumber rootblknum; + BTPageOpaque rootopaque; + BTPageOpaque lopaque; + ItemId itemid; + IndexTuple item; + IndexTuple left_item; + Size left_item_sz; + IndexTuple right_item; + Size right_item_sz; + Buffer metabuf; + Page metapg; + BTMetaPageData *metad; + + lbkno = BufferGetBlockNumber(lbuf); + rbkno = BufferGetBlockNumber(rbuf); + lpage = BufferGetPage(lbuf); + lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage); + + /* get a new root page */ + rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE); + rootpage = BufferGetPage(rootbuf); + rootblknum = BufferGetBlockNumber(rootbuf); + + /* acquire lock on the metapage */ + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE); + metapg = BufferGetPage(metabuf); + metad = BTPageGetMeta(metapg); + + /* + * Create downlink item for left page (old root). Since this will be the + * first item in a non-leaf page, it implicitly has minus-infinity key + * value, so we need not store any actual key in it. + */ + left_item_sz = sizeof(IndexTupleData); + left_item = (IndexTuple) palloc(left_item_sz); + left_item->t_info = left_item_sz; + ItemPointerSet(&(left_item->t_tid), lbkno, P_HIKEY); + + /* + * Create downlink item for right page. The key for it is obtained from + * the "high key" position in the left page. 
+ */ + itemid = PageGetItemId(lpage, P_HIKEY); + right_item_sz = ItemIdGetLength(itemid); + item = (IndexTuple) PageGetItem(lpage, itemid); + right_item = CopyIndexTuple(item); + ItemPointerSet(&(right_item->t_tid), rbkno, P_HIKEY); + + /* NO EREPORT(ERROR) from here till newroot op is logged */ + START_CRIT_SECTION(); + + /* set btree special data */ + rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage); + rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE; + rootopaque->btpo_flags = BTP_ROOT; + rootopaque->btpo.level = + ((BTPageOpaque) PageGetSpecialPointer(lpage))->btpo.level + 1; + rootopaque->btpo_cycleid = 0; + + /* update metapage data */ + metad->btm_root = rootblknum; + metad->btm_level = rootopaque->btpo.level; + metad->btm_fastroot = rootblknum; + metad->btm_fastlevel = rootopaque->btpo.level; + + /* + * Insert the left page pointer into the new root page. The root page is + * the rightmost page on its level so there is no "high key" in it; the + * two items will go into positions P_HIKEY and P_FIRSTKEY. + * + * Note: we *must* insert the two items in item-number order, for the + * benefit of _bt_restore_page(). + */ + if (PageAddItem(rootpage, (Item) left_item, left_item_sz, P_HIKEY, + false, false) == InvalidOffsetNumber) + elog(PANIC, "failed to add leftkey to new root page" + " while splitting block %u of index \"%s\"", + BufferGetBlockNumber(lbuf), RelationGetRelationName(rel)); + + /* + * insert the right page pointer into the new root page. + */ + if (PageAddItem(rootpage, (Item) right_item, right_item_sz, P_FIRSTKEY, + false, false) == InvalidOffsetNumber) + elog(PANIC, "failed to add rightkey to new root page" + " while splitting block %u of index \"%s\"", + BufferGetBlockNumber(lbuf), RelationGetRelationName(rel)); + + /* Clear the incomplete-split flag in the left child */ + Assert(P_INCOMPLETE_SPLIT(lopaque)); + lopaque->btpo_flags &= ~BTP_INCOMPLETE_SPLIT; + MarkBufferDirty(lbuf); + + MarkBufferDirty(rootbuf); + MarkBufferDirty(metabuf); + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + xl_btree_newroot xlrec; + XLogRecPtr recptr; + xl_btree_metadata md; + + xlrec.rootblk = rootblknum; + xlrec.level = metad->btm_level; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfBtreeNewroot); + + XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT); + XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD); + XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD); + + md.root = rootblknum; + md.level = metad->btm_level; + md.fastroot = rootblknum; + md.fastlevel = metad->btm_level; + + XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata)); + + /* + * Direct access to page is not good but faster - we should implement + * some new func in page API. + */ + XLogRegisterBufData(0, + (char *) rootpage + ((PageHeader) rootpage)->pd_upper, + ((PageHeader) rootpage)->pd_special - + ((PageHeader) rootpage)->pd_upper); + + recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT); + + PageSetLSN(lpage, recptr); + PageSetLSN(rootpage, recptr); + PageSetLSN(metapg, recptr); + } + + END_CRIT_SECTION(); + + /* done with metapage */ + _bt_relbuf(rel, metabuf); + + pfree(left_item); + pfree(right_item); + + return rootbuf; +} + +/* + * _bt_pgaddtup() -- add a tuple to a particular page in the index. + * + * This routine adds the tuple to the page as requested. It does + * not affect pin/lock status, but you'd better have a write lock + * and pin on the target buffer! Don't forget to write and release + * the buffer afterwards, either. 
+ * + * The main difference between this routine and a bare PageAddItem call + * is that this code knows that the leftmost index tuple on a non-leaf + * btree page doesn't need to have a key. Therefore, it strips such + * tuples down to just the tuple header. CAUTION: this works ONLY if + * we insert the tuples in order, so that the given itup_off does + * represent the final position of the tuple! + */ +bool +_bt_pgaddtup(Page page, + Size itemsize, + IndexTuple itup, + OffsetNumber itup_off) +{ + BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page); + IndexTupleData trunctuple; + + if (!P_ISLEAF(opaque) && itup_off == P_FIRSTDATAKEY(opaque)) + { + trunctuple = *itup; + trunctuple.t_info = sizeof(IndexTupleData); + itup = &trunctuple; + itemsize = sizeof(IndexTupleData); + } + + if (PageAddItem(page, (Item) itup, itemsize, itup_off, + false, false) == InvalidOffsetNumber) + return false; + + return true; +} + +/* + * _bt_isequal - used in _bt_doinsert in check for duplicates. + * + * This is very similar to _bt_compare, except for NULL handling. + * Rule is simple: NOT_NULL not equal NULL, NULL not equal NULL too. + */ +static bool +_bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum, + int keysz, ScanKey scankey) +{ + IndexTuple itup; + int i; + + /* Better be comparing to a leaf item */ + Assert(P_ISLEAF((BTPageOpaque) PageGetSpecialPointer(page))); + + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); + + for (i = 1; i <= keysz; i++) + { + AttrNumber attno; + Datum datum; + bool isNull; + int32 result; + + attno = scankey->sk_attno; + Assert(attno == i); + datum = index_getattr(itup, attno, itupdesc, &isNull); + + /* NULLs are never equal to anything */ + if (isNull || (scankey->sk_flags & SK_ISNULL)) + return false; + + result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func, + scankey->sk_collation, + datum, + scankey->sk_argument)); + + if (result != 0) + return false; + + scankey++; + } + + /* if we get here, the keys are equal */ + return true; +} + +/* + * _bt_vacuum_one_page - vacuum just one index page. + * + * Try to remove LP_DEAD items from the given page. The passed buffer + * must be exclusive-locked, but unlike a real VACUUM, we don't need a + * super-exclusive "cleanup" lock (see nbtree/README). + */ +static void +_bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel) +{ + OffsetNumber deletable[MaxOffsetNumber]; + int ndeletable = 0; + OffsetNumber offnum, + minoff, + maxoff; + Page page = BufferGetPage(buffer); + BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + /* + * Scan over all items to see which ones need to be deleted according to + * LP_DEAD flags. + */ + minoff = P_FIRSTDATAKEY(opaque); + maxoff = PageGetMaxOffsetNumber(page); + for (offnum = minoff; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemId = PageGetItemId(page, offnum); + + if (ItemIdIsDead(itemId)) + deletable[ndeletable++] = offnum; + } + + if (ndeletable > 0) + _bt_delitems_delete(rel, buffer, deletable, ndeletable, heapRel); + + /* + * Note: if we didn't find any LP_DEAD items, then the page's + * BTP_HAS_GARBAGE hint bit is falsely set. We do not bother expending a + * separate write to clear it, however. We will clear it when we split + * the page. 
+ */
+}
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index e6bfb18e7b..6d3637921c 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -985,7 +985,7 @@ _bt_lock_branch_parent(Relation rel, BlockNumber child, BTStack stack,
 	 * Locate the downlink of "child" in the parent (updating the stack entry
 	 * if needed)
 	 */
-	ItemPointerSet(&(stack->bts_btentry.t_tid), child, P_HIKEY);
+	ItemPointerSetBlockNumber(&(stack->bts_btentry.t_tid), child);
 	pbuf = _bt_getstackbuf(rel, stack, BT_WRITE);
 	if (pbuf == InvalidBuffer)
 		elog(ERROR, "failed to re-find parent key in index \"%s\" for deletion target page %u",
@@ -1425,7 +1425,7 @@ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack)
 
 	itemid = PageGetItemId(page, topoff);
 	itup = (IndexTuple) PageGetItem(page, itemid);
-	ItemPointerSet(&(itup->t_tid), rightsib, P_HIKEY);
+	ItemPointerSetBlockNumber(&(itup->t_tid), rightsib);
 
 	nextoffset = OffsetNumberNext(topoff);
 	PageIndexTupleDelete(page, nextoffset);
@@ -1444,7 +1444,7 @@ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack)
 	MemSet(&trunctuple, 0, sizeof(IndexTupleData));
 	trunctuple.t_info = sizeof(IndexTupleData);
 	if (target != leafblkno)
-		ItemPointerSet(&trunctuple.t_tid, target, P_HIKEY);
+		ItemPointerSetBlockNumber(&trunctuple.t_tid, target);
 	else
 		ItemPointerSetInvalid(&trunctuple.t_tid);
 	if (PageAddItem(page, (Item) &trunctuple, sizeof(IndexTupleData), P_HIKEY,
@@ -1763,7 +1763,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty)
 		if (nextchild == InvalidBlockNumber)
 			ItemPointerSetInvalid(leafhikey);
 		else
-			ItemPointerSet(leafhikey, nextchild, P_HIKEY);
+			ItemPointerSetBlockNumber(leafhikey, nextchild);
 	}
 
 /*
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c
index d19348a206..91441b467c 100644
--- a/src/backend/access/nbtree/nbtsort.c
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -899,7 +899,7 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
 		 * it will be that in the future. Now the purpose is just to save
 		 * more space on inner pages of btree.
 		 */
-		keytup = index_truncate_tuple(wstate->index, oitup, indnkeyatts);
+		keytup = _bt_truncate_tuple(wstate->index, oitup);
 
 		/* delete "wrong" high key, insert keytup as P_HIKEY.
 */
 		PageIndexTupleDelete(opage, P_HIKEY);
@@ -918,7 +918,7 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
 			state->btps_next = _bt_pagestate(wstate, state->btps_level + 1);
 
 		Assert(state->btps_minkey != NULL);
-		ItemPointerSet(&(state->btps_minkey->t_tid), oblkno, P_HIKEY);
+		ItemPointerSetBlockNumber(&(state->btps_minkey->t_tid), oblkno);
 		_bt_buildadd(wstate, state->btps_next, state->btps_minkey);
 		pfree(state->btps_minkey);
@@ -972,8 +972,7 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
 		 * into the parent page as a downlink
 		 */
 		if (indnkeyatts != indnatts && P_ISLEAF(pageop))
-			state->btps_minkey = index_truncate_tuple(wstate->index,
-													  itup, indnkeyatts);
+			state->btps_minkey = _bt_truncate_tuple(wstate->index, itup);
 		else
 			state->btps_minkey = CopyIndexTuple(itup);
 	}
@@ -1028,7 +1027,7 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
 		else
 		{
 			Assert(s->btps_minkey != NULL);
-			ItemPointerSet(&(s->btps_minkey->t_tid), blkno, P_HIKEY);
+			ItemPointerSetBlockNumber(&(s->btps_minkey->t_tid), blkno);
 			_bt_buildadd(wstate, s->btps_next, s->btps_minkey);
 			pfree(s->btps_minkey);
 			s->btps_minkey = NULL;
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index 2fc5924bf0..149b52e3ad 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -2078,3 +2078,23 @@ btproperty(Oid index_oid, int attno,
 			return false;		/* punt to generic code */
 	}
 }
+
+/*
+ * _bt_truncate_tuple() -- remove non-key (INCLUDE) attributes from an
+ *		index tuple.
+ *
+ * Transforms an ordinary B-tree leaf index tuple into a pivot tuple, to be
+ * used either as a high key or as a non-leaf tuple with a downlink.  Note
+ * that the t_tid offset will be overwritten in order to store the number of
+ * attributes present in the tuple.
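+ *
+ * For example, in an index built on (a, b) INCLUDE (c), a leaf tuple
+ * holding (a, b, c) is reduced to a pivot tuple holding only (a, b), whose
+ * stored attribute count is set to 2.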
+ */
+IndexTuple
+_bt_truncate_tuple(Relation idxrel, IndexTuple olditup)
+{
+	IndexTuple	newitup;
+	int			nkeyattrs = IndexRelationGetNumberOfKeyAttributes(idxrel);
+
+	newitup = index_truncate_tuple(idxrel, olditup, nkeyattrs);
+	BTreeTupSetNAtts(newitup, nkeyattrs);
+
+	return newitup;
+}
diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c
index bbfe860e36..e09a389181 100644
--- a/src/backend/access/nbtree/nbtxlog.c
+++ b/src/backend/access/nbtree/nbtxlog.c
@@ -764,7 +764,7 @@ btree_xlog_mark_page_halfdead(uint8 info, XLogReaderState *record)
 
 	itemid = PageGetItemId(page, poffset);
 	itup = (IndexTuple) PageGetItem(page, itemid);
-	ItemPointerSet(&(itup->t_tid), rightsib, P_HIKEY);
+	ItemPointerSetBlockNumber(&(itup->t_tid), rightsib);
 
 	nextoffset = OffsetNumberNext(poffset);
 	PageIndexTupleDelete(page, nextoffset);
@@ -794,7 +794,7 @@ btree_xlog_mark_page_halfdead(uint8 info, XLogReaderState *record)
 	MemSet(&trunctuple, 0, sizeof(IndexTupleData));
 	trunctuple.t_info = sizeof(IndexTupleData);
 	if (xlrec->topparent != InvalidBlockNumber)
-		ItemPointerSet(&trunctuple.t_tid, xlrec->topparent, P_HIKEY);
+		ItemPointerSetBlockNumber(&trunctuple.t_tid, xlrec->topparent);
 	else
 		ItemPointerSetInvalid(&trunctuple.t_tid);
 	if (PageAddItem(page, (Item) &trunctuple, sizeof(IndexTupleData), P_HIKEY,
@@ -904,7 +904,7 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record)
 	MemSet(&trunctuple, 0, sizeof(IndexTupleData));
 	trunctuple.t_info = sizeof(IndexTupleData);
 	if (xlrec->topparent != InvalidBlockNumber)
-		ItemPointerSet(&trunctuple.t_tid, xlrec->topparent, P_HIKEY);
+		ItemPointerSetBlockNumber(&trunctuple.t_tid, xlrec->topparent);
 	else
 		ItemPointerSetInvalid(&trunctuple.t_tid);
 	if (PageAddItem(page, (Item) &trunctuple, sizeof(IndexTupleData), P_HIKEY,
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 053f8aa345..6d6b22fafb 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -151,11 +151,8 @@ typedef struct BTMetaPageData
  * as unique identifier for a given index tuple (logical position
  * within a level). - vadim 04/09/97
  */
-#define BTTidSame(i1, i2) \
-	((ItemPointerGetBlockNumber(&(i1)) == ItemPointerGetBlockNumber(&(i2))) && \
-	(ItemPointerGetOffsetNumber(&(i1)) == ItemPointerGetOffsetNumber(&(i2))))
 #define BTEntrySame(i1, i2) \
-	BTTidSame((i1)->t_tid, (i2)->t_tid)
+	((ItemPointerGetBlockNumber(&(i1)->t_tid) == ItemPointerGetBlockNumber(&(i2)->t_tid)))
 
 /*
@@ -206,6 +203,33 @@ typedef struct BTMetaPageData
 
 #define P_FIRSTDATAKEY(opaque)	(P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY)
 
+/*
+ * In a B-tree index with an INCLUDE clause, the pivot tuples used in
+ * non-leaf pages and as high keys are truncated, so they don't contain the
+ * included attributes.  In order to keep on-disk compatibility with the
+ * upcoming suffix truncation of pivot tuples, we store the number of
+ * attributes present inside the tuple itself.  Thankfully, the offset
+ * number is always unused in a pivot tuple, so we use the high bit of the
+ * offset as a flag indicating that the offset has an alternative meaning:
+ * it stores the number of key attributes present in the index tuple
+ * (12 bits are more than enough for that), and 3 bits remain reserved for
+ * future use.
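+ *
+ * For example, a pivot tuple truncated to two key attributes stores 0x8002
+ * in its t_tid offset: the high bit (BT_ALT_OFFSET_FLAG) marks the
+ * alternative meaning, and the low 12 bits (BT_N_KEYS_OFFSET_MASK) hold
+ * the attribute count itself.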
+ */
+#define BT_ALT_OFFSET_FLAG		0x8000	/* flag indicating that the t_tid
+										 * offset has an alternative meaning */
+#define BT_N_KEYS_OFFSET_MASK	0x0FFF	/* mask of the t_tid offset bits
+										 * holding the number of attributes
+										 * actually present in the tuple */
+
+/* Set the number of attributes in a B-tree index tuple, overwriting the
+ * t_tid offset */
+#define BTreeTupSetNAtts(itup, n) \
+	ItemPointerSetOffsetNumber(&(itup)->t_tid, (n) | BT_ALT_OFFSET_FLAG)
+/* Get the number of attributes in a B-tree index tuple */
+#define BtreeTupGetNAtts(itup, index)	\
+	(ItemPointerGetOffsetNumber(&(itup)->t_tid) & BT_ALT_OFFSET_FLAG ? \
+	 ItemPointerGetOffsetNumber(&(itup)->t_tid) & BT_N_KEYS_OFFSET_MASK : \
+	 IndexRelationGetNumberOfAttributes(index))
+
+
 /*
  * Operator strategy numbers for B-tree have been moved to access/stratnum.h,
  * because many places need to use them in ScanKeyInit() calls.
@@ -545,6 +569,7 @@ extern bytea *btoptions(Datum reloptions, bool validate);
 extern bool btproperty(Oid index_oid, int attno,
 		   IndexAMProperty prop, const char *propname,
 		   bool *res, bool *isnull);
+extern IndexTuple _bt_truncate_tuple(Relation idxrel, IndexTuple olditup);
 
 /*
  * prototypes for functions in nbtvalidate.c
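To see the attribute-count encoding end to end, here is a minimal standalone C sketch of the same flag-and-mask scheme defined in nbtree.h above. It is not part of the patch: the MiniTuple type and the mini_* helpers are hypothetical stand-ins for the real IndexTuple and ItemPointer machinery, used only for illustration.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-ins for BT_ALT_OFFSET_FLAG and BT_N_KEYS_OFFSET_MASK above */
#define ALT_OFFSET_FLAG		0x8000u
#define N_KEYS_OFFSET_MASK	0x0FFFu

/* Simplified index tuple: only the 16-bit t_tid offset field matters here */
typedef struct MiniTuple
{
	uint16_t	tid_offset;
} MiniTuple;

/* Analog of BTreeTupSetNAtts: stash the attribute count in the offset */
static void
mini_set_natts(MiniTuple *tup, uint16_t natts)
{
	assert(natts <= N_KEYS_OFFSET_MASK);
	tup->tid_offset = (uint16_t) (natts | ALT_OFFSET_FLAG);
}

/*
 * Analog of BtreeTupGetNAtts: if the flag bit is set, the low 12 bits hold
 * the count; otherwise the tuple predates the encoding and keeps a plain
 * offset there, so fall back to the index's total attribute count.
 */
static uint16_t
mini_get_natts(const MiniTuple *tup, uint16_t index_natts)
{
	if (tup->tid_offset & ALT_OFFSET_FLAG)
		return (uint16_t) (tup->tid_offset & N_KEYS_OFFSET_MASK);
	return index_natts;
}

int
main(void)
{
	MiniTuple	pivot = {0};
	MiniTuple	legacy = {7};	/* pre-encoding tuple: plain offset, flag clear */

	mini_set_natts(&pivot, 2);	/* pivot truncated to 2 key attributes */

	printf("raw offset: 0x%04x\n", (unsigned) pivot.tid_offset);		/* 0x8002 */
	printf("pivot natts: %u\n", (unsigned) mini_get_natts(&pivot, 4));	/* 2 */
	printf("legacy natts: %u\n", (unsigned) mini_get_natts(&legacy, 4));	/* 4 */
	return 0;
}

Run as written, the sketch prints 0x8002, 2, and 4, mirroring what BTreeTupSetNAtts and BtreeTupGetNAtts would report for a truncated pivot tuple and for a tuple written before this encoding existed; the fallback branch is what keeps old on-disk tuples readable.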