diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c
index fb472b38f1..34c44e4d27 100644
--- a/contrib/amcheck/verify_nbtree.c
+++ b/contrib/amcheck/verify_nbtree.c
@@ -113,6 +113,7 @@ static inline bool invariant_leq_nontarget_offset(BtreeCheckState *state,
 							   Page other,
 							   ScanKey key,
 							   OffsetNumber upperbound);
+static inline bool bt_natts_check(BtreeCheckState *state, OffsetNumber offnum);
 static Page palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum);
 
 /*
@@ -560,6 +561,38 @@ bt_target_page_check(BtreeCheckState *state)
 	elog(DEBUG2, "verifying %u items on %s block %u", max,
 		 P_ISLEAF(topaque) ? "leaf" : "internal", state->targetblock);
 
+
+	/* Check the number of attributes in the high key, if any */
+	if (!P_RIGHTMOST(topaque))
+	{
+		if (!bt_natts_check(state, P_HIKEY))
+		{
+			ItemId		itemid;
+			IndexTuple	itup;
+			char	   *itid,
+					   *htid;
+
+			itemid = PageGetItemId(state->target, P_HIKEY);
+			itup = (IndexTuple) PageGetItem(state->target, itemid);
+			itid = psprintf("(%u,%u)", state->targetblock, P_HIKEY);
+			htid = psprintf("(%u,%u)",
+							ItemPointerGetBlockNumber(&(itup->t_tid)),
+							ItemPointerGetOffsetNumber(&(itup->t_tid)));
+
+			ereport(ERROR,
+					(errcode(ERRCODE_INDEX_CORRUPTED),
+					 errmsg("wrong number of index tuple attributes for index \"%s\"",
+							RelationGetRelationName(state->rel)),
+					 errdetail_internal("Index tid=%s points to %s tid=%s page lsn=%X/%X.",
+										itid,
+										P_ISLEAF(topaque) ? "heap" : "index",
+										htid,
+										(uint32) (state->targetlsn >> 32),
+										(uint32) state->targetlsn)));
+		}
+	}
+
+
 	/*
 	 * Loop over page items, starting from first non-highkey item, not high
 	 * key (if any).  Also, immediately skip "negative infinity" real item (if
@@ -587,6 +620,29 @@ bt_target_page_check(BtreeCheckState *state)
 		itup = (IndexTuple) PageGetItem(state->target, itemid);
 		skey = _bt_mkscankey(state->rel, itup);
 
+		/* Check the number of index tuple attributes */
+		if (!bt_natts_check(state, offset))
+		{
+			char	   *itid,
+					   *htid;
+
+			itid = psprintf("(%u,%u)", state->targetblock, offset);
+			htid = psprintf("(%u,%u)",
+							ItemPointerGetBlockNumber(&(itup->t_tid)),
+							ItemPointerGetOffsetNumber(&(itup->t_tid)));
+
+			ereport(ERROR,
+					(errcode(ERRCODE_INDEX_CORRUPTED),
+					 errmsg("wrong number of index tuple attributes for index \"%s\"",
+							RelationGetRelationName(state->rel)),
+					 errdetail_internal("Index tid=%s points to %s tid=%s page lsn=%X/%X.",
+										itid,
+										P_ISLEAF(topaque) ? "heap" : "index",
+										htid,
+										(uint32) (state->targetlsn >> 32),
+										(uint32) state->targetlsn)));
+		}
+
 		/*
 		 * * High key check *
 		 *
@@ -1152,6 +1208,32 @@ invariant_leq_nontarget_offset(BtreeCheckState *state,
 	return cmp <= 0;
 }
 
+/*
+ * Check whether the index tuple has the appropriate number of attributes.
+ */
+static inline bool
+bt_natts_check(BtreeCheckState *state, OffsetNumber offnum)
+{
+	int16		natts = IndexRelationGetNumberOfAttributes(state->rel);
+	int16		nkeyatts = IndexRelationGetNumberOfKeyAttributes(state->rel);
+	ItemId		itemid;
+	IndexTuple	itup;
+	BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(state->target);
+
+	itemid = PageGetItemId(state->target, offnum);
+	itup = (IndexTuple) PageGetItem(state->target, itemid);
+
+	/*
+	 * Pivot tuples stored on non-leaf pages and high keys of leaf pages
+	 * should have nkeyatts attributes, while regular tuples on leaf pages
+	 * should have natts attributes.
+	 */
+	if (P_ISLEAF(opaque) && offnum >= P_FIRSTDATAKEY(opaque))
+		return (BtreeTupGetNAtts(itup, state->rel) == natts);
+	else
+		return (BtreeTupGetNAtts(itup, state->rel) == nkeyatts);
+}
+
 /*
  * Given a block number of a B-Tree page, return page in palloc()'d memory.
  * While at it, perform some basic checks of the page.
diff --git a/src/backend/access/common/indextuple.c b/src/backend/access/common/indextuple.c
index a58bd95620..ea6ad941ed 100644
--- a/src/backend/access/common/indextuple.c
+++ b/src/backend/access/common/indextuple.c
@@ -448,8 +448,8 @@ CopyIndexTuple(IndexTuple source)
 }
 
 /*
- * Reform index tuple. Truncate nonkey (INCLUDE) attributes.
- * Pass the number of attributes the truncated tuple must contain.
+ * Truncate trailing attributes from the given index tuple, leaving it
+ * with new_indnatts attributes.
  */
 IndexTuple
 index_truncate_tuple(Relation idxrel, IndexTuple olditup, int new_indnatts)
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index 3c73171e09..53aec4fd37 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -1194,7 +1194,7 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright,
 	 */
 	if (indnatts != indnkeyatts && P_ISLEAF(lopaque))
 	{
-		lefthikey = index_truncate_tuple(rel, item, indnkeyatts);
+		lefthikey = _bt_truncate_tuple(rel, item);
 		itemsz = IndexTupleSize(lefthikey);
 		itemsz = MAXALIGN(itemsz);
 	}
@@ -1816,7 +1816,7 @@ _bt_insert_parent(Relation rel,
 
 	/* form an index tuple that points at the new right page */
 	new_item = CopyIndexTuple(ritem);
-	ItemPointerSet(&(new_item->t_tid), rbknum, P_HIKEY);
+	ItemPointerSetBlockNumber(&(new_item->t_tid), rbknum);
 
 	/*
 	 * Find the parent buffer and get the parent page.
@@ -2081,7 +2081,8 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
 	left_item_sz = sizeof(IndexTupleData);
 	left_item = (IndexTuple) palloc(left_item_sz);
 	left_item->t_info = left_item_sz;
-	ItemPointerSet(&(left_item->t_tid), lbkno, P_HIKEY);
+	ItemPointerSetBlockNumber(&(left_item->t_tid), lbkno);
+	BTreeTupSetNAtts(left_item, 0);
 
 	/*
 	 * Create downlink item for right page.  The key for it is obtained from
@@ -2091,7 +2092,7 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
 	right_item_sz = ItemIdGetLength(itemid);
 	item = (IndexTuple) PageGetItem(lpage, itemid);
 	right_item = CopyIndexTuple(item);
-	ItemPointerSet(&(right_item->t_tid), rbkno, P_HIKEY);
+	ItemPointerSetBlockNumber(&(right_item->t_tid), rbkno);
 
 	/* NO EREPORT(ERROR) from here till newroot op is logged */
 	START_CRIT_SECTION();
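
Note on the truncation API above: index_truncate_tuple() (and the _bt_truncate_tuple() wrapper the patch switches _bt_split() to) boils down to rebuilding the tuple through a narrowed tuple descriptor, so the trailing non-key (INCLUDE) attributes fall away. The sketch below illustrates that idea using only the long-standing index_deform_tuple()/index_form_tuple() API; the name truncate_tuple_sketch and the exact sequence are illustrative assumptions, not the patch's implementation.

#include "postgres.h"

#include "access/itup.h"
#include "access/tupdesc.h"
#include "utils/rel.h"

/*
 * Illustrative sketch only: rebuild an index tuple through a descriptor
 * narrowed to "leavenatts" attributes, dropping the trailing (INCLUDE)
 * columns.  The patch's index_truncate_tuple()/_bt_truncate_tuple() are
 * expected to do essentially this; the function name is hypothetical.
 */
static IndexTuple
truncate_tuple_sketch(Relation idxrel, IndexTuple olditup, int leavenatts)
{
	TupleDesc	itupdesc = CreateTupleDescCopy(RelationGetDescr(idxrel));
	Datum		values[INDEX_MAX_KEYS];
	bool		isnull[INDEX_MAX_KEYS];
	IndexTuple	newitup;

	Assert(leavenatts <= itupdesc->natts);

	/* Pull out every attribute, then pretend the descriptor is narrower */
	index_deform_tuple(olditup, itupdesc, values, isnull);
	itupdesc->natts = leavenatts;

	newitup = index_form_tuple(itupdesc, values, isnull);

	/* The heap TID must survive truncation; only the payload shrinks */
	newitup->t_tid = olditup->t_tid;
	Assert(IndexTupleSize(newitup) <= IndexTupleSize(olditup));

	FreeTupleDesc(itupdesc);
	return newitup;
}

Re-forming through a shortened descriptor keeps the tuple header bookkeeping (size bits in t_info, nulls bitmap) consistent automatically, which is why simply chopping bytes off the end of the old tuple would not be enough.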
+ * + * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/nbtree/nbtinsert.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/heapam.h" +#include "access/nbtree.h" +#include "access/nbtxlog.h" +#include "access/transam.h" +#include "access/xloginsert.h" +#include "miscadmin.h" +#include "storage/lmgr.h" +#include "storage/predicate.h" +#include "storage/smgr.h" +#include "utils/tqual.h" + + +typedef struct +{ + /* context data for _bt_checksplitloc */ + Size newitemsz; /* size of new item to be inserted */ + int fillfactor; /* needed when splitting rightmost page */ + bool is_leaf; /* T if splitting a leaf page */ + bool is_rightmost; /* T if splitting a rightmost page */ + OffsetNumber newitemoff; /* where the new item is to be inserted */ + int leftspace; /* space available for items on left page */ + int rightspace; /* space available for items on right page */ + int olddataitemstotal; /* space taken by old items */ + + bool have_split; /* found a valid split? */ + + /* these fields valid only if have_split is true */ + bool newitemonleft; /* new item on left or right of best split */ + OffsetNumber firstright; /* best split point */ + int best_delta; /* best size delta so far */ +} FindSplitData; + + +static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf); + +static TransactionId _bt_check_unique(Relation rel, IndexTuple itup, + Relation heapRel, Buffer buf, OffsetNumber offset, + ScanKey itup_scankey, + IndexUniqueCheck checkUnique, bool *is_unique, + uint32 *speculativeToken); +static void _bt_findinsertloc(Relation rel, + Buffer *bufptr, + OffsetNumber *offsetptr, + int keysz, + ScanKey scankey, + IndexTuple newtup, + BTStack stack, + Relation heapRel); +static void _bt_insertonpg(Relation rel, Buffer buf, Buffer cbuf, + BTStack stack, + IndexTuple itup, + OffsetNumber newitemoff, + bool split_only_page); +static Buffer _bt_split(Relation rel, Buffer buf, Buffer cbuf, + OffsetNumber firstright, OffsetNumber newitemoff, Size newitemsz, + IndexTuple newitem, bool newitemonleft); +static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf, + BTStack stack, bool is_root, bool is_only); +static OffsetNumber _bt_findsplitloc(Relation rel, Page page, + OffsetNumber newitemoff, + Size newitemsz, + bool *newitemonleft); +static void _bt_checksplitloc(FindSplitData *state, + OffsetNumber firstoldonright, bool newitemonleft, + int dataitemstoleft, Size firstoldonrightsz); +static bool _bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum, + int keysz, ScanKey scankey); +static void _bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel); + +/* + * _bt_doinsert() -- Handle insertion of a single index tuple in the tree. + * + * This routine is called by the public interface routine, btinsert. + * By here, itup is filled in, including the TID. + * + * If checkUnique is UNIQUE_CHECK_NO or UNIQUE_CHECK_PARTIAL, this + * will allow duplicates. Otherwise (UNIQUE_CHECK_YES or + * UNIQUE_CHECK_EXISTING) it will throw error for a duplicate. + * For UNIQUE_CHECK_EXISTING we merely run the duplicate check, and + * don't actually insert. + * + * The result value is only significant for UNIQUE_CHECK_PARTIAL: + * it must be true if the entry is known unique, else false. 
+ * (In the current implementation we'll also return true after a + * successful UNIQUE_CHECK_YES or UNIQUE_CHECK_EXISTING call, but + * that's just a coding artifact.) + */ +bool +_bt_doinsert(Relation rel, IndexTuple itup, + IndexUniqueCheck checkUnique, Relation heapRel) +{ + bool is_unique = false; + int indnkeyatts; + ScanKey itup_scankey; + BTStack stack = NULL; + Buffer buf; + OffsetNumber offset; + bool fastpath; + + Assert(IndexRelationGetNumberOfAttributes(rel) != 0); + indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); + Assert(indnkeyatts != 0); + + /* we need an insertion scan key to do our search, so build one */ + itup_scankey = _bt_mkscankey(rel, itup); + + /* + * It's very common to have an index on an auto-incremented or + * monotonically increasing value. In such cases, every insertion happens + * towards the end of the index. We try to optimise that case by caching + * the right-most leaf of the index. If our cached block is still the + * rightmost leaf, has enough free space to accommodate a new entry and + * the insertion key is strictly greater than the first key in this page, + * then we can safely conclude that the new key will be inserted in the + * cached block. So we simply search within the cached block and insert the + * key at the appropriate location. We call it a fastpath. + * + * Testing has revealed, though, that the fastpath can result in increased + * contention on the exclusive-lock on the rightmost leaf page. So we + * conditionally check if the lock is available. If it's not available then + * we simply abandon the fastpath and take the regular path. This makes + * sense because unavailability of the lock also signals that some other + * backend might be concurrently inserting into the page, thus reducing our + * chances to finding an insertion place in this page. + */ +top: + fastpath = false; + offset = InvalidOffsetNumber; + if (RelationGetTargetBlock(rel) != InvalidBlockNumber) + { + Size itemsz; + Page page; + BTPageOpaque lpageop; + + /* + * Conditionally acquire exclusive lock on the buffer before doing any + * checks. If we don't get the lock, we simply follow slowpath. If we + * do get the lock, this ensures that the index state cannot change, as + * far as the rightmost part of the index is concerned. + */ + buf = ReadBuffer(rel, RelationGetTargetBlock(rel)); + + if (ConditionalLockBuffer(buf)) + { + _bt_checkpage(rel, buf); + + page = BufferGetPage(buf); + + lpageop = (BTPageOpaque) PageGetSpecialPointer(page); + itemsz = IndexTupleSize(itup); + itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this + * but we need to be consistent */ + + /* + * Check if the page is still the rightmost leaf page, has enough + * free space to accommodate the new tuple, no split is in progress + * and the scankey is greater than or equal to the first key on the + * page. + */ + if (P_ISLEAF(lpageop) && P_RIGHTMOST(lpageop) && + !P_INCOMPLETE_SPLIT(lpageop) && + !P_IGNORE(lpageop) && + (PageGetFreeSpace(page) > itemsz) && + PageGetMaxOffsetNumber(page) >= P_FIRSTDATAKEY(lpageop) && + _bt_compare(rel, indnkeyatts, itup_scankey, page, + P_FIRSTDATAKEY(lpageop)) > 0) + { + fastpath = true; + } + else + { + _bt_relbuf(rel, buf); + + /* + * Something did not workout. Just forget about the cached + * block and follow the normal path. It might be set again if + * the conditions are favourble. 
+ */ + RelationSetTargetBlock(rel, InvalidBlockNumber); + } + } + else + { + ReleaseBuffer(buf); + + /* + * If someone's holding a lock, it's likely to change anyway, + * so don't try again until we get an updated rightmost leaf. + */ + RelationSetTargetBlock(rel, InvalidBlockNumber); + } + } + + if (!fastpath) + { + /* find the first page containing this key */ + stack = _bt_search(rel, indnkeyatts, itup_scankey, false, &buf, BT_WRITE, + NULL); + + /* trade in our read lock for a write lock */ + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + LockBuffer(buf, BT_WRITE); + + /* + * If the page was split between the time that we surrendered our read + * lock and acquired our write lock, then this page may no longer be + * the right place for the key we want to insert. In this case, we + * need to move right in the tree. See Lehman and Yao for an + * excruciatingly precise description. + */ + buf = _bt_moveright(rel, buf, indnkeyatts, itup_scankey, false, + true, stack, BT_WRITE, NULL); + } + + /* + * If we're not allowing duplicates, make sure the key isn't already in + * the index. + * + * NOTE: obviously, _bt_check_unique can only detect keys that are already + * in the index; so it cannot defend against concurrent insertions of the + * same key. We protect against that by means of holding a write lock on + * the target page. Any other would-be inserter of the same key must + * acquire a write lock on the same target page, so only one would-be + * inserter can be making the check at one time. Furthermore, once we are + * past the check we hold write locks continuously until we have performed + * our insertion, so no later inserter can fail to see our insertion. + * (This requires some care in _bt_insertonpg.) + * + * If we must wait for another xact, we release the lock while waiting, + * and then must start over completely. + * + * For a partial uniqueness check, we don't wait for the other xact. Just + * let the tuple in and return false for possibly non-unique, or true for + * definitely unique. + */ + if (checkUnique != UNIQUE_CHECK_NO) + { + TransactionId xwait; + uint32 speculativeToken; + + offset = _bt_binsrch(rel, buf, indnkeyatts, itup_scankey, false); + xwait = _bt_check_unique(rel, itup, heapRel, buf, offset, itup_scankey, + checkUnique, &is_unique, &speculativeToken); + + if (TransactionIdIsValid(xwait)) + { + /* Have to wait for the other guy ... */ + _bt_relbuf(rel, buf); + + /* + * If it's a speculative insertion, wait for it to finish (ie. to + * go ahead with the insertion, or kill the tuple). Otherwise + * wait for the transaction to finish as usual. + */ + if (speculativeToken) + SpeculativeInsertionWait(xwait, speculativeToken); + else + XactLockTableWait(xwait, rel, &itup->t_tid, XLTW_InsertIndex); + + /* start over... */ + if (stack) + _bt_freestack(stack); + goto top; + } + } + + if (checkUnique != UNIQUE_CHECK_EXISTING) + { + /* + * The only conflict predicate locking cares about for indexes is when + * an index tuple insert conflicts with an existing lock. Since the + * actual location of the insert is hard to predict because of the + * random search used to prevent O(N^2) performance when there are + * many duplicate entries, we can just use the "first valid" page. 
+ */ + CheckForSerializableConflictIn(rel, NULL, buf); + /* do the insertion */ + _bt_findinsertloc(rel, &buf, &offset, indnkeyatts, itup_scankey, itup, + stack, heapRel); + _bt_insertonpg(rel, buf, InvalidBuffer, stack, itup, offset, false); + } + else + { + /* just release the buffer */ + _bt_relbuf(rel, buf); + } + + /* be tidy */ + if (stack) + _bt_freestack(stack); + _bt_freeskey(itup_scankey); + + return is_unique; +} + +/* + * _bt_check_unique() -- Check for violation of unique index constraint + * + * offset points to the first possible item that could conflict. It can + * also point to end-of-page, which means that the first tuple to check + * is the first tuple on the next page. + * + * Returns InvalidTransactionId if there is no conflict, else an xact ID + * we must wait for to see if it commits a conflicting tuple. If an actual + * conflict is detected, no return --- just ereport(). If an xact ID is + * returned, and the conflicting tuple still has a speculative insertion in + * progress, *speculativeToken is set to non-zero, and the caller can wait for + * the verdict on the insertion using SpeculativeInsertionWait(). + * + * However, if checkUnique == UNIQUE_CHECK_PARTIAL, we always return + * InvalidTransactionId because we don't want to wait. In this case we + * set *is_unique to false if there is a potential conflict, and the + * core code must redo the uniqueness check later. + */ +static TransactionId +_bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel, + Buffer buf, OffsetNumber offset, ScanKey itup_scankey, + IndexUniqueCheck checkUnique, bool *is_unique, + uint32 *speculativeToken) +{ + TupleDesc itupdesc = RelationGetDescr(rel); + int indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); + SnapshotData SnapshotDirty; + OffsetNumber maxoff; + Page page; + BTPageOpaque opaque; + Buffer nbuf = InvalidBuffer; + bool found = false; + + /* Assume unique until we find a duplicate */ + *is_unique = true; + + InitDirtySnapshot(SnapshotDirty); + + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + maxoff = PageGetMaxOffsetNumber(page); + + /* + * Scan over all equal tuples, looking for live conflicts. + */ + for (;;) + { + ItemId curitemid; + IndexTuple curitup; + BlockNumber nblkno; + + /* + * make sure the offset points to an actual item before trying to + * examine it... + */ + if (offset <= maxoff) + { + curitemid = PageGetItemId(page, offset); + + /* + * We can skip items that are marked killed. + * + * Formerly, we applied _bt_isequal() before checking the kill + * flag, so as to fall out of the item loop as soon as possible. + * However, in the presence of heavy update activity an index may + * contain many killed items with the same key; running + * _bt_isequal() on each killed item gets expensive. Furthermore + * it is likely that the non-killed version of each key appears + * first, so that we didn't actually get to exit any sooner + * anyway. So now we just advance over killed items as quickly as + * we can. We only apply _bt_isequal() when we get to a non-killed + * item or the end of the page. + */ + if (!ItemIdIsDead(curitemid)) + { + ItemPointerData htid; + bool all_dead; + + /* + * _bt_compare returns 0 for (1,NULL) and (1,NULL) - this's + * how we handling NULLs - and so we must not use _bt_compare + * in real comparison, but only for ordering/finding items on + * pages. 
- vadim 03/24/97 + */ + if (!_bt_isequal(itupdesc, page, offset, indnkeyatts, itup_scankey)) + break; /* we're past all the equal tuples */ + + /* okay, we gotta fetch the heap tuple ... */ + curitup = (IndexTuple) PageGetItem(page, curitemid); + htid = curitup->t_tid; + + /* + * If we are doing a recheck, we expect to find the tuple we + * are rechecking. It's not a duplicate, but we have to keep + * scanning. + */ + if (checkUnique == UNIQUE_CHECK_EXISTING && + ItemPointerCompare(&htid, &itup->t_tid) == 0) + { + found = true; + } + + /* + * We check the whole HOT-chain to see if there is any tuple + * that satisfies SnapshotDirty. This is necessary because we + * have just a single index entry for the entire chain. + */ + else if (heap_hot_search(&htid, heapRel, &SnapshotDirty, + &all_dead)) + { + TransactionId xwait; + + /* + * It is a duplicate. If we are only doing a partial + * check, then don't bother checking if the tuple is being + * updated in another transaction. Just return the fact + * that it is a potential conflict and leave the full + * check till later. + */ + if (checkUnique == UNIQUE_CHECK_PARTIAL) + { + if (nbuf != InvalidBuffer) + _bt_relbuf(rel, nbuf); + *is_unique = false; + return InvalidTransactionId; + } + + /* + * If this tuple is being updated by other transaction + * then we have to wait for its commit/abort. + */ + xwait = (TransactionIdIsValid(SnapshotDirty.xmin)) ? + SnapshotDirty.xmin : SnapshotDirty.xmax; + + if (TransactionIdIsValid(xwait)) + { + if (nbuf != InvalidBuffer) + _bt_relbuf(rel, nbuf); + /* Tell _bt_doinsert to wait... */ + *speculativeToken = SnapshotDirty.speculativeToken; + return xwait; + } + + /* + * Otherwise we have a definite conflict. But before + * complaining, look to see if the tuple we want to insert + * is itself now committed dead --- if so, don't complain. + * This is a waste of time in normal scenarios but we must + * do it to support CREATE INDEX CONCURRENTLY. + * + * We must follow HOT-chains here because during + * concurrent index build, we insert the root TID though + * the actual tuple may be somewhere in the HOT-chain. + * While following the chain we might not stop at the + * exact tuple which triggered the insert, but that's OK + * because if we find a live tuple anywhere in this chain, + * we have a unique key conflict. The other live tuple is + * not part of this chain because it had a different index + * entry. + */ + htid = itup->t_tid; + if (heap_hot_search(&htid, heapRel, SnapshotSelf, NULL)) + { + /* Normal case --- it's still live */ + } + else + { + /* + * It's been deleted, so no error, and no need to + * continue searching + */ + break; + } + + /* + * Check for a conflict-in as we would if we were going to + * write to this page. We aren't actually going to write, + * but we want a chance to report SSI conflicts that would + * otherwise be masked by this unique constraint + * violation. + */ + CheckForSerializableConflictIn(rel, NULL, buf); + + /* + * This is a definite conflict. Break the tuple down into + * datums and report the error. But first, make sure we + * release the buffer locks we're holding --- + * BuildIndexValueDescription could make catalog accesses, + * which in the worst case might touch this same index and + * cause deadlocks. 
+ */ + if (nbuf != InvalidBuffer) + _bt_relbuf(rel, nbuf); + _bt_relbuf(rel, buf); + + { + Datum values[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + char *key_desc; + + index_deform_tuple(itup, RelationGetDescr(rel), + values, isnull); + + key_desc = BuildIndexValueDescription(rel, values, + isnull); + + ereport(ERROR, + (errcode(ERRCODE_UNIQUE_VIOLATION), + errmsg("duplicate key value violates unique constraint \"%s\"", + RelationGetRelationName(rel)), + key_desc ? errdetail("Key %s already exists.", + key_desc) : 0, + errtableconstraint(heapRel, + RelationGetRelationName(rel)))); + } + } + else if (all_dead) + { + /* + * The conflicting tuple (or whole HOT chain) is dead to + * everyone, so we may as well mark the index entry + * killed. + */ + ItemIdMarkDead(curitemid); + opaque->btpo_flags |= BTP_HAS_GARBAGE; + + /* + * Mark buffer with a dirty hint, since state is not + * crucial. Be sure to mark the proper buffer dirty. + */ + if (nbuf != InvalidBuffer) + MarkBufferDirtyHint(nbuf, true); + else + MarkBufferDirtyHint(buf, true); + } + } + } + + /* + * Advance to next tuple to continue checking. + */ + if (offset < maxoff) + offset = OffsetNumberNext(offset); + else + { + /* If scankey == hikey we gotta check the next page too */ + if (P_RIGHTMOST(opaque)) + break; + if (!_bt_isequal(itupdesc, page, P_HIKEY, + indnkeyatts, itup_scankey)) + break; + /* Advance to next non-dead page --- there must be one */ + for (;;) + { + nblkno = opaque->btpo_next; + nbuf = _bt_relandgetbuf(rel, nbuf, nblkno, BT_READ); + page = BufferGetPage(nbuf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + if (!P_IGNORE(opaque)) + break; + if (P_RIGHTMOST(opaque)) + elog(ERROR, "fell off the end of index \"%s\"", + RelationGetRelationName(rel)); + } + maxoff = PageGetMaxOffsetNumber(page); + offset = P_FIRSTDATAKEY(opaque); + } + } + + /* + * If we are doing a recheck then we should have found the tuple we are + * checking. Otherwise there's something very wrong --- probably, the + * index is on a non-immutable expression. + */ + if (checkUnique == UNIQUE_CHECK_EXISTING && !found) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("failed to re-find tuple within index \"%s\"", + RelationGetRelationName(rel)), + errhint("This may be because of a non-immutable index expression."), + errtableconstraint(heapRel, + RelationGetRelationName(rel)))); + + if (nbuf != InvalidBuffer) + _bt_relbuf(rel, nbuf); + + return InvalidTransactionId; +} + + +/* + * _bt_findinsertloc() -- Finds an insert location for a tuple + * + * If the new key is equal to one or more existing keys, we can + * legitimately place it anywhere in the series of equal keys --- in fact, + * if the new key is equal to the page's "high key" we can place it on + * the next page. If it is equal to the high key, and there's not room + * to insert the new tuple on the current page without splitting, then + * we can move right hoping to find more free space and avoid a split. + * (We should not move right indefinitely, however, since that leads to + * O(N^2) insertion behavior in the presence of many equal keys.) + * Once we have chosen the page to put the key on, we'll insert it before + * any existing equal keys because of the way _bt_binsrch() works. + * + * If there's not enough room in the space, we try to make room by + * removing any LP_DEAD tuples. + * + * On entry, *bufptr and *offsetptr point to the first legal position + * where the new tuple could be inserted. The caller should hold an + * exclusive lock on *bufptr. 
*offsetptr can also be set to + * InvalidOffsetNumber, in which case the function will search for the + * right location within the page if needed. On exit, they point to the + * chosen insert location. If _bt_findinsertloc decides to move right, + * the lock and pin on the original page will be released and the new + * page returned to the caller is exclusively locked instead. + * + * newtup is the new tuple we're inserting, and scankey is an insertion + * type scan key for it. + */ +static void +_bt_findinsertloc(Relation rel, + Buffer *bufptr, + OffsetNumber *offsetptr, + int keysz, + ScanKey scankey, + IndexTuple newtup, + BTStack stack, + Relation heapRel) +{ + Buffer buf = *bufptr; + Page page = BufferGetPage(buf); + Size itemsz; + BTPageOpaque lpageop; + bool movedright, + vacuumed; + OffsetNumber newitemoff; + OffsetNumber firstlegaloff = *offsetptr; + + lpageop = (BTPageOpaque) PageGetSpecialPointer(page); + + itemsz = IndexTupleSize(newtup); + itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we + * need to be consistent */ + + /* + * Check whether the item can fit on a btree page at all. (Eventually, we + * ought to try to apply TOAST methods if not.) We actually need to be + * able to fit three items on every page, so restrict any one item to 1/3 + * the per-page available space. Note that at this point, itemsz doesn't + * include the ItemId. + * + * NOTE: if you change this, see also the similar code in _bt_buildadd(). + */ + if (itemsz > BTMaxItemSize(page)) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("index row size %zu exceeds maximum %zu for index \"%s\"", + itemsz, BTMaxItemSize(page), + RelationGetRelationName(rel)), + errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n" + "Consider a function index of an MD5 hash of the value, " + "or use full text indexing."), + errtableconstraint(heapRel, + RelationGetRelationName(rel)))); + + /*---------- + * If we will need to split the page to put the item on this page, + * check whether we can put the tuple somewhere to the right, + * instead. Keep scanning right until we + * (a) find a page with enough free space, + * (b) reach the last page where the tuple can legally go, or + * (c) get tired of searching. + * (c) is not flippant; it is important because if there are many + * pages' worth of equal keys, it's better to split one of the early + * pages than to scan all the way to the end of the run of equal keys + * on every insert. We implement "get tired" as a random choice, + * since stopping after scanning a fixed number of pages wouldn't work + * well (we'd never reach the right-hand side of previously split + * pages). Currently the probability of moving right is set at 0.99, + * which may seem too high to change the behavior much, but it does an + * excellent job of preventing O(N^2) behavior with many equal keys. 
+ *---------- + */ + movedright = false; + vacuumed = false; + while (PageGetFreeSpace(page) < itemsz) + { + Buffer rbuf; + BlockNumber rblkno; + + /* + * before considering moving right, see if we can obtain enough space + * by erasing LP_DEAD items + */ + if (P_ISLEAF(lpageop) && P_HAS_GARBAGE(lpageop)) + { + _bt_vacuum_one_page(rel, buf, heapRel); + + /* + * remember that we vacuumed this page, because that makes the + * hint supplied by the caller invalid + */ + vacuumed = true; + + if (PageGetFreeSpace(page) >= itemsz) + break; /* OK, now we have enough space */ + } + + /* + * nope, so check conditions (b) and (c) enumerated above + */ + if (P_RIGHTMOST(lpageop) || + _bt_compare(rel, keysz, scankey, page, P_HIKEY) != 0 || + random() <= (MAX_RANDOM_VALUE / 100)) + break; + + /* + * step right to next non-dead page + * + * must write-lock that page before releasing write lock on current + * page; else someone else's _bt_check_unique scan could fail to see + * our insertion. write locks on intermediate dead pages won't do + * because we don't know when they will get de-linked from the tree. + */ + rbuf = InvalidBuffer; + + rblkno = lpageop->btpo_next; + for (;;) + { + rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE); + page = BufferGetPage(rbuf); + lpageop = (BTPageOpaque) PageGetSpecialPointer(page); + + /* + * If this page was incompletely split, finish the split now. We + * do this while holding a lock on the left sibling, which is not + * good because finishing the split could be a fairly lengthy + * operation. But this should happen very seldom. + */ + if (P_INCOMPLETE_SPLIT(lpageop)) + { + _bt_finish_split(rel, rbuf, stack); + rbuf = InvalidBuffer; + continue; + } + + if (!P_IGNORE(lpageop)) + break; + if (P_RIGHTMOST(lpageop)) + elog(ERROR, "fell off the end of index \"%s\"", + RelationGetRelationName(rel)); + + rblkno = lpageop->btpo_next; + } + _bt_relbuf(rel, buf); + buf = rbuf; + movedright = true; + vacuumed = false; + } + + /* + * Now we are on the right page, so find the insert position. If we moved + * right at all, we know we should insert at the start of the page. If we + * didn't move right, we can use the firstlegaloff hint if the caller + * supplied one, unless we vacuumed the page which might have moved tuples + * around making the hint invalid. If we didn't move right or can't use + * the hint, find the position by searching. + */ + if (movedright) + newitemoff = P_FIRSTDATAKEY(lpageop); + else if (firstlegaloff != InvalidOffsetNumber && !vacuumed) + newitemoff = firstlegaloff; + else + newitemoff = _bt_binsrch(rel, buf, keysz, scankey, false); + + *bufptr = buf; + *offsetptr = newitemoff; +} + +/*---------- + * _bt_insertonpg() -- Insert a tuple on a particular page in the index. + * + * This recursive procedure does the following things: + * + * + if necessary, splits the target page (making sure that the + * split is equitable as far as post-insert free space goes). + * + inserts the tuple. + * + if the page was split, pops the parent stack, and finds the + * right place to insert the new child pointer (by walking + * right using information stored in the parent stack). + * + invokes itself with the appropriate tuple for the right + * child page on the parent. + * + updates the metapage if a true root or fast root is split. + * + * On entry, we must have the correct buffer in which to do the + * insertion, and the buffer must be pinned and write-locked. On return, + * we will have dropped both the pin and the lock on the buffer. 
+ * + * When inserting to a non-leaf page, 'cbuf' is the left-sibling of the + * page we're inserting the downlink for. This function will clear the + * INCOMPLETE_SPLIT flag on it, and release the buffer. + * + * The locking interactions in this code are critical. You should + * grok Lehman and Yao's paper before making any changes. In addition, + * you need to understand how we disambiguate duplicate keys in this + * implementation, in order to be able to find our location using + * L&Y "move right" operations. Since we may insert duplicate user + * keys, and since these dups may propagate up the tree, we use the + * 'afteritem' parameter to position ourselves correctly for the + * insertion on internal pages. + *---------- + */ +static void +_bt_insertonpg(Relation rel, + Buffer buf, + Buffer cbuf, + BTStack stack, + IndexTuple itup, + OffsetNumber newitemoff, + bool split_only_page) +{ + Page page; + BTPageOpaque lpageop; + OffsetNumber firstright = InvalidOffsetNumber; + Size itemsz; + + page = BufferGetPage(buf); + lpageop = (BTPageOpaque) PageGetSpecialPointer(page); + + /* child buffer must be given iff inserting on an internal page */ + Assert(P_ISLEAF(lpageop) == !BufferIsValid(cbuf)); + + /* The caller should've finished any incomplete splits already. */ + if (P_INCOMPLETE_SPLIT(lpageop)) + elog(ERROR, "cannot insert to incompletely split page %u", + BufferGetBlockNumber(buf)); + + itemsz = IndexTupleSize(itup); + itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we + * need to be consistent */ + + /* + * Do we need to split the page to fit the item on it? + * + * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result, + * so this comparison is correct even though we appear to be accounting + * only for the item and not for its line pointer. + */ + if (PageGetFreeSpace(page) < itemsz) + { + bool is_root = P_ISROOT(lpageop); + bool is_only = P_LEFTMOST(lpageop) && P_RIGHTMOST(lpageop); + bool newitemonleft; + Buffer rbuf; + + /* Choose the split point */ + firstright = _bt_findsplitloc(rel, page, + newitemoff, itemsz, + &newitemonleft); + + /* split the buffer into left and right halves */ + rbuf = _bt_split(rel, buf, cbuf, firstright, + newitemoff, itemsz, itup, newitemonleft); + PredicateLockPageSplit(rel, + BufferGetBlockNumber(buf), + BufferGetBlockNumber(rbuf)); + + /*---------- + * By here, + * + * + our target page has been split; + * + the original tuple has been inserted; + * + we have write locks on both the old (left half) + * and new (right half) buffers, after the split; and + * + we know the key we want to insert into the parent + * (it's the "high key" on the left child page). + * + * We're ready to do the parent insertion. We need to hold onto the + * locks for the child pages until we locate the parent, but we can + * release them before doing the actual insertion (see Lehman and Yao + * for the reasoning). + *---------- + */ + _bt_insert_parent(rel, buf, rbuf, stack, is_root, is_only); + } + else + { + Buffer metabuf = InvalidBuffer; + Page metapg = NULL; + BTMetaPageData *metad = NULL; + OffsetNumber itup_off; + BlockNumber itup_blkno; + + itup_off = newitemoff; + itup_blkno = BufferGetBlockNumber(buf); + + /* + * If we are doing this insert because we split a page that was the + * only one on its tree level, but was not the root, it may have been + * the "fast root". We need to ensure that the fast root link points + * at or above the current page. 
We can safely acquire a lock on the + * metapage here --- see comments for _bt_newroot(). + */ + if (split_only_page) + { + Assert(!P_ISLEAF(lpageop)); + + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE); + metapg = BufferGetPage(metabuf); + metad = BTPageGetMeta(metapg); + + if (metad->btm_fastlevel >= lpageop->btpo.level) + { + /* no update wanted */ + _bt_relbuf(rel, metabuf); + metabuf = InvalidBuffer; + } + } + + /* Do the update. No ereport(ERROR) until changes are logged */ + START_CRIT_SECTION(); + + if (!_bt_pgaddtup(page, itemsz, itup, newitemoff)) + elog(PANIC, "failed to add new item to block %u in index \"%s\"", + itup_blkno, RelationGetRelationName(rel)); + + MarkBufferDirty(buf); + + if (BufferIsValid(metabuf)) + { + metad->btm_fastroot = itup_blkno; + metad->btm_fastlevel = lpageop->btpo.level; + MarkBufferDirty(metabuf); + } + + /* clear INCOMPLETE_SPLIT flag on child if inserting a downlink */ + if (BufferIsValid(cbuf)) + { + Page cpage = BufferGetPage(cbuf); + BTPageOpaque cpageop = (BTPageOpaque) PageGetSpecialPointer(cpage); + + Assert(P_INCOMPLETE_SPLIT(cpageop)); + cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT; + MarkBufferDirty(cbuf); + } + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + xl_btree_insert xlrec; + xl_btree_metadata xlmeta; + uint8 xlinfo; + XLogRecPtr recptr; + IndexTupleData trunctuple; + + xlrec.offnum = itup_off; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfBtreeInsert); + + if (P_ISLEAF(lpageop)) + { + xlinfo = XLOG_BTREE_INSERT_LEAF; + + /* + * Cache the block information if we just inserted into the + * rightmost leaf page of the index. + */ + if (P_RIGHTMOST(lpageop)) + RelationSetTargetBlock(rel, BufferGetBlockNumber(buf)); + } + else + { + /* + * Register the left child whose INCOMPLETE_SPLIT flag was + * cleared. + */ + XLogRegisterBuffer(1, cbuf, REGBUF_STANDARD); + + xlinfo = XLOG_BTREE_INSERT_UPPER; + } + + if (BufferIsValid(metabuf)) + { + xlmeta.root = metad->btm_root; + xlmeta.level = metad->btm_level; + xlmeta.fastroot = metad->btm_fastroot; + xlmeta.fastlevel = metad->btm_fastlevel; + + XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD); + XLogRegisterBufData(2, (char *) &xlmeta, sizeof(xl_btree_metadata)); + + xlinfo = XLOG_BTREE_INSERT_META; + } + + /* Read comments in _bt_pgaddtup */ + XLogRegisterBuffer(0, buf, REGBUF_STANDARD); + if (!P_ISLEAF(lpageop) && newitemoff == P_FIRSTDATAKEY(lpageop)) + { + trunctuple = *itup; + trunctuple.t_info = sizeof(IndexTupleData); + XLogRegisterBufData(0, (char *) &trunctuple, + sizeof(IndexTupleData)); + } + else + XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup)); + + recptr = XLogInsert(RM_BTREE_ID, xlinfo); + + if (BufferIsValid(metabuf)) + { + PageSetLSN(metapg, recptr); + } + if (BufferIsValid(cbuf)) + { + PageSetLSN(BufferGetPage(cbuf), recptr); + } + + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); + + /* release buffers */ + if (BufferIsValid(metabuf)) + _bt_relbuf(rel, metabuf); + if (BufferIsValid(cbuf)) + _bt_relbuf(rel, cbuf); + _bt_relbuf(rel, buf); + } +} + +/* + * _bt_split() -- split a page in the btree. + * + * On entry, buf is the page to split, and is pinned and write-locked. + * firstright is the item index of the first item to be moved to the + * new right page. newitemoff etc. tell us about the new item that + * must be inserted along with the data from the old page. + * + * When splitting a non-leaf page, 'cbuf' is the left-sibling of the + * page we're inserting the downlink for. 
This function will clear the + * INCOMPLETE_SPLIT flag on it, and release the buffer. + * + * Returns the new right sibling of buf, pinned and write-locked. + * The pin and lock on buf are maintained. + */ +static Buffer +_bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright, + OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem, + bool newitemonleft) +{ + Buffer rbuf; + Page origpage; + Page leftpage, + rightpage; + BlockNumber origpagenumber, + rightpagenumber; + BTPageOpaque ropaque, + lopaque, + oopaque; + Buffer sbuf = InvalidBuffer; + Page spage = NULL; + BTPageOpaque sopaque = NULL; + Size itemsz; + ItemId itemid; + IndexTuple item; + OffsetNumber leftoff, + rightoff; + OffsetNumber maxoff; + OffsetNumber i; + bool isleaf; + IndexTuple lefthikey; + int indnatts = IndexRelationGetNumberOfAttributes(rel); + int indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); + + /* Acquire a new page to split into */ + rbuf = _bt_getbuf(rel, P_NEW, BT_WRITE); + + /* + * origpage is the original page to be split. leftpage is a temporary + * buffer that receives the left-sibling data, which will be copied back + * into origpage on success. rightpage is the new page that receives the + * right-sibling data. If we fail before reaching the critical section, + * origpage hasn't been modified and leftpage is only workspace. In + * principle we shouldn't need to worry about rightpage either, because it + * hasn't been linked into the btree page structure; but to avoid leaving + * possibly-confusing junk behind, we are careful to rewrite rightpage as + * zeroes before throwing any error. + */ + origpage = BufferGetPage(buf); + leftpage = PageGetTempPage(origpage); + rightpage = BufferGetPage(rbuf); + + origpagenumber = BufferGetBlockNumber(buf); + rightpagenumber = BufferGetBlockNumber(rbuf); + + _bt_pageinit(leftpage, BufferGetPageSize(buf)); + /* rightpage was already initialized by _bt_getbuf */ + + /* + * Copy the original page's LSN into leftpage, which will become the + * updated version of the page. We need this because XLogInsert will + * examine the LSN and possibly dump it in a page image. + */ + PageSetLSN(leftpage, PageGetLSN(origpage)); + + /* init btree private data */ + oopaque = (BTPageOpaque) PageGetSpecialPointer(origpage); + lopaque = (BTPageOpaque) PageGetSpecialPointer(leftpage); + ropaque = (BTPageOpaque) PageGetSpecialPointer(rightpage); + + isleaf = P_ISLEAF(oopaque); + + /* if we're splitting this page, it won't be the root when we're done */ + /* also, clear the SPLIT_END and HAS_GARBAGE flags in both pages */ + lopaque->btpo_flags = oopaque->btpo_flags; + lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE); + ropaque->btpo_flags = lopaque->btpo_flags; + /* set flag in left page indicating that the right page has no downlink */ + lopaque->btpo_flags |= BTP_INCOMPLETE_SPLIT; + lopaque->btpo_prev = oopaque->btpo_prev; + lopaque->btpo_next = rightpagenumber; + ropaque->btpo_prev = origpagenumber; + ropaque->btpo_next = oopaque->btpo_next; + lopaque->btpo.level = ropaque->btpo.level = oopaque->btpo.level; + /* Since we already have write-lock on both pages, ok to read cycleid */ + lopaque->btpo_cycleid = _bt_vacuum_cycleid(rel); + ropaque->btpo_cycleid = lopaque->btpo_cycleid; + + /* + * If the page we're splitting is not the rightmost page at its level in + * the tree, then the first entry on the page is the high key for the + * page. We need to copy that to the right half. 
Otherwise (meaning the + * rightmost page case), all the items on the right half will be user + * data. + */ + rightoff = P_HIKEY; + + if (!P_RIGHTMOST(oopaque)) + { + itemid = PageGetItemId(origpage, P_HIKEY); + itemsz = ItemIdGetLength(itemid); + item = (IndexTuple) PageGetItem(origpage, itemid); + if (PageAddItem(rightpage, (Item) item, itemsz, rightoff, + false, false) == InvalidOffsetNumber) + { + memset(rightpage, 0, BufferGetPageSize(rbuf)); + elog(ERROR, "failed to add hikey to the right sibling" + " while splitting block %u of index \"%s\"", + origpagenumber, RelationGetRelationName(rel)); + } + rightoff = OffsetNumberNext(rightoff); + } + + /* + * The "high key" for the new left page will be the first key that's going + * to go into the new right page. This might be either the existing data + * item at position firstright, or the incoming tuple. + */ + leftoff = P_HIKEY; + if (!newitemonleft && newitemoff == firstright) + { + /* incoming tuple will become first on right page */ + itemsz = newitemsz; + item = newitem; + } + else + { + /* existing item at firstright will become first on right page */ + itemid = PageGetItemId(origpage, firstright); + itemsz = ItemIdGetLength(itemid); + item = (IndexTuple) PageGetItem(origpage, itemid); + } + + /* + * We must truncate included attributes of the "high key" item, + * before insert it onto the leaf page. It's the only point in insertion + * process, where we perform truncation. All other functions work with + * this high key and do not change it. + */ + if (indnatts != indnkeyatts && P_ISLEAF(lopaque)) + { + lefthikey = _bt_truncate_tuple(rel, item); + itemsz = IndexTupleSize(lefthikey); + itemsz = MAXALIGN(itemsz); + } + else + lefthikey = item; + + if (PageAddItem(leftpage, (Item) lefthikey, itemsz, leftoff, + false, false) == InvalidOffsetNumber) + { + memset(rightpage, 0, BufferGetPageSize(rbuf)); + elog(ERROR, "failed to add hikey to the left sibling" + " while splitting block %u of index \"%s\"", + origpagenumber, RelationGetRelationName(rel)); + } + leftoff = OffsetNumberNext(leftoff); + + /* + * Now transfer all the data items to the appropriate page. + * + * Note: we *must* insert at least the right page's items in item-number + * order, for the benefit of _bt_restore_page(). + */ + maxoff = PageGetMaxOffsetNumber(origpage); + + for (i = P_FIRSTDATAKEY(oopaque); i <= maxoff; i = OffsetNumberNext(i)) + { + itemid = PageGetItemId(origpage, i); + itemsz = ItemIdGetLength(itemid); + item = (IndexTuple) PageGetItem(origpage, itemid); + + /* does new item belong before this one? 
*/ + if (i == newitemoff) + { + if (newitemonleft) + { + if (!_bt_pgaddtup(leftpage, newitemsz, newitem, leftoff)) + { + memset(rightpage, 0, BufferGetPageSize(rbuf)); + elog(ERROR, "failed to add new item to the left sibling" + " while splitting block %u of index \"%s\"", + origpagenumber, RelationGetRelationName(rel)); + } + leftoff = OffsetNumberNext(leftoff); + } + else + { + if (!_bt_pgaddtup(rightpage, newitemsz, newitem, rightoff)) + { + memset(rightpage, 0, BufferGetPageSize(rbuf)); + elog(ERROR, "failed to add new item to the right sibling" + " while splitting block %u of index \"%s\"", + origpagenumber, RelationGetRelationName(rel)); + } + rightoff = OffsetNumberNext(rightoff); + } + } + + /* decide which page to put it on */ + if (i < firstright) + { + if (!_bt_pgaddtup(leftpage, itemsz, item, leftoff)) + { + memset(rightpage, 0, BufferGetPageSize(rbuf)); + elog(ERROR, "failed to add old item to the left sibling" + " while splitting block %u of index \"%s\"", + origpagenumber, RelationGetRelationName(rel)); + } + leftoff = OffsetNumberNext(leftoff); + } + else + { + if (!_bt_pgaddtup(rightpage, itemsz, item, rightoff)) + { + memset(rightpage, 0, BufferGetPageSize(rbuf)); + elog(ERROR, "failed to add old item to the right sibling" + " while splitting block %u of index \"%s\"", + origpagenumber, RelationGetRelationName(rel)); + } + rightoff = OffsetNumberNext(rightoff); + } + } + + /* cope with possibility that newitem goes at the end */ + if (i <= newitemoff) + { + /* + * Can't have newitemonleft here; that would imply we were told to put + * *everything* on the left page, which cannot fit (if it could, we'd + * not be splitting the page). + */ + Assert(!newitemonleft); + if (!_bt_pgaddtup(rightpage, newitemsz, newitem, rightoff)) + { + memset(rightpage, 0, BufferGetPageSize(rbuf)); + elog(ERROR, "failed to add new item to the right sibling" + " while splitting block %u of index \"%s\"", + origpagenumber, RelationGetRelationName(rel)); + } + rightoff = OffsetNumberNext(rightoff); + } + + /* + * We have to grab the right sibling (if any) and fix the prev pointer + * there. We are guaranteed that this is deadlock-free since no other + * writer will be holding a lock on that page and trying to move left, and + * all readers release locks on a page before trying to fetch its + * neighbors. + */ + + if (!P_RIGHTMOST(oopaque)) + { + sbuf = _bt_getbuf(rel, oopaque->btpo_next, BT_WRITE); + spage = BufferGetPage(sbuf); + sopaque = (BTPageOpaque) PageGetSpecialPointer(spage); + if (sopaque->btpo_prev != origpagenumber) + { + memset(rightpage, 0, BufferGetPageSize(rbuf)); + elog(ERROR, "right sibling's left-link doesn't match: " + "block %u links to %u instead of expected %u in index \"%s\"", + oopaque->btpo_next, sopaque->btpo_prev, origpagenumber, + RelationGetRelationName(rel)); + } + + /* + * Check to see if we can set the SPLIT_END flag in the right-hand + * split page; this can save some I/O for vacuum since it need not + * proceed to the right sibling. We can set the flag if the right + * sibling has a different cycleid: that means it could not be part of + * a group of pages that were all split off from the same ancestor + * page. If you're confused, imagine that page A splits to A B and + * then again, yielding A C B, while vacuum is in progress. Tuples + * originally in A could now be in either B or C, hence vacuum must + * examine both pages. But if D, our right sibling, has a different + * cycleid then it could not contain any tuples that were in A when + * the vacuum started. 
+ */ + if (sopaque->btpo_cycleid != ropaque->btpo_cycleid) + ropaque->btpo_flags |= BTP_SPLIT_END; + } + + /* + * Right sibling is locked, new siblings are prepared, but original page + * is not updated yet. + * + * NO EREPORT(ERROR) till right sibling is updated. We can get away with + * not starting the critical section till here because we haven't been + * scribbling on the original page yet; see comments above. + */ + START_CRIT_SECTION(); + + /* + * By here, the original data page has been split into two new halves, and + * these are correct. The algorithm requires that the left page never + * move during a split, so we copy the new left page back on top of the + * original. Note that this is not a waste of time, since we also require + * (in the page management code) that the center of a page always be + * clean, and the most efficient way to guarantee this is just to compact + * the data by reinserting it into a new left page. (XXX the latter + * comment is probably obsolete; but in any case it's good to not scribble + * on the original page until we enter the critical section.) + * + * We need to do this before writing the WAL record, so that XLogInsert + * can WAL log an image of the page if necessary. + */ + PageRestoreTempPage(leftpage, origpage); + /* leftpage, lopaque must not be used below here */ + + MarkBufferDirty(buf); + MarkBufferDirty(rbuf); + + if (!P_RIGHTMOST(ropaque)) + { + sopaque->btpo_prev = rightpagenumber; + MarkBufferDirty(sbuf); + } + + /* + * Clear INCOMPLETE_SPLIT flag on child if inserting the new item finishes + * a split. + */ + if (!isleaf) + { + Page cpage = BufferGetPage(cbuf); + BTPageOpaque cpageop = (BTPageOpaque) PageGetSpecialPointer(cpage); + + cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT; + MarkBufferDirty(cbuf); + } + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + xl_btree_split xlrec; + uint8 xlinfo; + XLogRecPtr recptr; + + xlrec.level = ropaque->btpo.level; + xlrec.firstright = firstright; + xlrec.newitemoff = newitemoff; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfBtreeSplit); + + XLogRegisterBuffer(0, buf, REGBUF_STANDARD); + XLogRegisterBuffer(1, rbuf, REGBUF_WILL_INIT); + /* Log the right sibling, because we've changed its prev-pointer. */ + if (!P_RIGHTMOST(ropaque)) + XLogRegisterBuffer(2, sbuf, REGBUF_STANDARD); + if (BufferIsValid(cbuf)) + XLogRegisterBuffer(3, cbuf, REGBUF_STANDARD); + + /* + * Log the new item, if it was inserted on the left page. (If it was + * put on the right page, we don't need to explicitly WAL log it + * because it's included with all the other items on the right page.) + * Show the new item as belonging to the left page buffer, so that it + * is not stored if XLogInsert decides it needs a full-page image of + * the left page. We store the offset anyway, though, to support + * archive compression of these records. + */ + if (newitemonleft) + XLogRegisterBufData(0, (char *) newitem, MAXALIGN(newitemsz)); + + /* + * We must also log the left page's high key. There are two reasons + * for that: right page's leftmost key is suppressed on non-leaf levels, + * in covering indexes, included columns are truncated from high keys. + * For simplicity, we don't distinguish these cases, but log the high + * key every time. Show it as belonging to the left page buffer, so + * that it is not stored if XLogInsert decides it needs a full-page + * image of the left page. 
+ */ + itemid = PageGetItemId(origpage, P_HIKEY); + item = (IndexTuple) PageGetItem(origpage, itemid); + XLogRegisterBufData(0, (char *) item, MAXALIGN(IndexTupleSize(item))); + + /* + * Log the contents of the right page in the format understood by + * _bt_restore_page(). We set lastrdata->buffer to InvalidBuffer, + * because we're going to recreate the whole page anyway, so it should + * never be stored by XLogInsert. + * + * Direct access to page is not good but faster - we should implement + * some new func in page API. Note we only store the tuples + * themselves, knowing that they were inserted in item-number order + * and so the item pointers can be reconstructed. See comments for + * _bt_restore_page(). + */ + XLogRegisterBufData(1, + (char *) rightpage + ((PageHeader) rightpage)->pd_upper, + ((PageHeader) rightpage)->pd_special - ((PageHeader) rightpage)->pd_upper); + + xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L : XLOG_BTREE_SPLIT_R; + recptr = XLogInsert(RM_BTREE_ID, xlinfo); + + PageSetLSN(origpage, recptr); + PageSetLSN(rightpage, recptr); + if (!P_RIGHTMOST(ropaque)) + { + PageSetLSN(spage, recptr); + } + if (!isleaf) + { + PageSetLSN(BufferGetPage(cbuf), recptr); + } + } + + END_CRIT_SECTION(); + + /* release the old right sibling */ + if (!P_RIGHTMOST(ropaque)) + _bt_relbuf(rel, sbuf); + + /* release the child */ + if (!isleaf) + _bt_relbuf(rel, cbuf); + + /* split's done */ + return rbuf; +} + +/* + * _bt_findsplitloc() -- find an appropriate place to split a page. + * + * The idea here is to equalize the free space that will be on each split + * page, *after accounting for the inserted tuple*. (If we fail to account + * for it, we might find ourselves with too little room on the page that + * it needs to go into!) + * + * If the page is the rightmost page on its level, we instead try to arrange + * to leave the left split page fillfactor% full. In this way, when we are + * inserting successively increasing keys (consider sequences, timestamps, + * etc) we will end up with a tree whose pages are about fillfactor% full, + * instead of the 50% full result that we'd get without this special case. + * This is the same as nbtsort.c produces for a newly-created tree. Note + * that leaf and nonleaf pages use different fillfactors. + * + * We are passed the intended insert position of the new tuple, expressed as + * the offsetnumber of the tuple it must go in front of. (This could be + * maxoff+1 if the tuple is to go at the end.) + * + * We return the index of the first existing tuple that should go on the + * righthand page, plus a boolean indicating whether the new tuple goes on + * the left or right page. The bool is necessary to disambiguate the case + * where firstright == newitemoff. 
+ */ +static OffsetNumber +_bt_findsplitloc(Relation rel, + Page page, + OffsetNumber newitemoff, + Size newitemsz, + bool *newitemonleft) +{ + BTPageOpaque opaque; + OffsetNumber offnum; + OffsetNumber maxoff; + ItemId itemid; + FindSplitData state; + int leftspace, + rightspace, + goodenough, + olddataitemstotal, + olddataitemstoleft; + bool goodenoughfound; + + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */ + newitemsz += sizeof(ItemIdData); + + /* Total free space available on a btree page, after fixed overhead */ + leftspace = rightspace = + PageGetPageSize(page) - SizeOfPageHeaderData - + MAXALIGN(sizeof(BTPageOpaqueData)); + + /* The right page will have the same high key as the old page */ + if (!P_RIGHTMOST(opaque)) + { + itemid = PageGetItemId(page, P_HIKEY); + rightspace -= (int) (MAXALIGN(ItemIdGetLength(itemid)) + + sizeof(ItemIdData)); + } + + /* Count up total space in data items without actually scanning 'em */ + olddataitemstotal = rightspace - (int) PageGetExactFreeSpace(page); + + state.newitemsz = newitemsz; + state.is_leaf = P_ISLEAF(opaque); + state.is_rightmost = P_RIGHTMOST(opaque); + state.have_split = false; + if (state.is_leaf) + state.fillfactor = RelationGetFillFactor(rel, + BTREE_DEFAULT_FILLFACTOR); + else + state.fillfactor = BTREE_NONLEAF_FILLFACTOR; + state.newitemonleft = false; /* these just to keep compiler quiet */ + state.firstright = 0; + state.best_delta = 0; + state.leftspace = leftspace; + state.rightspace = rightspace; + state.olddataitemstotal = olddataitemstotal; + state.newitemoff = newitemoff; + + /* + * Finding the best possible split would require checking all the possible + * split points, because of the high-key and left-key special cases. + * That's probably more work than it's worth; instead, stop as soon as we + * find a "good-enough" split, where good-enough is defined as an + * imbalance in free space of no more than pagesize/16 (arbitrary...) This + * should let us stop near the middle on most pages, instead of plowing to + * the end. + */ + goodenough = leftspace / 16; + + /* + * Scan through the data items and calculate space usage for a split at + * each possible position. + */ + olddataitemstoleft = 0; + goodenoughfound = false; + maxoff = PageGetMaxOffsetNumber(page); + + for (offnum = P_FIRSTDATAKEY(opaque); + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + Size itemsz; + + itemid = PageGetItemId(page, offnum); + itemsz = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData); + + /* + * Will the new item go to left or right of split? + */ + if (offnum > newitemoff) + _bt_checksplitloc(&state, offnum, true, + olddataitemstoleft, itemsz); + + else if (offnum < newitemoff) + _bt_checksplitloc(&state, offnum, false, + olddataitemstoleft, itemsz); + else + { + /* need to try it both ways! */ + _bt_checksplitloc(&state, offnum, true, + olddataitemstoleft, itemsz); + + _bt_checksplitloc(&state, offnum, false, + olddataitemstoleft, itemsz); + } + + /* Abort scan once we find a good-enough choice */ + if (state.have_split && state.best_delta <= goodenough) + { + goodenoughfound = true; + break; + } + + olddataitemstoleft += itemsz; + } + + /* + * If the new item goes as the last item, check for splitting so that all + * the old items go to the left page and the new item goes to the right + * page. 
+ */ + if (newitemoff > maxoff && !goodenoughfound) + _bt_checksplitloc(&state, newitemoff, false, olddataitemstotal, 0); + + /* + * I believe it is not possible to fail to find a feasible split, but just + * in case ... + */ + if (!state.have_split) + elog(ERROR, "could not find a feasible split point for index \"%s\"", + RelationGetRelationName(rel)); + + *newitemonleft = state.newitemonleft; + return state.firstright; +} + +/* + * Subroutine to analyze a particular possible split choice (ie, firstright + * and newitemonleft settings), and record the best split so far in *state. + * + * firstoldonright is the offset of the first item on the original page + * that goes to the right page, and firstoldonrightsz is the size of that + * tuple. firstoldonright can be > max offset, which means that all the old + * items go to the left page and only the new item goes to the right page. + * In that case, firstoldonrightsz is not used. + * + * olddataitemstoleft is the total size of all old items to the left of + * firstoldonright. + */ +static void +_bt_checksplitloc(FindSplitData *state, + OffsetNumber firstoldonright, + bool newitemonleft, + int olddataitemstoleft, + Size firstoldonrightsz) +{ + int leftfree, + rightfree; + Size firstrightitemsz; + bool newitemisfirstonright; + + /* Is the new item going to be the first item on the right page? */ + newitemisfirstonright = (firstoldonright == state->newitemoff + && !newitemonleft); + + if (newitemisfirstonright) + firstrightitemsz = state->newitemsz; + else + firstrightitemsz = firstoldonrightsz; + + /* Account for all the old tuples */ + leftfree = state->leftspace - olddataitemstoleft; + rightfree = state->rightspace - + (state->olddataitemstotal - olddataitemstoleft); + + /* + * The first item on the right page becomes the high key of the left page; + * therefore it counts against left space as well as right space. + */ + leftfree -= firstrightitemsz; + + /* account for the new item */ + if (newitemonleft) + leftfree -= (int) state->newitemsz; + else + rightfree -= (int) state->newitemsz; + + /* + * If we are not on the leaf level, we will be able to discard the key + * data from the first item that winds up on the right page. + */ + if (!state->is_leaf) + rightfree += (int) firstrightitemsz - + (int) (MAXALIGN(sizeof(IndexTupleData)) + sizeof(ItemIdData)); + + /* + * If feasible split point, remember best delta. + */ + if (leftfree >= 0 && rightfree >= 0) + { + int delta; + + if (state->is_rightmost) + { + /* + * If splitting a rightmost page, try to put (100-fillfactor)% of + * free space on left page. See comments for _bt_findsplitloc. + */ + delta = (state->fillfactor * leftfree) + - ((100 - state->fillfactor) * rightfree); + } + else + { + /* Otherwise, aim for equal free space on both sides */ + delta = leftfree - rightfree; + } + + if (delta < 0) + delta = -delta; + if (!state->have_split || delta < state->best_delta) + { + state->have_split = true; + state->newitemonleft = newitemonleft; + state->firstright = firstoldonright; + state->best_delta = delta; + } + } +} + +/* + * _bt_insert_parent() -- Insert downlink into parent after a page split. + * + * On entry, buf and rbuf are the left and right split pages, which we + * still hold write locks on per the L&Y algorithm. We release the + * write locks once we have write lock on the parent page. (Any sooner, + * and it'd be possible for some other process to try to split or delete + * one of these pages, and get confused because it cannot find the downlink.) 
+ *
+ * stack - stack showing how we got here.  May be NULL in cases that don't
+ *		have to be efficient (concurrent ROOT split, WAL recovery)
+ * is_root - we split the true root
+ * is_only - we split a page alone on its level (might have been fast root)
+ */
+static void
+_bt_insert_parent(Relation rel,
+				  Buffer buf,
+				  Buffer rbuf,
+				  BTStack stack,
+				  bool is_root,
+				  bool is_only)
+{
+	/*
+	 * Here we have to do something Lehman and Yao don't talk about: deal with
+	 * a root split and construction of a new root.  If our stack is empty
+	 * then we have just split a node on what had been the root level when we
+	 * descended the tree.  If it was still the root then we perform a
+	 * new-root construction.  If it *wasn't* the root anymore, search to find
+	 * the next higher level that someone constructed meanwhile, and find the
+	 * right place to insert as for the normal case.
+	 *
+	 * If we have to search for the parent level, we do so by re-descending
+	 * from the root.  This is not super-efficient, but it's rare enough not
+	 * to matter.
+	 */
+	if (is_root)
+	{
+		Buffer		rootbuf;
+
+		Assert(stack == NULL);
+		Assert(is_only);
+		/* create a new root node and update the metapage */
+		rootbuf = _bt_newroot(rel, buf, rbuf);
+		/* release the split buffers */
+		_bt_relbuf(rel, rootbuf);
+		_bt_relbuf(rel, rbuf);
+		_bt_relbuf(rel, buf);
+	}
+	else
+	{
+		BlockNumber bknum = BufferGetBlockNumber(buf);
+		BlockNumber rbknum = BufferGetBlockNumber(rbuf);
+		Page		page = BufferGetPage(buf);
+		IndexTuple	new_item;
+		BTStackData fakestack;
+		IndexTuple	ritem;
+		Buffer		pbuf;
+
+		if (stack == NULL)
+		{
+			BTPageOpaque lpageop;
+
+			elog(DEBUG2, "concurrent ROOT page split");
+			lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
+			/* Find the leftmost page at the next level up */
+			pbuf = _bt_get_endpoint(rel, lpageop->btpo.level + 1, false,
+									NULL);
+			/* Set up a phony stack entry pointing there */
+			stack = &fakestack;
+			stack->bts_blkno = BufferGetBlockNumber(pbuf);
+			stack->bts_offset = InvalidOffsetNumber;
+			/* bts_btentry will be initialized below */
+			stack->bts_parent = NULL;
+			_bt_relbuf(rel, pbuf);
+		}
+
+		/* get high key from left page == lowest key on new right page */
+		ritem = (IndexTuple) PageGetItem(page,
+										 PageGetItemId(page, P_HIKEY));
+
+		/* form an index tuple that points at the new right page */
+		new_item = CopyIndexTuple(ritem);
+		ItemPointerSetBlockNumber(&(new_item->t_tid), rbknum);
+
+		/*
+		 * Find the parent buffer and get the parent page.
+		 *
+		 * Oops - if we were moved right then we need to change stack item! We
+		 * want to find parent pointing to where we are, right ? - vadim
+		 * 05/27/97
+		 */
+		ItemPointerSet(&(stack->bts_btentry.t_tid), bknum, P_HIKEY);
+		pbuf = _bt_getstackbuf(rel, stack, BT_WRITE);
+
+		/*
+		 * Now we can unlock the right child.  The left child will be unlocked
+		 * by _bt_insertonpg().
+		 */
+		_bt_relbuf(rel, rbuf);
+
+		/* Check for error only after writing children */
+		if (pbuf == InvalidBuffer)
+			elog(ERROR, "failed to re-find parent key in index \"%s\" for split pages %u/%u",
+				 RelationGetRelationName(rel), bknum, rbknum);
+
+		/* Recursively update the parent */
+		_bt_insertonpg(rel, pbuf, buf, stack->bts_parent,
+					   new_item, stack->bts_offset + 1,
+					   is_only);
+
+		/* be tidy */
+		pfree(new_item);
+	}
+}
+
+/*
+ * _bt_finish_split() -- Finish an incomplete split
+ *
+ * A crash or other failure can leave a split incomplete.  The insertion
+ * routines won't allow insertion on a page that is incompletely split.
+ * Before inserting on such a page, call _bt_finish_split(). + * + * On entry, 'lbuf' must be locked in write-mode. On exit, it is unlocked + * and unpinned. + */ +void +_bt_finish_split(Relation rel, Buffer lbuf, BTStack stack) +{ + Page lpage = BufferGetPage(lbuf); + BTPageOpaque lpageop = (BTPageOpaque) PageGetSpecialPointer(lpage); + Buffer rbuf; + Page rpage; + BTPageOpaque rpageop; + bool was_root; + bool was_only; + + Assert(P_INCOMPLETE_SPLIT(lpageop)); + + /* Lock right sibling, the one missing the downlink */ + rbuf = _bt_getbuf(rel, lpageop->btpo_next, BT_WRITE); + rpage = BufferGetPage(rbuf); + rpageop = (BTPageOpaque) PageGetSpecialPointer(rpage); + + /* Could this be a root split? */ + if (!stack) + { + Buffer metabuf; + Page metapg; + BTMetaPageData *metad; + + /* acquire lock on the metapage */ + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE); + metapg = BufferGetPage(metabuf); + metad = BTPageGetMeta(metapg); + + was_root = (metad->btm_root == BufferGetBlockNumber(lbuf)); + + _bt_relbuf(rel, metabuf); + } + else + was_root = false; + + /* Was this the only page on the level before split? */ + was_only = (P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop)); + + elog(DEBUG1, "finishing incomplete split of %u/%u", + BufferGetBlockNumber(lbuf), BufferGetBlockNumber(rbuf)); + + _bt_insert_parent(rel, lbuf, rbuf, stack, was_root, was_only); +} + +/* + * _bt_getstackbuf() -- Walk back up the tree one step, and find the item + * we last looked at in the parent. + * + * This is possible because we save the downlink from the parent item, + * which is enough to uniquely identify it. Insertions into the parent + * level could cause the item to move right; deletions could cause it + * to move left, but not left of the page we previously found it in. + * + * Adjusts bts_blkno & bts_offset if changed. + * + * Returns InvalidBuffer if item not found (should not happen). + */ +Buffer +_bt_getstackbuf(Relation rel, BTStack stack, int access) +{ + BlockNumber blkno; + OffsetNumber start; + + blkno = stack->bts_blkno; + start = stack->bts_offset; + + for (;;) + { + Buffer buf; + Page page; + BTPageOpaque opaque; + + buf = _bt_getbuf(rel, blkno, access); + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + if (access == BT_WRITE && P_INCOMPLETE_SPLIT(opaque)) + { + _bt_finish_split(rel, buf, stack->bts_parent); + continue; + } + + if (!P_IGNORE(opaque)) + { + OffsetNumber offnum, + minoff, + maxoff; + ItemId itemid; + IndexTuple item; + + minoff = P_FIRSTDATAKEY(opaque); + maxoff = PageGetMaxOffsetNumber(page); + + /* + * start = InvalidOffsetNumber means "search the whole page". We + * need this test anyway due to possibility that page has a high + * key now when it didn't before. + */ + if (start < minoff) + start = minoff; + + /* + * Need this check too, to guard against possibility that page + * split since we visited it originally. + */ + if (start > maxoff) + start = OffsetNumberNext(maxoff); + + /* + * These loops will check every item on the page --- but in an + * order that's attuned to the probability of where it actually + * is. Scan to the right first, then to the left. 
+ */ + for (offnum = start; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + itemid = PageGetItemId(page, offnum); + item = (IndexTuple) PageGetItem(page, itemid); + if (BTEntrySame(item, &stack->bts_btentry)) + { + /* Return accurate pointer to where link is now */ + stack->bts_blkno = blkno; + stack->bts_offset = offnum; + return buf; + } + } + + for (offnum = OffsetNumberPrev(start); + offnum >= minoff; + offnum = OffsetNumberPrev(offnum)) + { + itemid = PageGetItemId(page, offnum); + item = (IndexTuple) PageGetItem(page, itemid); + if (BTEntrySame(item, &stack->bts_btentry)) + { + /* Return accurate pointer to where link is now */ + stack->bts_blkno = blkno; + stack->bts_offset = offnum; + return buf; + } + } + } + + /* + * The item we're looking for moved right at least one page. + */ + if (P_RIGHTMOST(opaque)) + { + _bt_relbuf(rel, buf); + return InvalidBuffer; + } + blkno = opaque->btpo_next; + start = InvalidOffsetNumber; + _bt_relbuf(rel, buf); + } +} + +/* + * _bt_newroot() -- Create a new root page for the index. + * + * We've just split the old root page and need to create a new one. + * In order to do this, we add a new root page to the file, then lock + * the metadata page and update it. This is guaranteed to be deadlock- + * free, because all readers release their locks on the metadata page + * before trying to lock the root, and all writers lock the root before + * trying to lock the metadata page. We have a write lock on the old + * root page, so we have not introduced any cycles into the waits-for + * graph. + * + * On entry, lbuf (the old root) and rbuf (its new peer) are write- + * locked. On exit, a new root page exists with entries for the + * two new children, metapage is updated and unlocked/unpinned. + * The new root buffer is returned to caller which has to unlock/unpin + * lbuf, rbuf & rootbuf. + */ +static Buffer +_bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) +{ + Buffer rootbuf; + Page lpage, + rootpage; + BlockNumber lbkno, + rbkno; + BlockNumber rootblknum; + BTPageOpaque rootopaque; + BTPageOpaque lopaque; + ItemId itemid; + IndexTuple item; + IndexTuple left_item; + Size left_item_sz; + IndexTuple right_item; + Size right_item_sz; + Buffer metabuf; + Page metapg; + BTMetaPageData *metad; + + lbkno = BufferGetBlockNumber(lbuf); + rbkno = BufferGetBlockNumber(rbuf); + lpage = BufferGetPage(lbuf); + lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage); + + /* get a new root page */ + rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE); + rootpage = BufferGetPage(rootbuf); + rootblknum = BufferGetBlockNumber(rootbuf); + + /* acquire lock on the metapage */ + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE); + metapg = BufferGetPage(metabuf); + metad = BTPageGetMeta(metapg); + + /* + * Create downlink item for left page (old root). Since this will be the + * first item in a non-leaf page, it implicitly has minus-infinity key + * value, so we need not store any actual key in it. + */ + left_item_sz = sizeof(IndexTupleData); + left_item = (IndexTuple) palloc(left_item_sz); + left_item->t_info = left_item_sz; + ItemPointerSet(&(left_item->t_tid), lbkno, P_HIKEY); + + /* + * Create downlink item for right page. The key for it is obtained from + * the "high key" position in the left page. 
+ */ + itemid = PageGetItemId(lpage, P_HIKEY); + right_item_sz = ItemIdGetLength(itemid); + item = (IndexTuple) PageGetItem(lpage, itemid); + right_item = CopyIndexTuple(item); + ItemPointerSet(&(right_item->t_tid), rbkno, P_HIKEY); + + /* NO EREPORT(ERROR) from here till newroot op is logged */ + START_CRIT_SECTION(); + + /* set btree special data */ + rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage); + rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE; + rootopaque->btpo_flags = BTP_ROOT; + rootopaque->btpo.level = + ((BTPageOpaque) PageGetSpecialPointer(lpage))->btpo.level + 1; + rootopaque->btpo_cycleid = 0; + + /* update metapage data */ + metad->btm_root = rootblknum; + metad->btm_level = rootopaque->btpo.level; + metad->btm_fastroot = rootblknum; + metad->btm_fastlevel = rootopaque->btpo.level; + + /* + * Insert the left page pointer into the new root page. The root page is + * the rightmost page on its level so there is no "high key" in it; the + * two items will go into positions P_HIKEY and P_FIRSTKEY. + * + * Note: we *must* insert the two items in item-number order, for the + * benefit of _bt_restore_page(). + */ + if (PageAddItem(rootpage, (Item) left_item, left_item_sz, P_HIKEY, + false, false) == InvalidOffsetNumber) + elog(PANIC, "failed to add leftkey to new root page" + " while splitting block %u of index \"%s\"", + BufferGetBlockNumber(lbuf), RelationGetRelationName(rel)); + + /* + * insert the right page pointer into the new root page. + */ + if (PageAddItem(rootpage, (Item) right_item, right_item_sz, P_FIRSTKEY, + false, false) == InvalidOffsetNumber) + elog(PANIC, "failed to add rightkey to new root page" + " while splitting block %u of index \"%s\"", + BufferGetBlockNumber(lbuf), RelationGetRelationName(rel)); + + /* Clear the incomplete-split flag in the left child */ + Assert(P_INCOMPLETE_SPLIT(lopaque)); + lopaque->btpo_flags &= ~BTP_INCOMPLETE_SPLIT; + MarkBufferDirty(lbuf); + + MarkBufferDirty(rootbuf); + MarkBufferDirty(metabuf); + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + xl_btree_newroot xlrec; + XLogRecPtr recptr; + xl_btree_metadata md; + + xlrec.rootblk = rootblknum; + xlrec.level = metad->btm_level; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfBtreeNewroot); + + XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT); + XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD); + XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD); + + md.root = rootblknum; + md.level = metad->btm_level; + md.fastroot = rootblknum; + md.fastlevel = metad->btm_level; + + XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata)); + + /* + * Direct access to page is not good but faster - we should implement + * some new func in page API. + */ + XLogRegisterBufData(0, + (char *) rootpage + ((PageHeader) rootpage)->pd_upper, + ((PageHeader) rootpage)->pd_special - + ((PageHeader) rootpage)->pd_upper); + + recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT); + + PageSetLSN(lpage, recptr); + PageSetLSN(rootpage, recptr); + PageSetLSN(metapg, recptr); + } + + END_CRIT_SECTION(); + + /* done with metapage */ + _bt_relbuf(rel, metabuf); + + pfree(left_item); + pfree(right_item); + + return rootbuf; +} + +/* + * _bt_pgaddtup() -- add a tuple to a particular page in the index. + * + * This routine adds the tuple to the page as requested. It does + * not affect pin/lock status, but you'd better have a write lock + * and pin on the target buffer! Don't forget to write and release + * the buffer afterwards, either. 
+ * + * The main difference between this routine and a bare PageAddItem call + * is that this code knows that the leftmost index tuple on a non-leaf + * btree page doesn't need to have a key. Therefore, it strips such + * tuples down to just the tuple header. CAUTION: this works ONLY if + * we insert the tuples in order, so that the given itup_off does + * represent the final position of the tuple! + */ +bool +_bt_pgaddtup(Page page, + Size itemsize, + IndexTuple itup, + OffsetNumber itup_off) +{ + BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page); + IndexTupleData trunctuple; + + if (!P_ISLEAF(opaque) && itup_off == P_FIRSTDATAKEY(opaque)) + { + trunctuple = *itup; + trunctuple.t_info = sizeof(IndexTupleData); + itup = &trunctuple; + itemsize = sizeof(IndexTupleData); + } + + if (PageAddItem(page, (Item) itup, itemsize, itup_off, + false, false) == InvalidOffsetNumber) + return false; + + return true; +} + +/* + * _bt_isequal - used in _bt_doinsert in check for duplicates. + * + * This is very similar to _bt_compare, except for NULL handling. + * Rule is simple: NOT_NULL not equal NULL, NULL not equal NULL too. + */ +static bool +_bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum, + int keysz, ScanKey scankey) +{ + IndexTuple itup; + int i; + + /* Better be comparing to a leaf item */ + Assert(P_ISLEAF((BTPageOpaque) PageGetSpecialPointer(page))); + + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); + + for (i = 1; i <= keysz; i++) + { + AttrNumber attno; + Datum datum; + bool isNull; + int32 result; + + attno = scankey->sk_attno; + Assert(attno == i); + datum = index_getattr(itup, attno, itupdesc, &isNull); + + /* NULLs are never equal to anything */ + if (isNull || (scankey->sk_flags & SK_ISNULL)) + return false; + + result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func, + scankey->sk_collation, + datum, + scankey->sk_argument)); + + if (result != 0) + return false; + + scankey++; + } + + /* if we get here, the keys are equal */ + return true; +} + +/* + * _bt_vacuum_one_page - vacuum just one index page. + * + * Try to remove LP_DEAD items from the given page. The passed buffer + * must be exclusive-locked, but unlike a real VACUUM, we don't need a + * super-exclusive "cleanup" lock (see nbtree/README). + */ +static void +_bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel) +{ + OffsetNumber deletable[MaxOffsetNumber]; + int ndeletable = 0; + OffsetNumber offnum, + minoff, + maxoff; + Page page = BufferGetPage(buffer); + BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + /* + * Scan over all items to see which ones need to be deleted according to + * LP_DEAD flags. + */ + minoff = P_FIRSTDATAKEY(opaque); + maxoff = PageGetMaxOffsetNumber(page); + for (offnum = minoff; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemId = PageGetItemId(page, offnum); + + if (ItemIdIsDead(itemId)) + deletable[ndeletable++] = offnum; + } + + if (ndeletable > 0) + _bt_delitems_delete(rel, buffer, deletable, ndeletable, heapRel); + + /* + * Note: if we didn't find any LP_DEAD items, then the page's + * BTP_HAS_GARBAGE hint bit is falsely set. We do not bother expending a + * separate write to clear it, however. We will clear it when we split + * the page. 
+ */
+}
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index e6bfb18e7b..6d3637921c 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -985,7 +985,7 @@ _bt_lock_branch_parent(Relation rel, BlockNumber child, BTStack stack,
 	 * Locate the downlink of "child" in the parent (updating the stack entry
 	 * if needed)
 	 */
-	ItemPointerSet(&(stack->bts_btentry.t_tid), child, P_HIKEY);
+	ItemPointerSetBlockNumber(&(stack->bts_btentry.t_tid), child);
 	pbuf = _bt_getstackbuf(rel, stack, BT_WRITE);
 	if (pbuf == InvalidBuffer)
 		elog(ERROR, "failed to re-find parent key in index \"%s\" for deletion target page %u",
@@ -1425,7 +1425,7 @@ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack)
 
 	itemid = PageGetItemId(page, topoff);
 	itup = (IndexTuple) PageGetItem(page, itemid);
-	ItemPointerSet(&(itup->t_tid), rightsib, P_HIKEY);
+	ItemPointerSetBlockNumber(&(itup->t_tid), rightsib);
 
 	nextoffset = OffsetNumberNext(topoff);
 	PageIndexTupleDelete(page, nextoffset);
@@ -1444,7 +1444,7 @@ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack)
 	MemSet(&trunctuple, 0, sizeof(IndexTupleData));
 	trunctuple.t_info = sizeof(IndexTupleData);
 	if (target != leafblkno)
-		ItemPointerSet(&trunctuple.t_tid, target, P_HIKEY);
+		ItemPointerSetBlockNumber(&trunctuple.t_tid, target);
 	else
 		ItemPointerSetInvalid(&trunctuple.t_tid);
 	if (PageAddItem(page, (Item) &trunctuple, sizeof(IndexTupleData), P_HIKEY,
@@ -1763,7 +1763,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty)
 		if (nextchild == InvalidBlockNumber)
 			ItemPointerSetInvalid(leafhikey);
 		else
-			ItemPointerSet(leafhikey, nextchild, P_HIKEY);
+			ItemPointerSetBlockNumber(leafhikey, nextchild);
 	}
 
 /*
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c
index d19348a206..91441b467c 100644
--- a/src/backend/access/nbtree/nbtsort.c
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -899,7 +899,7 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
 		 * it will be that in the future. Now the purpose is just to save
 		 * more space on inner pages of btree.
 		 */
-		keytup = index_truncate_tuple(wstate->index, oitup, indnkeyatts);
+		keytup = _bt_truncate_tuple(wstate->index, oitup);
 
 		/* delete "wrong" high key, insert keytup as P_HIKEY.
 */
 		PageIndexTupleDelete(opage, P_HIKEY);
@@ -918,7 +918,7 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
 			state->btps_next = _bt_pagestate(wstate, state->btps_level + 1);
 
 		Assert(state->btps_minkey != NULL);
-		ItemPointerSet(&(state->btps_minkey->t_tid), oblkno, P_HIKEY);
+		ItemPointerSetBlockNumber(&(state->btps_minkey->t_tid), oblkno);
 		_bt_buildadd(wstate, state->btps_next, state->btps_minkey);
 		pfree(state->btps_minkey);
@@ -972,8 +972,7 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
 		 * into the parent page as a downlink
 		 */
 		if (indnkeyatts != indnatts && P_ISLEAF(pageop))
-			state->btps_minkey = index_truncate_tuple(wstate->index,
-													  itup, indnkeyatts);
+			state->btps_minkey = _bt_truncate_tuple(wstate->index, itup);
 		else
 			state->btps_minkey = CopyIndexTuple(itup);
 	}
@@ -1028,7 +1027,7 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
 		else
 		{
 			Assert(s->btps_minkey != NULL);
-			ItemPointerSet(&(s->btps_minkey->t_tid), blkno, P_HIKEY);
+			ItemPointerSetBlockNumber(&(s->btps_minkey->t_tid), blkno);
 			_bt_buildadd(wstate, s->btps_next, s->btps_minkey);
 			pfree(s->btps_minkey);
 			s->btps_minkey = NULL;
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index 2fc5924bf0..149b52e3ad 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -2078,3 +2078,23 @@ btproperty(Oid index_oid, int attno,
 			return false;		/* punt to generic code */
 	}
 }
+
+/*
+ * _bt_truncate_tuple() -- remove non-key (INCLUDE) attributes from an
+ *		index tuple.
+ *
+ * Transforms an ordinary B-tree leaf index tuple into a pivot tuple, to be
+ * used either as a high key or as a non-leaf tuple with a downlink.  Note
+ * that the t_tid offset will be overwritten in order to store the number of
+ * attributes present in the tuple.
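+ *
+ * For example, in an index built on (a, b) INCLUDE (c), a leaf tuple
+ * holding (a, b, c) is reduced to a pivot tuple holding only (a, b), whose
+ * stored attribute count is set to 2.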
+ */
+IndexTuple
+_bt_truncate_tuple(Relation idxrel, IndexTuple olditup)
+{
+	IndexTuple	newitup;
+	int			nkeyattrs = IndexRelationGetNumberOfKeyAttributes(idxrel);
+
+	newitup = index_truncate_tuple(idxrel, olditup, nkeyattrs);
+	BTreeTupSetNAtts(newitup, nkeyattrs);
+
+	return newitup;
+}
diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c
index bbfe860e36..e09a389181 100644
--- a/src/backend/access/nbtree/nbtxlog.c
+++ b/src/backend/access/nbtree/nbtxlog.c
@@ -764,7 +764,7 @@ btree_xlog_mark_page_halfdead(uint8 info, XLogReaderState *record)
 
 	itemid = PageGetItemId(page, poffset);
 	itup = (IndexTuple) PageGetItem(page, itemid);
-	ItemPointerSet(&(itup->t_tid), rightsib, P_HIKEY);
+	ItemPointerSetBlockNumber(&(itup->t_tid), rightsib);
 
 	nextoffset = OffsetNumberNext(poffset);
 	PageIndexTupleDelete(page, nextoffset);
@@ -794,7 +794,7 @@ btree_xlog_mark_page_halfdead(uint8 info, XLogReaderState *record)
 	MemSet(&trunctuple, 0, sizeof(IndexTupleData));
 	trunctuple.t_info = sizeof(IndexTupleData);
 	if (xlrec->topparent != InvalidBlockNumber)
-		ItemPointerSet(&trunctuple.t_tid, xlrec->topparent, P_HIKEY);
+		ItemPointerSetBlockNumber(&trunctuple.t_tid, xlrec->topparent);
 	else
 		ItemPointerSetInvalid(&trunctuple.t_tid);
 	if (PageAddItem(page, (Item) &trunctuple, sizeof(IndexTupleData), P_HIKEY,
@@ -904,7 +904,7 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record)
 	MemSet(&trunctuple, 0, sizeof(IndexTupleData));
 	trunctuple.t_info = sizeof(IndexTupleData);
 	if (xlrec->topparent != InvalidBlockNumber)
-		ItemPointerSet(&trunctuple.t_tid, xlrec->topparent, P_HIKEY);
+		ItemPointerSetBlockNumber(&trunctuple.t_tid, xlrec->topparent);
 	else
 		ItemPointerSetInvalid(&trunctuple.t_tid);
 	if (PageAddItem(page, (Item) &trunctuple, sizeof(IndexTupleData), P_HIKEY,
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 053f8aa345..6d6b22fafb 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -151,11 +151,8 @@ typedef struct BTMetaPageData
  * as unique identifier for a given index tuple (logical position
  * within a level). - vadim 04/09/97
  */
-#define BTTidSame(i1, i2) \
-	((ItemPointerGetBlockNumber(&(i1)) == ItemPointerGetBlockNumber(&(i2))) && \
-	(ItemPointerGetOffsetNumber(&(i1)) == ItemPointerGetOffsetNumber(&(i2))))
 #define BTEntrySame(i1, i2) \
-	BTTidSame((i1)->t_tid, (i2)->t_tid)
+	((ItemPointerGetBlockNumber(&(i1)->t_tid) == ItemPointerGetBlockNumber(&(i2)->t_tid)))
 
 /*
@@ -206,6 +203,33 @@ typedef struct BTMetaPageData
 
 #define P_FIRSTDATAKEY(opaque)	(P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY)
 
+/*
+ * In a B-tree index with an INCLUDE clause, the pivot tuples used in
+ * non-leaf pages and as high keys are truncated, so they don't contain the
+ * included attributes.  In order to keep on-disk compatibility with the
+ * upcoming suffix truncation of pivot tuples, we store the number of
+ * attributes present inside the tuple itself.  Thankfully, the offset
+ * number is always unused in a pivot tuple, so we use the high bit of the
+ * offset as a flag indicating that the offset has an alternative meaning:
+ * it stores the number of key attributes present in the index tuple
+ * (12 bits are more than enough for that), and 3 bits remain reserved for
+ * future use.
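+ *
+ * For example, a pivot tuple truncated to two key attributes stores 0x8002
+ * in its t_tid offset: the high bit (BT_ALT_OFFSET_FLAG) marks the
+ * alternative meaning, and the low 12 bits (BT_N_KEYS_OFFSET_MASK) hold
+ * the attribute count itself.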
+ */
+#define BT_ALT_OFFSET_FLAG		0x8000	/* flag indicating that the t_tid
+										 * offset has an alternative meaning */
+#define BT_N_KEYS_OFFSET_MASK	0x0FFF	/* mask of the t_tid offset bits
+										 * holding the number of attributes
+										 * actually present in the tuple */
+
+/* Set the number of attributes in a B-tree index tuple, overwriting the
+ * t_tid offset */
+#define BTreeTupSetNAtts(itup, n) \
+	ItemPointerSetOffsetNumber(&(itup)->t_tid, (n) | BT_ALT_OFFSET_FLAG)
+/* Get the number of attributes in a B-tree index tuple */
+#define BtreeTupGetNAtts(itup, index)	\
+	(ItemPointerGetOffsetNumber(&(itup)->t_tid) & BT_ALT_OFFSET_FLAG ? \
+	 ItemPointerGetOffsetNumber(&(itup)->t_tid) & BT_N_KEYS_OFFSET_MASK : \
+	 IndexRelationGetNumberOfAttributes(index))
+
+
 /*
  * Operator strategy numbers for B-tree have been moved to access/stratnum.h,
  * because many places need to use them in ScanKeyInit() calls.
@@ -545,6 +569,7 @@ extern bytea *btoptions(Datum reloptions, bool validate);
 extern bool btproperty(Oid index_oid, int attno,
 		   IndexAMProperty prop, const char *propname,
 		   bool *res, bool *isnull);
+extern IndexTuple _bt_truncate_tuple(Relation idxrel, IndexTuple olditup);
 
 /*
  * prototypes for functions in nbtvalidate.c
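To see the attribute-count encoding end to end, here is a minimal standalone C sketch of the same flag-and-mask scheme defined in nbtree.h above. It is not part of the patch: the MiniTuple type and the mini_* helpers are hypothetical stand-ins for the real IndexTuple and ItemPointer machinery, used only for illustration.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-ins for BT_ALT_OFFSET_FLAG and BT_N_KEYS_OFFSET_MASK above */
#define ALT_OFFSET_FLAG		0x8000u
#define N_KEYS_OFFSET_MASK	0x0FFFu

/* Simplified index tuple: only the 16-bit t_tid offset field matters here */
typedef struct MiniTuple
{
	uint16_t	tid_offset;
} MiniTuple;

/* Analog of BTreeTupSetNAtts: stash the attribute count in the offset */
static void
mini_set_natts(MiniTuple *tup, uint16_t natts)
{
	assert(natts <= N_KEYS_OFFSET_MASK);
	tup->tid_offset = (uint16_t) (natts | ALT_OFFSET_FLAG);
}

/*
 * Analog of BtreeTupGetNAtts: if the flag bit is set, the low 12 bits hold
 * the count; otherwise the tuple predates the encoding and keeps a plain
 * offset there, so fall back to the index's total attribute count.
 */
static uint16_t
mini_get_natts(const MiniTuple *tup, uint16_t index_natts)
{
	if (tup->tid_offset & ALT_OFFSET_FLAG)
		return (uint16_t) (tup->tid_offset & N_KEYS_OFFSET_MASK);
	return index_natts;
}

int
main(void)
{
	MiniTuple	pivot = {0};
	MiniTuple	legacy = {7};	/* pre-encoding tuple: plain offset, flag clear */

	mini_set_natts(&pivot, 2);	/* pivot truncated to 2 key attributes */

	printf("raw offset: 0x%04x\n", (unsigned) pivot.tid_offset);		/* 0x8002 */
	printf("pivot natts: %u\n", (unsigned) mini_get_natts(&pivot, 4));	/* 2 */
	printf("legacy natts: %u\n", (unsigned) mini_get_natts(&legacy, 4));	/* 4 */
	return 0;
}

Run as written, the sketch prints 0x8002, 2, and 4, mirroring what BTreeTupSetNAtts and BtreeTupGetNAtts would report for a truncated pivot tuple and for a tuple written before this encoding existed; the fallback branch is what keeps old on-disk tuples readable.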