From 8d6a512d91099e7ba298734ba3de858ad89813de Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Fri, 27 Apr 2018 12:47:39 -0700
Subject: [PATCH v14 2/7] Make heap TID a tie-breaker nbtree index column.

Make nbtree treat all index tuples as having a heap TID attribute.
Index searches can distinguish duplicates by heap TID, since heap TID is
always guaranteed to be unique.  This general approach has numerous
benefits for performance, and is a prerequisite to teaching VACUUM to
perform "retail index tuple deletion".

Naively adding a new attribute to every pivot tuple has unacceptable
overhead (it bloats internal pages), so suffix truncation of pivot
tuples is added.  This will usually truncate away the "extra" heap TID
attribute from pivot tuples during a leaf page split, and may also
truncate away additional user attributes.  This can increase fan-out,
especially in a multi-column index.  Truncation can only occur at the
attribute granularity, which isn't particularly effective, but works
well enough for now.

Only new indexes (BTREE_VERSION 4 indexes) will have insertions that
treat heap TID as a tie-breaker attribute, or will have pivot tuples
undergo suffix truncation during a leaf page split (on-disk
compatibility with versions 2 and 3 is preserved).  Upgrades to version
4 cannot be performed on-the-fly, unlike upgrades from version 2 to
version 3.  contrib/amcheck continues to work with version 2 and 3
indexes, while also enforcing the newer/more strict invariants with
version 4 indexes.

A later patch will enhance the logic used by nbtree to pick a split
point.  On its own, this patch is likely to hurt performance, because
it does not yet make smarter choices about exactly where to split leaf
pages.  Keeping these two mostly-distinct sets of enhancements in
separate commits seems like it might clarify their design, even though
neither commit is particularly useful on its own.

The maximum allowed size of new tuples is reduced by an amount equal to
the space required to store an extra MAXALIGN()'d item pointer in a new
high key during leaf page splits.  The user-facing definition of the
"1/3 of a page" restriction is already imprecise, and so does not need
to be revised.  However, there should be a compatibility note in the v12
release notes.  The new maximum allowed size is 2704 bytes on 64-bit
systems, down from 2712 bytes.
---
 contrib/amcheck/expected/check_btree.out     |   5 +-
 contrib/amcheck/sql/check_btree.sql          |   5 +-
 contrib/amcheck/verify_nbtree.c              | 344 +++++++++++++--
 contrib/pageinspect/btreefuncs.c             |   2 +-
 contrib/pageinspect/expected/btree.out       |   2 +-
 contrib/pgstattuple/expected/pgstattuple.out |  10 +-
 doc/src/sgml/indices.sgml                    |  24 +-
 src/backend/access/common/indextuple.c       |   6 +-
 src/backend/access/nbtree/README             | 160 ++++---
 src/backend/access/nbtree/nbtinsert.c        | 326 +++++++++-----
 src/backend/access/nbtree/nbtpage.c          | 196 ++++++---
 src/backend/access/nbtree/nbtree.c           |   2 +-
 src/backend/access/nbtree/nbtsearch.c        | 103 ++++-
 src/backend/access/nbtree/nbtsort.c          |  91 ++--
 src/backend/access/nbtree/nbtutils.c         | 433 +++++++++++++++++--
 src/backend/access/nbtree/nbtxlog.c          |  43 +-
 src/backend/access/rmgrdesc/nbtdesc.c        |   8 -
 src/backend/utils/sort/tuplesort.c           |  13 +-
 src/include/access/nbtree.h                  | 215 +++++++--
 src/include/access/nbtxlog.h                 |  35 +-
 src/test/regress/expected/btree_index.out    |  34 +-
 src/test/regress/expected/create_index.out   |  13 +-
 src/test/regress/expected/dependency.out     |   4 +-
 src/test/regress/expected/event_trigger.out  |   4 +-
 src/test/regress/expected/foreign_data.out   |   8 +-
 src/test/regress/expected/rowsecurity.out    |   4 +-
 src/test/regress/sql/btree_index.sql         |  37 +-
 src/test/regress/sql/create_index.sql        |  14 +-
 28 files changed, 1611 insertions(+), 530 deletions(-)

diff --git a/contrib/amcheck/expected/check_btree.out b/contrib/amcheck/expected/check_btree.out
index ef5c9e1a1c..1e6079ddd2 100644
--- a/contrib/amcheck/expected/check_btree.out
+++ b/contrib/amcheck/expected/check_btree.out
@@ -130,9 +130,12 @@ SELECT bt_index_parent_check('bttest_multi_idx', true);
 --
 INSERT INTO delete_test_table SELECT i, 1, 2, 3 FROM generate_series(1,80000) i;
 ALTER TABLE delete_test_table ADD PRIMARY KEY (a,b,c,d);
+-- Delete many entries, and vacuum. This causes page deletions.
 DELETE FROM delete_test_table WHERE a > 40000;
 VACUUM delete_test_table;
-DELETE FROM delete_test_table WHERE a > 10;
+-- Delete most entries, and vacuum, deleting internal pages and creating "fast
+-- root"
+DELETE FROM delete_test_table WHERE a < 79990;
 VACUUM delete_test_table;
 SELECT bt_index_parent_check('delete_test_table_pkey', true);
  bt_index_parent_check 
diff --git a/contrib/amcheck/sql/check_btree.sql b/contrib/amcheck/sql/check_btree.sql
index 0ad1631476..3f1e0d17ef 100644
--- a/contrib/amcheck/sql/check_btree.sql
+++ b/contrib/amcheck/sql/check_btree.sql
@@ -82,9 +82,12 @@ SELECT bt_index_parent_check('bttest_multi_idx', true);
 --
 INSERT INTO delete_test_table SELECT i, 1, 2, 3 FROM generate_series(1,80000) i;
 ALTER TABLE delete_test_table ADD PRIMARY KEY (a,b,c,d);
+-- Delete many entries, and vacuum. This causes page deletions.
 DELETE FROM delete_test_table WHERE a > 40000;
 VACUUM delete_test_table;
-DELETE FROM delete_test_table WHERE a > 10;
+-- Delete most entries, and vacuum, deleting internal pages and creating "fast
+-- root"
+DELETE FROM delete_test_table WHERE a < 79990;
 VACUUM delete_test_table;
 SELECT bt_index_parent_check('delete_test_table_pkey', true);
 
diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c
index 053ac9d192..0a005afa34 100644
--- a/contrib/amcheck/verify_nbtree.c
+++ b/contrib/amcheck/verify_nbtree.c
@@ -45,6 +45,8 @@ PG_MODULE_MAGIC;
  * block per level, which is bound by the range of BlockNumber:
  */
 #define InvalidBtreeLevel	((uint32) InvalidBlockNumber)
+#define BTreeTupleGetNKeyAtts(itup, rel)   \
+	Min(IndexRelationGetNumberOfKeyAttributes(rel), BTreeTupleGetNAtts(itup, rel))
 
 /*
  * State associated with verifying a B-Tree index
@@ -66,6 +68,8 @@ typedef struct BtreeCheckState
 	/* B-Tree Index Relation and associated heap relation */
 	Relation	rel;
 	Relation	heaprel;
+	/* rel is heapkeyspace index? */
+	bool		heapkeyspace;
 	/* ShareLock held on heap/index, rather than AccessShareLock? */
 	bool		readonly;
 	/* Also verifying heap has no unindexed tuples? */
@@ -122,7 +126,7 @@ static void bt_index_check_internal(Oid indrelid, bool parentcheck,
 						bool heapallindexed);
 static inline void btree_index_checkable(Relation rel);
 static void bt_check_every_level(Relation rel, Relation heaprel,
-					 bool readonly, bool heapallindexed);
+					 bool heapkeyspace, bool readonly, bool heapallindexed);
 static BtreeLevel bt_check_level_from_leftmost(BtreeCheckState *state,
 							 BtreeLevel level);
 static void bt_target_page_check(BtreeCheckState *state);
@@ -137,17 +141,22 @@ static IndexTuple bt_normalize_tuple(BtreeCheckState *state,
 						   IndexTuple itup);
 static inline bool offset_is_negative_infinity(BTPageOpaque opaque,
 							OffsetNumber offset);
+static inline bool invariant_l_offset(BtreeCheckState *state, BTScanInsert key,
+				   OffsetNumber upperbound);
 static inline bool invariant_leq_offset(BtreeCheckState *state,
 					 BTScanInsert key,
 					 OffsetNumber upperbound);
-static inline bool invariant_geq_offset(BtreeCheckState *state,
-					 BTScanInsert key,
-					 OffsetNumber lowerbound);
-static inline bool invariant_leq_nontarget_offset(BtreeCheckState *state,
-							   BTScanInsert key,
-							   Page nontarget,
-							   OffsetNumber upperbound);
+static inline bool invariant_g_offset(BtreeCheckState *state, BTScanInsert key,
+				   OffsetNumber lowerbound);
+static inline bool invariant_l_nontarget_offset(BtreeCheckState *state,
+							 BTScanInsert key,
+							 Page nontarget,
+							 OffsetNumber upperbound);
 static Page palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum);
+static inline BTScanInsert bt_mkscankey_minusinfkey(Relation rel,
+													IndexTuple itup);
+static inline ItemPointer BTreeTupleGetHeapTIDCareful(BtreeCheckState *state,
+							IndexTuple itup, bool nonpivot);
 
 /*
  * bt_index_check(index regclass, heapallindexed boolean)
@@ -204,6 +213,7 @@ bt_index_check_internal(Oid indrelid, bool parentcheck, bool heapallindexed)
 	Oid			heapid;
 	Relation	indrel;
 	Relation	heaprel;
+	bool		heapkeyspace;
 	LOCKMODE	lockmode;
 
 	if (parentcheck)
@@ -254,7 +264,9 @@ bt_index_check_internal(Oid indrelid, bool parentcheck, bool heapallindexed)
 	btree_index_checkable(indrel);
 
 	/* Check index, possibly against table it is an index on */
-	bt_check_every_level(indrel, heaprel, parentcheck, heapallindexed);
+	heapkeyspace = _bt_heapkeyspace(indrel);
+	bt_check_every_level(indrel, heaprel, heapkeyspace, parentcheck,
+						 heapallindexed);
 
 	/*
 	 * Release locks early. That's ok here because nothing in the called
@@ -324,8 +336,8 @@ btree_index_checkable(Relation rel)
  * parent/child check cannot be affected.)
  */
 static void
-bt_check_every_level(Relation rel, Relation heaprel, bool readonly,
-					 bool heapallindexed)
+bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace,
+					 bool readonly, bool heapallindexed)
 {
 	BtreeCheckState *state;
 	Page		metapage;
@@ -346,6 +358,7 @@ bt_check_every_level(Relation rel, Relation heaprel, bool readonly,
 	state = palloc0(sizeof(BtreeCheckState));
 	state->rel = rel;
 	state->heaprel = heaprel;
+	state->heapkeyspace = heapkeyspace;
 	state->readonly = readonly;
 	state->heapallindexed = heapallindexed;
 
@@ -806,7 +819,8 @@ bt_target_page_check(BtreeCheckState *state)
 	 * doesn't contain a high key, so nothing to check
 	 */
 	if (!P_RIGHTMOST(topaque) &&
-		!_bt_check_natts(state->rel, state->target, P_HIKEY))
+		!_bt_check_natts(state->rel, state->heapkeyspace, state->target,
+						 P_HIKEY))
 	{
 		ItemId		itemid;
 		IndexTuple	itup;
@@ -839,6 +853,7 @@ bt_target_page_check(BtreeCheckState *state)
 		IndexTuple	itup;
 		size_t		tupsize;
 		BTScanInsert skey;
+		bool		lowersizelimit;
 
 		CHECK_FOR_INTERRUPTS();
 
@@ -865,7 +880,8 @@ bt_target_page_check(BtreeCheckState *state)
 					 errhint("This could be a torn page problem.")));
 
 		/* Check the number of index tuple attributes */
-		if (!_bt_check_natts(state->rel, state->target, offset))
+		if (!_bt_check_natts(state->rel, state->heapkeyspace, state->target,
+							 offset))
 		{
 			char	   *itid,
 					   *htid;
@@ -906,7 +922,56 @@ bt_target_page_check(BtreeCheckState *state)
 			continue;
 
 		/* Build insertion scankey for current page offset */
-		skey = _bt_mkscankey(state->rel, itup);
+		skey = bt_mkscankey_minusinfkey(state->rel, itup);
+
+		/*
+		 * Make sure tuple size does not exceed the relevant BTREE_VERSION
+		 * specific limit.
+		 *
+		 * BTREE_VERSION 4 (which introduced heapkeyspace rules) requisitioned
+		 * a small amount of space from BTMaxItemSize() in order to ensure
+		 * that suffix truncation always has enough space to add an explicit
+		 * heap TID back to a tuple -- we pessimistically assume that every
+		 * newly inserted tuple will eventually need to have a heap TID
+		 * appended during a future leaf page split, when the tuple becomes
+		 * the basis of the new high key (pivot tuple) for the leaf page.
+		 *
+		 * Since the reclaimed space is reserved for that purpose, we must not
+		 * enforce the slightly lower limit when the extra space has been used
+		 * as intended.  In other words, there is only a cross-version
+		 * difference in the limit on tuple size within leaf pages.
+		 *
+		 * Still, we're particular about the details within BTREE_VERSION 4
+		 * internal pages.  Pivot tuples may only use the extra space for its
+		 * designated purpose.  Enforce the lower limit for pivot tuples when
+		 * an explicit heap TID isn't actually present. (In all other cases
+		 * suffix truncation is guaranteed to generate a pivot tuple that's no
+		 * larger than the first right tuple provided to it by its caller.)
+		 */
+		lowersizelimit = skey->heapkeyspace &&
+			(P_ISLEAF(topaque) || BTreeTupleGetHeapTID(itup) == NULL);
+		if (tupsize > (lowersizelimit ? BTMaxItemSize(state->target) :
+					   BTMaxItemSizeNoHeapTid(state->target)))
+		{
+			char	   *itid,
+					   *htid;
+
+			itid = psprintf("(%u,%u)", state->targetblock, offset);
+			htid = psprintf("(%u,%u)",
+							ItemPointerGetBlockNumberNoCheck(&(itup->t_tid)),
+							ItemPointerGetOffsetNumberNoCheck(&(itup->t_tid)));
+
+			ereport(ERROR,
+					(errcode(ERRCODE_INDEX_CORRUPTED),
+					 errmsg("index row size %zu exceeds maximum for index \"%s\"",
+							tupsize, RelationGetRelationName(state->rel)),
+					 errdetail_internal("Index tid=%s points to %s tid=%s page lsn=%X/%X.",
+										itid,
+										P_ISLEAF(topaque) ? "heap" : "index",
+										htid,
+										(uint32) (state->targetlsn >> 32),
+										(uint32) state->targetlsn)));
+		}
 
 		/* Fingerprint leaf page tuples (those that point to the heap) */
 		if (state->heapallindexed && P_ISLEAF(topaque) && !ItemIdIsDead(itemid))
@@ -940,9 +1005,35 @@ bt_target_page_check(BtreeCheckState *state)
 		 * grandparents (as well as great-grandparents, and so on).  We don't
 		 * go to those lengths because that would be prohibitively expensive,
 		 * and probably not markedly more effective in practice.
+		 *
+		 * On the leaf level, we check that the key is <= the highkey.
+		 * However, on non-leaf levels we check that the key is < the highkey,
+		 * because the high key is "just another separator" rather than a copy
+		 * of some existing key item; we expect it to be unique among all keys
+		 * on the same level.  (Suffix truncation will sometimes produce a
+		 * leaf highkey that is an untruncated copy of the lastleft item, but
+		 * never any other item, which necessitates weakening the leaf level
+		 * check to <=.)
+		 *
+		 * Full explanation for why a highkey is never truly a copy of another
+		 * item from the same level on internal levels:
+		 *
+		 * While the new left page's high key is copied from the first offset
+		 * on the right page during an internal page split, that's not the
+		 * full story.  In effect, internal pages are split in the middle of
+		 * the firstright tuple, not between the would-be lastleft and
+		 * firstright tuples: the firstright key ends up on the left side as
+		 * left's new highkey, and the firstright downlink ends up on the
+		 * right side as right's new "negative infinity" item.  The negative
+		 * infinity tuple is truncated to zero attributes, so we're only left
+		 * with the downlink.  In other words, the copying is just an
+		 * implementation detail of splitting in the middle of a (pivot)
+		 * tuple. (See also: "Notes About Data Representation" in the nbtree
+		 * README.)
 		 */
 		if (!P_RIGHTMOST(topaque) &&
-			!invariant_leq_offset(state, skey, P_HIKEY))
+			!(P_ISLEAF(topaque) ? invariant_leq_offset(state, skey, P_HIKEY) :
+			  invariant_l_offset(state, skey, P_HIKEY)))
 		{
 			char	   *itid,
 					   *htid;
@@ -968,11 +1059,10 @@ bt_target_page_check(BtreeCheckState *state)
 		 * * Item order check *
 		 *
 		 * Check that items are stored on page in logical order, by checking
-		 * current item is less than or equal to next item (if any).
+		 * current item is strictly less than next item (if any).
 		 */
 		if (OffsetNumberNext(offset) <= max &&
-			!invariant_leq_offset(state, skey,
-								  OffsetNumberNext(offset)))
+			!invariant_l_offset(state, skey, OffsetNumberNext(offset)))
 		{
 			char	   *itid,
 					   *htid,
@@ -1035,7 +1125,7 @@ bt_target_page_check(BtreeCheckState *state)
 			rightkey = bt_right_page_check_scankey(state);
 
 			if (rightkey &&
-				!invariant_geq_offset(state, rightkey, max))
+				!invariant_g_offset(state, rightkey, max))
 			{
 				/*
 				 * As explained at length in bt_right_page_check_scankey(),
@@ -1213,9 +1303,9 @@ bt_right_page_check_scankey(BtreeCheckState *state)
 	 * continued existence of target block as non-ignorable (not half-dead or
 	 * deleted) implies that target page was not merged into from the right by
 	 * deletion; the key space at or after target never moved left.  Target's
-	 * parent either has the same downlink to target as before, or a <=
+	 * parent either has the same downlink to target as before, or a <
 	 * downlink due to deletion at the left of target.  Target either has the
-	 * same highkey as before, or a highkey <= before when there is a page
+	 * same highkey as before, or a highkey < before when there is a page
 	 * split. (The rightmost concurrently-split-from-target-page page will
 	 * still have the same highkey as target was originally found to have,
 	 * which for our purposes is equivalent to target's highkey itself never
@@ -1304,7 +1394,7 @@ bt_right_page_check_scankey(BtreeCheckState *state)
 	 * memory remaining allocated.
 	 */
 	firstitup = (IndexTuple) PageGetItem(rightpage, rightitem);
-	return _bt_mkscankey(state->rel, firstitup);
+	return bt_mkscankey_minusinfkey(state->rel, firstitup);
 }
 
 /*
@@ -1367,7 +1457,8 @@ bt_downlink_check(BtreeCheckState *state, BTScanInsert targetkey,
 
 	/*
 	 * Verify child page has the downlink key from target page (its parent) as
-	 * a lower bound.
+	 * a lower bound; downlink must be strictly less than all keys on the
+	 * page.
 	 *
 	 * Check all items, rather than checking just the first and trusting that
 	 * the operator class obeys the transitive law.
@@ -1416,14 +1507,29 @@ bt_downlink_check(BtreeCheckState *state, BTScanInsert targetkey,
 	{
 		/*
 		 * Skip comparison of target page key against "negative infinity"
-		 * item, if any.  Checking it would indicate that it's not an upper
-		 * bound, but that's only because of the hard-coding within
-		 * _bt_compare().
+		 * item, if any.  Checking it would indicate that it's not a strict
+		 * lower bound, but that's only because of the hard-coding for
+		 * negative infinity items within _bt_compare().
+		 *
+		 * If nbtree didn't truncate negative infinity tuples during internal
+		 * page splits then we'd expect child's negative infinity key to be
+		 * equal to the scankey/downlink from target/parent (it would be a
+		 * "low key" in this hypothetical scenario, and so it would still need
+		 * to be treated as a special case here).
+		 *
+		 * Negative infinity items can be thought of as a strict lower bound
+		 * that works transitively, with the last non-negative-infinity pivot
+		 * followed during a descent from the root as its "true" strict lower
+		 * bound.  Only a small number of negative infinity items are truly
+		 * negative infinity; those that are the first items of leftmost
+		 * internal pages.  In more general terms, a negative infinity item is
+		 * only negative infinity with respect to the subtree that the page is
+		 * at the root of.
 		 */
 		if (offset_is_negative_infinity(copaque, offset))
 			continue;
 
-		if (!invariant_leq_nontarget_offset(state, targetkey, child, offset))
+		if (!invariant_l_nontarget_offset(state, targetkey, child, offset))
 			ereport(ERROR,
 					(errcode(ERRCODE_INDEX_CORRUPTED),
 					 errmsg("down-link lower bound invariant violated for index \"%s\"",
@@ -1855,6 +1961,66 @@ offset_is_negative_infinity(BTPageOpaque opaque, OffsetNumber offset)
 	return !P_ISLEAF(opaque) && offset == P_FIRSTDATAKEY(opaque);
 }
 
+/*
+ * Does the invariant hold that the key is strictly less than a given upper
+ * bound offset item on the target page?
+ *
+ * If this function returns false, convention is that caller throws error due
+ * to corruption.
+ */
+static inline bool
+invariant_l_offset(BtreeCheckState *state, BTScanInsert key,
+				   OffsetNumber upperbound)
+{
+	int32		cmp;
+
+	/* pg_upgrade'd indexes may legally have equal sibling tuples */
+	if (!key->heapkeyspace)
+		return invariant_leq_offset(state, key, upperbound);
+
+	cmp = _bt_compare(state->rel, key, state->target, upperbound);
+
+	/*
+	 * _bt_compare() can determine that a scankey with a filled-out attribute
+	 * is greater than a pivot tuple when the comparison is resolved at an
+	 * attribute that was truncated away from the pivot (its value there is
+	 * minus infinity).  It can even determine that a "minus infinity value"
+	 * from a "minusinfkey" scankey is equal to a pivot's truncated attribute.
+	 * What it cannot determine is that a scankey ("minusinfkey" or otherwise)
+	 * is _less than_ a tuple when that is only apparent from an attribute
+	 * that is minus infinity in the _scankey_.  Teaching _bt_compare() to do
+	 * so would add overhead to index scans, so an extra step is taken here
+	 * instead: when the comparison comes out equal, the key only counts as
+	 * strictly less if the tuple has more key attributes than the key, or
+	 * the same number plus a heap TID that the key (scantid == NULL) lacks.
+	 */
+	if (cmp == 0)
+	{
+		BTPageOpaque topaque;
+		ItemId		itemid;
+		IndexTuple	ritup;
+		int			uppnkeyatts;
+		ItemPointer rheaptid;
+		bool		nonpivot;
+
+		itemid = PageGetItemId(state->target, upperbound);
+		ritup = (IndexTuple) PageGetItem(state->target, itemid);
+		topaque = (BTPageOpaque) PageGetSpecialPointer(state->target);
+		nonpivot = P_ISLEAF(topaque) && upperbound >= P_FIRSTDATAKEY(topaque);
+
+		/* Get number of key attributes and heap TID for upper bound item */
+		uppnkeyatts = BTreeTupleGetNKeyAtts(ritup, state->rel);
+		rheaptid = BTreeTupleGetHeapTIDCareful(state, ritup, nonpivot);
+
+		if (key->keysz == uppnkeyatts)
+			return key->scantid == NULL && rheaptid != NULL;
+
+		return key->keysz < uppnkeyatts;
+	}
+
+	return cmp < 0;
+}
+
 /*
  * Does the invariant hold that the key is less than or equal to a given upper
  * bound offset item?
@@ -1874,42 +2040,84 @@ invariant_leq_offset(BtreeCheckState *state, BTScanInsert key,
 }
 
 /*
- * Does the invariant hold that the key is greater than or equal to a given
- * lower bound offset item?
+ * Does the invariant hold that the key is strictly greater than a given lower
+ * bound offset item?
  *
  * If this function returns false, convention is that caller throws error due
  * to corruption.
  */
 static inline bool
-invariant_geq_offset(BtreeCheckState *state, BTScanInsert key,
-					 OffsetNumber lowerbound)
+invariant_g_offset(BtreeCheckState *state, BTScanInsert key,
+				   OffsetNumber lowerbound)
 {
 	int32		cmp;
 
 	cmp = _bt_compare(state->rel, key, state->target, lowerbound);
 
-	return cmp >= 0;
+	/* pg_upgrade'd indexes may legally have equal sibling tuples */
+	if (!key->heapkeyspace)
+		return cmp >= 0;
+
+	/*
+	 * No need to consider the possibility that scankey has attributes that we
+	 * need to force to be interpreted as negative infinity.  _bt_compare() is
+	 * able to determine that scankey is greater than negative infinity.  The
+	 * distinction between "==" and "<" isn't interesting here, since
+	 * corruption is indicated either way.
+	 */
+	return cmp > 0;
 }
 
 /*
- * Does the invariant hold that the key is less than or equal to a given upper
+ * Does the invariant hold that the key is strictly less than a given upper
  * bound offset item, with the offset relating to a caller-supplied page that
- * is not the current target page? Caller's non-target page is typically a
- * child page of the target, checked as part of checking a property of the
- * target page (i.e. the key comes from the target).
+ * is not the current target page?
+ *
+ * Caller's non-target page is a child page of the target, checked as part of
+ * checking a property of the target page (i.e. the key comes from the
+ * target).
  *
  * If this function returns false, convention is that caller throws error due
  * to corruption.
  */
 static inline bool
-invariant_leq_nontarget_offset(BtreeCheckState *state, BTScanInsert key,
-							   Page nontarget, OffsetNumber upperbound)
+invariant_l_nontarget_offset(BtreeCheckState *state, BTScanInsert key,
+							 Page nontarget, OffsetNumber upperbound)
 {
 	int32		cmp;
 
 	cmp = _bt_compare(state->rel, key, nontarget, upperbound);
 
-	return cmp <= 0;
+	/* pg_upgrade'd indexes may legally have equal sibling tuples */
+	if (!key->heapkeyspace)
+		return cmp <= 0;
+
+	/* See invariant_l_offset() for an explanation of this extra step */
+	if (cmp == 0)
+	{
+		ItemId		itemid;
+		IndexTuple	child;
+		int			uppnkeyatts;
+		ItemPointer childheaptid;
+		BTPageOpaque copaque;
+		bool		nonpivot;
+
+		itemid = PageGetItemId(nontarget, upperbound);
+		child = (IndexTuple) PageGetItem(nontarget, itemid);
+		copaque = (BTPageOpaque) PageGetSpecialPointer(nontarget);
+		nonpivot = P_ISLEAF(copaque) && upperbound >= P_FIRSTDATAKEY(copaque);
+
+		/* Get number of keys + heap TID for child/non-target item */
+		uppnkeyatts = BTreeTupleGetNKeyAtts(child, state->rel);
+		childheaptid = BTreeTupleGetHeapTIDCareful(state, child, nonpivot);
+
+		if (key->keysz == uppnkeyatts)
+			return key->scantid == NULL && childheaptid != NULL;
+
+		return key->keysz < uppnkeyatts;
+	}
+
+	return cmp < 0;
 }
 
 /*
@@ -2065,3 +2273,61 @@ palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum)
 
 	return page;
 }
+
+/*
+ * _bt_mkscankey() wrapper that sets the minusinfkey flag on the returned
+ * insertion scankey, so that attributes truncated away from itup (when itup
+ * is a pivot tuple) are treated as minus infinity values in the scankey.
+ *
+ * In a non-corrupt heapkeyspace index, all pivot tuples on a level have
+ * unique keys, so the !minusinfkey optimization correctly guides any scan
+ * that isn't trying to relocate a leaf page using that leaf page's high key
+ * (i.e. the optimization can safely be used by the vast majority of all
+ * _bt_search() calls).  nbtree verification should always use "minusinfkey"
+ * semantics, though, because the !minusinfkey optimization might mask a
+ * problem in a corrupt index.
+ *
+ * For example, invariant_g_offset() might miss a cross-page invariant failure
+ * on an internal level if the scankey built from the first item on the
+ * target's right sibling page happened to be equal to (not greater than) the
+ * last item on target page.  The !minusinfkey tie-breaker might otherwise
+ * cause amcheck to conclude that the scankey is greater, missing index
+ * corruption.  The same problem would probably be caught some other way, but
+ * the !minusinfkey optimization has no upside for amcheck, so it seems
+ * sensible to always avoid it.
+ */
+static inline BTScanInsert
+bt_mkscankey_minusinfkey(Relation rel, IndexTuple itup)
+{
+	BTScanInsert skey;
+
+	skey = _bt_mkscankey(rel, itup);
+	skey->minusinfkey = true;
+
+	return skey;
+}
+
+/*
+ * BTreeTupleGetHeapTID() wrapper that reports corruption (ERROR) when nonpivot
+ * is true but itup lacks a heap TID; otherwise the result is passed through.
+ *
+ * This doesn't add much as of BTREE_VERSION 4, since the INDEX_ALT_TID_MASK
+ * bit is effectively a proxy for whether or not the tuple is a pivot tuple.
+ * It may become more useful in the future, when non-pivot tuples support their
+ * own alternative INDEX_ALT_TID_MASK representation.
+ */
+static inline ItemPointer
+BTreeTupleGetHeapTIDCareful(BtreeCheckState *state, IndexTuple itup,
+							bool nonpivot)
+{
+	ItemPointer result = BTreeTupleGetHeapTID(itup);
+	BlockNumber targetblock = state->targetblock;
+
+	if (result == NULL && nonpivot)
+		ereport(ERROR,
+				(errcode(ERRCODE_INDEX_CORRUPTED),
+				 errmsg("block %u or its right sibling block or child block in index \"%s\" contains non-pivot tuple that lacks a heap TID",
+						targetblock, RelationGetRelationName(state->rel))));
+
+	return result;
+}
diff --git a/contrib/pageinspect/btreefuncs.c b/contrib/pageinspect/btreefuncs.c
index bfa0c04c2f..8d27c9b0f6 100644
--- a/contrib/pageinspect/btreefuncs.c
+++ b/contrib/pageinspect/btreefuncs.c
@@ -561,7 +561,7 @@ bt_metap(PG_FUNCTION_ARGS)
 	 * Get values of extended metadata if available, use default values
 	 * otherwise.
 	 */
-	if (metad->btm_version == BTREE_VERSION)
+	if (metad->btm_version >= BTREE_NOVAC_VERSION)
 	{
 		values[j++] = psprintf("%u", metad->btm_oldest_btpo_xact);
 		values[j++] = psprintf("%f", metad->btm_last_cleanup_num_heap_tuples);
diff --git a/contrib/pageinspect/expected/btree.out b/contrib/pageinspect/expected/btree.out
index 2aaa4df53b..07c2dcd771 100644
--- a/contrib/pageinspect/expected/btree.out
+++ b/contrib/pageinspect/expected/btree.out
@@ -5,7 +5,7 @@ CREATE INDEX test1_a_idx ON test1 USING btree (a);
 SELECT * FROM bt_metap('test1_a_idx');
 -[ RECORD 1 ]-----------+-------
 magic                   | 340322
-version                 | 3
+version                 | 4
 root                    | 1
 level                   | 0
 fastroot                | 1
diff --git a/contrib/pgstattuple/expected/pgstattuple.out b/contrib/pgstattuple/expected/pgstattuple.out
index 9858ea69d4..9920dbfd40 100644
--- a/contrib/pgstattuple/expected/pgstattuple.out
+++ b/contrib/pgstattuple/expected/pgstattuple.out
@@ -48,7 +48,7 @@ select version, tree_level,
     from pgstatindex('test_pkey');
  version | tree_level | index_size | root_block_no | internal_pages | leaf_pages | empty_pages | deleted_pages | avg_leaf_density | leaf_fragmentation 
 ---------+------------+------------+---------------+----------------+------------+-------------+---------------+------------------+--------------------
-       3 |          0 |          1 |             0 |              0 |          0 |           0 |             0 |              NaN |                NaN
+       4 |          0 |          1 |             0 |              0 |          0 |           0 |             0 |              NaN |                NaN
 (1 row)
 
 select version, tree_level,
@@ -58,7 +58,7 @@ select version, tree_level,
     from pgstatindex('test_pkey'::text);
  version | tree_level | index_size | root_block_no | internal_pages | leaf_pages | empty_pages | deleted_pages | avg_leaf_density | leaf_fragmentation 
 ---------+------------+------------+---------------+----------------+------------+-------------+---------------+------------------+--------------------
-       3 |          0 |          1 |             0 |              0 |          0 |           0 |             0 |              NaN |                NaN
+       4 |          0 |          1 |             0 |              0 |          0 |           0 |             0 |              NaN |                NaN
 (1 row)
 
 select version, tree_level,
@@ -68,7 +68,7 @@ select version, tree_level,
     from pgstatindex('test_pkey'::name);
  version | tree_level | index_size | root_block_no | internal_pages | leaf_pages | empty_pages | deleted_pages | avg_leaf_density | leaf_fragmentation 
 ---------+------------+------------+---------------+----------------+------------+-------------+---------------+------------------+--------------------
-       3 |          0 |          1 |             0 |              0 |          0 |           0 |             0 |              NaN |                NaN
+       4 |          0 |          1 |             0 |              0 |          0 |           0 |             0 |              NaN |                NaN
 (1 row)
 
 select version, tree_level,
@@ -78,7 +78,7 @@ select version, tree_level,
     from pgstatindex('test_pkey'::regclass);
  version | tree_level | index_size | root_block_no | internal_pages | leaf_pages | empty_pages | deleted_pages | avg_leaf_density | leaf_fragmentation 
 ---------+------------+------------+---------------+----------------+------------+-------------+---------------+------------------+--------------------
-       3 |          0 |          1 |             0 |              0 |          0 |           0 |             0 |              NaN |                NaN
+       4 |          0 |          1 |             0 |              0 |          0 |           0 |             0 |              NaN |                NaN
 (1 row)
 
 select pg_relpages('test');
@@ -232,7 +232,7 @@ create index test_partition_hash_idx on test_partition using hash (a);
 select pgstatindex('test_partition_idx');
          pgstatindex          
 ------------------------------
- (3,0,8192,0,0,0,0,0,NaN,NaN)
+ (4,0,8192,0,0,0,0,0,NaN,NaN)
 (1 row)
 
 select pgstathashindex('test_partition_hash_idx');
diff --git a/doc/src/sgml/indices.sgml b/doc/src/sgml/indices.sgml
index 46f427b312..21c978503a 100644
--- a/doc/src/sgml/indices.sgml
+++ b/doc/src/sgml/indices.sgml
@@ -504,8 +504,9 @@ CREATE INDEX test2_mm_idx ON test2 (major, minor);
 
   <para>
    By default, B-tree indexes store their entries in ascending order
-   with nulls last.  This means that a forward scan of an index on
-   column <literal>x</literal> produces output satisfying <literal>ORDER BY x</literal>
+   with nulls last (table TID is treated as a tiebreaker column among
+   otherwise equal entries).  This means that a forward scan of an
+   index on column <literal>x</literal> produces output satisfying <literal>ORDER BY x</literal>
    (or more verbosely, <literal>ORDER BY x ASC NULLS LAST</literal>).  The
    index can also be scanned backward, producing output satisfying
    <literal>ORDER BY x DESC</literal>
@@ -1162,10 +1163,21 @@ CREATE INDEX tab_x_y ON tab(x, y);
    the extra columns are trailing columns; making them be leading columns is
    unwise for the reasons explained in <xref linkend="indexes-multicolumn"/>.
    However, this method doesn't support the case where you want the index to
-   enforce uniqueness on the key column(s).  Also, explicitly marking
-   non-searchable columns as <literal>INCLUDE</literal> columns makes the
-   index slightly smaller, because such columns need not be stored in upper
-   B-tree levels.
+   enforce uniqueness on the key column(s).
+  </para>
+
+  <para>
+   <firstterm>Suffix truncation</firstterm> always removes non-key
+   columns from upper B-tree levels.  As payload columns, they are
+   never used to guide index scans.  The truncation process also
+   removes one or more trailing key column(s) when the remaining
+   prefix of key column(s) happens to be sufficient to describe tuples
+   on the lowest B-tree level.  In practice, covering indexes without
+   an <literal>INCLUDE</literal> clause often avoid storing columns
+   that are effectively payload in the upper levels.  However,
+   explicitly defining payload columns as non-key columns
+   <emphasis>reliably</emphasis> keeps the tuples in upper levels
+   small.
   </para>
 
   <para>
diff --git a/src/backend/access/common/indextuple.c b/src/backend/access/common/indextuple.c
index 32c0ebb93a..cb23be859d 100644
--- a/src/backend/access/common/indextuple.c
+++ b/src/backend/access/common/indextuple.c
@@ -536,7 +536,11 @@ index_truncate_tuple(TupleDesc sourceDescriptor, IndexTuple source,
 	bool		isnull[INDEX_MAX_KEYS];
 	IndexTuple	truncated;
 
-	Assert(leavenatts < sourceDescriptor->natts);
+	Assert(leavenatts <= sourceDescriptor->natts);
+
+	/* Easy case: no truncation actually required */
+	if (leavenatts == sourceDescriptor->natts)
+		return CopyIndexTuple(source);
 
 	/* Create temporary descriptor to scribble on */
 	truncdesc = palloc(TupleDescSize(sourceDescriptor));
diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README
index 3680e69b89..9c0b4718b6 100644
--- a/src/backend/access/nbtree/README
+++ b/src/backend/access/nbtree/README
@@ -28,37 +28,50 @@ right-link to find the new page containing the key range you're looking
 for.  This might need to be repeated, if the page has been split more than
 once.
 
+Lehman and Yao talk about pairs of "separator" keys and downlinks in
+internal pages rather than tuples or records.  We use the term "pivot"
+tuple to refer to tuples which don't point to heap tuples, and are used
+only for tree navigation.  All tuples on non-leaf pages and high keys on
+leaf pages are pivot tuples.  Since pivot tuples are only used to represent
+which part of the key space belongs on each page, they can have attribute
+values copied from non-pivot tuples that were deleted and killed by VACUUM
+some time ago.  A pivot tuple may contain a "separator" key and downlink,
+just a separator key (i.e. the downlink value is implicitly undefined), or
+just a downlink (i.e. all attributes are truncated away).  We aren't always
+clear on which case applies, but it should be obvious from context.
+
+The requirement that all btree keys be unique is satisfied by treating heap
+TID as a tiebreaker attribute.  Logical duplicates are sorted in heap TID
+order.  This is necessary because Lehman and Yao also require that the key
+range for a subtree S is described by Ki < v <= Ki+1 where Ki and Ki+1 are
+the adjacent keys in the parent page (Ki must be _strictly_ less than v,
+which can be assured by having reliably unique keys).
+
+A search where the key is equal to a pivot tuple in an upper tree level
+must descend to the left of that pivot to ensure it finds any equal keys.
+The equal item(s) being searched for must therefore be to the left of that
+downlink page on the next level down.  A handy property of this design is
+that a scan where all attributes/keys are used behaves just the same as a
+scan where only some prefix of attributes are used; equality never needs to
+be treated as a special case.
+
+In practice, exact equality with pivot tuples on internal pages is
+extremely rare when all attributes (including even the heap TID attribute)
+are used in a search.  This is due to suffix truncation: truncated
+attributes are treated as having the value negative infinity, and
+truncation almost always manages to at least truncate away the trailing
+heap TID attribute.  While Lehman and Yao don't have anything to say about
+suffix truncation, the design used by nbtree is perfectly complementary.
+The later section on suffix truncation will be helpful if it's unclear how
+the Lehman & Yao invariants work with a real world example involving
+suffix truncation.
+
 Differences to the Lehman & Yao algorithm
 -----------------------------------------
 
 We have made the following changes in order to incorporate the L&Y algorithm
 into Postgres:
 
-The requirement that all btree keys be unique is too onerous,
-but the algorithm won't work correctly without it.  Fortunately, it is
-only necessary that keys be unique on a single tree level, because L&Y
-only use the assumption of key uniqueness when re-finding a key in a
-parent page (to determine where to insert the key for a split page).
-Therefore, we can use the link field to disambiguate multiple
-occurrences of the same user key: only one entry in the parent level
-will be pointing at the page we had split.  (Indeed we need not look at
-the real "key" at all, just at the link field.)  We can distinguish
-items at the leaf level in the same way, by examining their links to
-heap tuples; we'd never have two items for the same heap tuple.
-
-Lehman and Yao assume that the key range for a subtree S is described
-by Ki < v <= Ki+1 where Ki and Ki+1 are the adjacent keys in the parent
-page.  This does not work for nonunique keys (for example, if we have
-enough equal keys to spread across several leaf pages, there *must* be
-some equal bounding keys in the first level up).  Therefore we assume
-Ki <= v <= Ki+1 instead.  A search that finds exact equality to a
-bounding key in an upper tree level must descend to the left of that
-key to ensure it finds any equal keys in the preceding page.  An
-insertion that sees the high key of its target page is equal to the key
-to be inserted has a choice whether or not to move right, since the new
-key could go on either page.  (Currently, we try to find a page where
-there is room for the new key without a split.)
-
 Lehman and Yao don't require read locks, but assume that in-memory
 copies of tree pages are unshared.  Postgres shares in-memory buffers
 among backends.  As a result, we do page-level read locking on btree
@@ -194,9 +207,7 @@ be prepared for the possibility that the item it wants is to the left of
 the recorded position (but it can't have moved left out of the recorded
 page).  Since we hold a lock on the lower page (per L&Y) until we have
 re-found the parent item that links to it, we can be assured that the
-parent item does still exist and can't have been deleted.  Also, because
-we are matching downlink page numbers and not data keys, we don't have any
-problem with possibly misidentifying the parent item.
+parent item does still exist and can't have been deleted.
 
 Page Deletion
 -------------
@@ -595,36 +606,56 @@ scankey point to comparison functions that return boolean, such as int4lt.
 There might be more than one scankey entry for a given index column, or
 none at all.  (We require the keys to appear in index column order, but
 the order of multiple keys for a given column is unspecified.)  An
-insertion scankey uses the same array-of-ScanKey data structure, but the
+insertion scankey uses a similar array-of-ScanKey data structure, but the
 sk_func pointers point to btree comparison support functions (ie, 3-way
 comparators that return int4 values interpreted as <0, =0, >0).  In an
-insertion scankey there is exactly one entry per index column.  Insertion
-scankeys are built within the btree code (eg, by _bt_mkscankey()) and are
-used to locate the starting point of a scan, as well as for locating the
-place to insert a new index tuple.  (Note: in the case of an insertion
-scankey built from a search scankey, there might be fewer keys than
-index columns, indicating that we have no constraints for the remaining
-index columns.)  After we have located the starting point of a scan, the
-original search scankey is consulted as each index entry is sequentially
-scanned to decide whether to return the entry and whether the scan can
-stop (see _bt_checkkeys()).
+insertion scankey there is at most one entry per index column.  There is
+also other data about the rules used to locate where to begin the scan,
+such as whether or not the scan is a "nextkey" scan.  Insertion scankeys
+are built within the btree code (eg, by _bt_mkscankey()) and are used to
+locate the starting point of a scan, as well as for locating the place to
+insert a new index tuple.  (Note: in the case of an insertion scankey built
+from a search scankey or built from a truncated pivot tuple, there might be
+fewer keys than index columns, indicating that we have no constraints for
+the remaining index columns.)  After we have located the starting point of a
+scan, the original search scankey is consulted as each index entry is
+sequentially scanned to decide whether to return the entry and whether the
+scan can stop (see _bt_checkkeys()).
 
-We use term "pivot" index tuples to distinguish tuples which don't point
-to heap tuples, but rather used for tree navigation.  Pivot tuples includes
-all tuples on non-leaf pages and high keys on leaf pages.  Note that pivot
-index tuples are only used to represent which part of the key space belongs
-on each page, and can have attribute values copied from non-pivot tuples
-that were deleted and killed by VACUUM some time ago.  In principle, we could
-truncate away attributes that are not needed for a page high key during a leaf
-page split, provided that the remaining attributes distinguish the last index
-tuple on the post-split left page as belonging on the left page, and the first
-index tuple on the post-split right page as belonging on the right page.  This
-optimization is sometimes called suffix truncation, and may appear in a future
-release. Since the high key is subsequently reused as the downlink in the
-parent page for the new right page, suffix truncation can increase index
-fan-out considerably by keeping pivot tuples short.  INCLUDE indexes similarly
-truncate away non-key attributes at the time of a leaf page split,
-increasing fan-out.
+Notes about suffix truncation
+-----------------------------
+
+We truncate away suffix key attributes that are not needed for a page high
+key during a leaf page split.  The remaining attributes must distinguish
+the last index tuple on the post-split left page as belonging on the left
+page, and the first index tuple on the post-split right page as belonging
+on the right page.  Tuples logically retain truncated key attributes,
+though they implicitly have "negative infinity" as their value, and have no
+storage overhead.  Since the high key is subsequently reused as the
+downlink in the parent page for the new right page, suffix truncation makes
+pivot tuples short.  INCLUDE indexes are guaranteed to have non-key
+attributes truncated at the time of a leaf page split, but may also have
+some key attributes truncated away, based on the usual criteria for key
+attributes.  They are not a special case, since non-key attributes are
+merely payload to B-Tree searches.
+
+The goal of suffix truncation of key attributes is to improve index
+fan-out.  The technique was first described by Bayer and Unterauer (R.Bayer
+and K.Unterauer, Prefix B-Trees, ACM Transactions on Database Systems, Vol
+2, No. 1, March 1977, pp 11-26).  The Postgres implementation is loosely
+based on their paper.  Note that Postgres only implements what the paper
+refers to as simple prefix B-Trees.  Note also that the paper assumes that
+the tree has keys that consist of single strings that maintain the "prefix
+property", much like strings that are stored in a suffix tree (comparisons
+of earlier bytes must always be more significant than comparisons of later
+bytes, and, in general, the strings must compare in a way that doesn't
+break transitive consistency as they're split into pieces).  Suffix
+truncation in Postgres currently only works at the whole-attribute
+granularity, but it would be straightforward to invent opclass
+infrastructure that manufactures a smaller attribute value in the case of
+variable-length types, such as text.  An opclass support function could
+manufacture the shortest possible key value that still correctly separates
+each half of a leaf page split.
 
 Notes About Data Representation
 -------------------------------
@@ -637,20 +668,26 @@ don't need to renumber any existing pages when splitting the root.)
 
 The Postgres disk block data format (an array of items) doesn't fit
 Lehman and Yao's alternating-keys-and-pointers notion of a disk page,
-so we have to play some games.
+so we have to play some games.  (Presumably things are explained this
+way because of internal page splits, which conceptually split at the
+middle of an existing pivot tuple -- the tuple's "separator" key goes on
+the left side of the split as the left side's new high key, while the
+tuple's pointer/downlink goes on the right side as the first/minus
+infinity downlink.)
 
 On a page that is not rightmost in its tree level, the "high key" is
 kept in the page's first item, and real data items start at item 2.
 The link portion of the "high key" item goes unused.  A page that is
-rightmost has no "high key", so data items start with the first item.
-Putting the high key at the left, rather than the right, may seem odd,
-but it avoids moving the high key as we add data items.
+rightmost has no "high key" (it's implicitly positive infinity), so
+data items start with the first item.  Putting the high key at the
+left, rather than the right, may seem odd, but it avoids moving the
+high key as we add data items.
 
 On a leaf page, the data items are simply links to (TIDs of) tuples
 in the relation being indexed, with the associated key values.
 
 On a non-leaf page, the data items are down-links to child pages with
-bounding keys.  The key in each data item is the *lower* bound for
+bounding keys.  The key in each data item is a strict lower bound for
 keys on that child page, so logically the key is to the left of that
 downlink.  The high key (if present) is the upper bound for the last
 downlink.  The first data item on each such page has no lower bound
@@ -658,4 +695,5 @@ downlink.  The first data item on each such page has no lower bound
 routines must treat it accordingly.  The actual key stored in the
 item is irrelevant, and need not be stored at all.  This arrangement
 corresponds to the fact that an L&Y non-leaf page has one more pointer
-than key.
+than key.  Suffix truncation's negative infinity attributes behave in
+the same way.
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index f143ea8be2..818683ac2e 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -64,14 +64,16 @@ static OffsetNumber _bt_findinsertloc(Relation rel,
 				  Relation heapRel);
 static bool _bt_useduplicatepage(Relation rel, Relation heapRel, Buffer buf,
 					 bool *restorebinsrch, Size itemsz);
-static void _bt_insertonpg(Relation rel, Buffer buf, Buffer cbuf,
+static void _bt_insertonpg(Relation rel, BTScanInsert itup_key,
+			   Buffer buf,
+			   Buffer cbuf,
 			   BTStack stack,
 			   IndexTuple itup,
 			   OffsetNumber newitemoff,
 			   bool split_only_page);
-static Buffer _bt_split(Relation rel, Buffer buf, Buffer cbuf,
-		  OffsetNumber firstright, OffsetNumber newitemoff, Size newitemsz,
-		  IndexTuple newitem, bool newitemonleft);
+static Buffer _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf,
+		  Buffer cbuf, OffsetNumber firstright, OffsetNumber newitemoff,
+		  Size newitemsz, IndexTuple newitem, bool newitemonleft);
 static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf,
 				  BTStack stack, bool is_root, bool is_only);
 static OffsetNumber _bt_findsplitloc(Relation rel, Page page,
@@ -120,6 +122,9 @@ _bt_doinsert(Relation rel, IndexTuple itup,
 
 	/* we need an insertion scan key to do our search, so build one */
 	itup_key = _bt_mkscankey(rel, itup);
+	/* No scantid until uniqueness established in checkingunique case */
+	if (checkingunique && itup_key->heapkeyspace)
+		itup_key->scantid = NULL;
 
 	/*
 	 * It's very common to have an index on an auto-incremented or
@@ -225,12 +230,13 @@ top:
 	 * NOTE: obviously, _bt_check_unique can only detect keys that are already
 	 * in the index; so it cannot defend against concurrent insertions of the
 	 * same key.  We protect against that by means of holding a write lock on
-	 * the target page.  Any other would-be inserter of the same key must
-	 * acquire a write lock on the same target page, so only one would-be
-	 * inserter can be making the check at one time.  Furthermore, once we are
-	 * past the check we hold write locks continuously until we have performed
-	 * our insertion, so no later inserter can fail to see our insertion.
-	 * (This requires some care in _bt_findinsertloc.)
+	 * the first page the value could be on, regardless of the value of its
+	 * implicit heap TID tie-breaker attribute.  Any other would-be inserter
+	 * of the same key must acquire a write lock on the same page, so only one
+	 * would-be inserter can be making the check at one time.  Furthermore,
+	 * once we are past the check we hold write locks continuously until we
+	 * have performed our insertion, so no later inserter can fail to see our
+	 * insertion.  (This requires some care in _bt_findinsertloc.)
 	 *
 	 * If we must wait for another xact, we release the lock while waiting,
 	 * and then must start over completely.
@@ -267,6 +273,10 @@ top:
 				_bt_freestack(stack);
 			goto top;
 		}
+
+		/* Uniqueness is established -- restore heap TID as scantid */
+		if (itup_key->heapkeyspace)
+			itup_key->scantid = &itup->t_tid;
 	}
 
 	if (checkUnique != UNIQUE_CHECK_EXISTING)
@@ -275,12 +285,12 @@ top:
 
 		/*
 		 * The only conflict predicate locking cares about for indexes is when
-		 * an index tuple insert conflicts with an existing lock.  Since the
-		 * actual location of the insert is hard to predict because of the
-		 * random search used to prevent O(N^2) performance when there are
-		 * many duplicate entries, we can just use the "first valid" page.
-		 * This reasoning also applies to INCLUDE indexes, whose extra
-		 * attributes are not considered part of the key space.
+		 * an index tuple insert conflicts with an existing lock.  The actual
+		 * location of the insert is unsettled in the checkingunique case
+		 * because scantid was not filled in initially, but it's okay to use
+		 * the "first valid" page instead.  This reasoning also applies to
+		 * INCLUDE indexes, whose extra attributes are not considered part of
+		 * the key space.
 		 */
 		CheckForSerializableConflictIn(rel, NULL, buf);
 
@@ -291,8 +301,8 @@ top:
 		 */
 		newitemoff = _bt_findinsertloc(rel, itup_key, &buf, checkingunique,
 									   itup, stack, heapRel);
-		_bt_insertonpg(rel, buf, InvalidBuffer, stack, itup, newitemoff,
-					   false);
+		_bt_insertonpg(rel, itup_key, buf, InvalidBuffer, stack, itup,
+					   newitemoff, false);
 	}
 	else
 	{
@@ -313,7 +323,8 @@ top:
  *
  * Sets state in itup_key sufficient for later _bt_findinsertloc() call to
  * reuse most of the work of our initial binary search to find conflicting
- * tuples.
+ * tuples.  This won't be usable if caller's tuple is determined to not belong
+ * on buf following scantid being filled-in.
  *
  * Returns InvalidTransactionId if there is no conflict, else an xact ID
  * we must wait for to see if it commits a conflicting tuple.   If an actual
@@ -362,6 +373,7 @@ _bt_check_unique(Relation rel, BTScanInsert itup_key,
 	 * Scan over all equal tuples, looking for live conflicts.
 	 */
 	Assert(itup_key->low == offset);
+	Assert(itup_key->scantid == NULL);
 	for (;;)
 	{
 		ItemId		curitemid;
@@ -399,16 +411,14 @@ _bt_check_unique(Relation rel, BTScanInsert itup_key,
 			/*
 			 * We can skip items that are marked killed.
 			 *
-			 * Formerly, we applied _bt_isequal() before checking the kill
-			 * flag, so as to fall out of the item loop as soon as possible.
-			 * However, in the presence of heavy update activity an index may
-			 * contain many killed items with the same key; running
-			 * _bt_isequal() on each killed item gets expensive. Furthermore
-			 * it is likely that the non-killed version of each key appears
-			 * first, so that we didn't actually get to exit any sooner
-			 * anyway. So now we just advance over killed items as quickly as
-			 * we can. We only apply _bt_isequal() when we get to a non-killed
-			 * item.
+			 * In the presence of heavy update activity an index may contain
+			 * many killed items with the same key; running _bt_isequal() on
+			 * each killed item gets expensive. Just advance over killed items
+			 * as quickly as we can. We only apply _bt_isequal() when we get
+			 * to a non-killed item. Even those comparisons could be avoided
+			 * (in the common case where there is only one page to visit) by
+			 * reusing bounds, but just skipping dead items is sufficiently
+			 * effective.
 			 */
 			if (!ItemIdIsDead(curitemid))
 			{
@@ -633,16 +643,16 @@ _bt_check_unique(Relation rel, BTScanInsert itup_key,
 /*
  *	_bt_findinsertloc() -- Finds an insert location for a tuple
  *
- *		If the new key is equal to one or more existing keys, we can
- *		legitimately place it anywhere in the series of equal keys --- in fact,
- *		if the new key is equal to the page's "high key" we can place it on
- *		the next page.  If it is equal to the high key, and there's not room
- *		to insert the new tuple on the current page without splitting, then
- *		we can move right hoping to find more free space and avoid a split.
- *		(We should not move right indefinitely, however, since that leads to
- *		O(N^2) insertion behavior in the presence of many equal keys.)
- *		Once we have chosen the page to put the key on, we'll insert it before
- *		any existing equal keys because of the way _bt_binsrch() works.
+ *		On entry, *bufptr contains the page that the new tuple belongs on.
+ *		Occasionally, this won't be exactly right for callers that just
+ *		called _bt_check_unique(), and did initial search without using a
+ *		scantid.  They'll have to insert into a page somewhere to the right
+ *		in rare cases where there are many physical duplicates in a unique
+ *		index, and their scantid directs us to some page full of duplicates
+ *		to the right, where the new tuple must go.  (Actually, since
+ *		!heapkeyspace pg_upgrade'd non-unique indexes never get a scantid,
+ *		they too may require that we move right.  We treat them somewhat like
+ *		unique indexes.)
  *
  *		_bt_check_unique() callers arrange for their insertion scan key to
  *		save the progress of the last binary search performed.  No additional
@@ -685,28 +695,26 @@ _bt_findinsertloc(Relation rel,
 	itemsz = MAXALIGN(itemsz);	/* be safe, PageAddItem will do this but we
 								 * need to be consistent */
 
-	/*
-	 * Check whether the item can fit on a btree page at all. (Eventually, we
-	 * ought to try to apply TOAST methods if not.) We actually need to be
-	 * able to fit three items on every page, so restrict any one item to 1/3
-	 * the per-page available space. Note that at this point, itemsz doesn't
-	 * include the ItemId.
-	 *
-	 * NOTE: if you change this, see also the similar code in _bt_buildadd().
-	 */
-	if (itemsz > BTMaxItemSize(page))
-		ereport(ERROR,
-				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
-				 errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
-						itemsz, BTMaxItemSize(page),
-						RelationGetRelationName(rel)),
-				 errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
-						 "Consider a function index of an MD5 hash of the value, "
-						 "or use full text indexing."),
-				 errtableconstraint(heapRel,
-									RelationGetRelationName(rel))));
+	/* Check 1/3 of a page restriction */
+	if (unlikely(itemsz > BTMaxItemSize(page)))
+		_bt_check_third_page(rel, heapRel, itup_key->heapkeyspace, page,
+							 newtup);
 
+	/*
+	 * We may have to walk right through leaf pages to find the one leaf page
+	 * that we must insert on to, though only when inserting into unique
+	 * indexes.  This is necessary because a scantid is not used by the
+	 * insertion scan key initially in the case of unique indexes -- a scantid
+	 * is only set after the absence of duplicates (whose heap tuples are not
+	 * dead or recently dead) has been established by _bt_check_unique().
+	 * Non-unique index insertions will break out of the loop immediately.
+	 *
+	 * (Actually, non-unique indexes may still need to grovel through leaf
+	 * pages full of duplicates with a pg_upgrade'd !heapkeyspace index.)
+	 */
 	Assert(P_ISLEAF(lpageop) && !P_INCOMPLETE_SPLIT(lpageop));
+	Assert(!itup_key->heapkeyspace || itup_key->scantid != NULL);
+	Assert(itup_key->heapkeyspace || itup_key->scantid == NULL);
 	for (;;)
 	{
 		int			cmpval;
@@ -714,6 +722,13 @@ _bt_findinsertloc(Relation rel,
 		BlockNumber rblkno;
 
 		/*
+		 * Fastpaths that avoid extra high key check.
+		 *
+		 * No need to check high key when inserting into a non-unique index;
+		 * _bt_search() already checked this when it checked if a move to the
+		 * right was required for leaf page.  Insertion scankey's scantid
+		 * would have been filled out at the time.
+		 *
 		 * The checkingunique (restorebinsrch) case may well have established
 		 * bounds within _bt_check_unique()'s binary search that preclude the
 		 * need for a further high key check.  This fastpath isn't used when
@@ -721,22 +736,33 @@ _bt_findinsertloc(Relation rel,
 		 * when it looks like the new item belongs last on the page, but it
 		 * might go on a later page instead.
 		 */
-		if (restorebinsrch && itup_key->low <= itup_key->stricthigh &&
-			itup_key->stricthigh <= PageGetMaxOffsetNumber(page))
+		if (!checkingunique && itup_key->heapkeyspace)
+			break;
+		else if (restorebinsrch && itup_key->low <= itup_key->stricthigh &&
+				 itup_key->stricthigh <= PageGetMaxOffsetNumber(page))
 			break;
 
 		if (P_RIGHTMOST(lpageop))
 			break;
 		cmpval = _bt_compare(rel, itup_key, page, P_HIKEY);
-
-		/*
-		 * May have to handle case where there is a choice of which page to
-		 * place new tuple on, and we must balance space utilization as best
-		 * we can.
-		 */
-		if (cmpval != 0 || _bt_useduplicatepage(rel, heapRel, buf,
-												&restorebinsrch, itemsz))
-			break;
+		if (itup_key->heapkeyspace)
+		{
+			if (cmpval <= 0)
+				break;
+		}
+		else
+		{
+			/*
+			 * pg_upgrade'd !heapkeyspace index.
+			 *
+			 * May have to handle legacy case where there is a choice of which
+			 * page to place new tuple on, and we must balance space
+			 * utilization as best we can.
+			 */
+			if (cmpval != 0 || _bt_useduplicatepage(rel, heapRel, buf,
+													&restorebinsrch, itemsz))
+				break;
+		}
 
 		/*
 		 * step right to next non-dead page
@@ -745,6 +771,8 @@ _bt_findinsertloc(Relation rel,
 		 * page; else someone else's _bt_check_unique scan could fail to see
 		 * our insertion.  write locks on intermediate dead pages won't do
 		 * because we don't know when they will get de-linked from the tree.
+		 * (This is more aggressive than it needs to be for non-unique
+		 * !heapkeyspace indexes.)
 		 */
 		rbuf = InvalidBuffer;
 
@@ -814,9 +842,17 @@ _bt_findinsertloc(Relation rel,
 /*
  *	_bt_useduplicatepage() -- Settle for this page of duplicates?
  *
+ *		Prior to PostgreSQL 12/Btree version 4, heap TID was never treated
+ *		as a part of the keyspace.  If there were many tuples of the same
+ *		value spanning more than one leaf page, a new tuple of that same
+ *		value could legally be placed on any one of the pages.
+ *
  *		This function handles the question of whether or not an insertion
- *		of a duplicate into a pg_upgrade'd !heapkeyspace index should
- *		insert on the page contained in buf when a choice must be made.
+ *		of a duplicate into a pg_upgrade'd !heapkeyspace index should insert
+ *		on the page contained in buf when a choice must be made.  It is only
+ *		used with pg_upgrade'd version 2 and version 3 indexes (!heapkeyspace
+ *		indexes).
+ *
  *		Preemptive microvacuuming is performed here when that could allow
  *		caller to insert on to the page in buf.
  *
@@ -904,6 +940,7 @@ _bt_useduplicatepage(Relation rel, Relation heapRel, Buffer buf,
  */
 static void
 _bt_insertonpg(Relation rel,
+			   BTScanInsert itup_key,
 			   Buffer buf,
 			   Buffer cbuf,
 			   BTStack stack,
@@ -926,7 +963,7 @@ _bt_insertonpg(Relation rel,
 		   BTreeTupleGetNAtts(itup, rel) ==
 		   IndexRelationGetNumberOfAttributes(rel));
 	Assert(P_ISLEAF(lpageop) ||
-		   BTreeTupleGetNAtts(itup, rel) ==
+		   BTreeTupleGetNAtts(itup, rel) <=
 		   IndexRelationGetNumberOfKeyAttributes(rel));
 
 	/* The caller should've finished any incomplete splits already. */
@@ -976,8 +1013,8 @@ _bt_insertonpg(Relation rel,
 									  &newitemonleft);
 
 		/* split the buffer into left and right halves */
-		rbuf = _bt_split(rel, buf, cbuf, firstright,
-						 newitemoff, itemsz, itup, newitemonleft);
+		rbuf = _bt_split(rel, itup_key, buf, cbuf, firstright, newitemoff,
+						 itemsz, itup, newitemonleft);
 		PredicateLockPageSplit(rel,
 							   BufferGetBlockNumber(buf),
 							   BufferGetBlockNumber(rbuf));
@@ -1059,7 +1096,7 @@ _bt_insertonpg(Relation rel,
 		if (BufferIsValid(metabuf))
 		{
 			/* upgrade meta-page if needed */
-			if (metad->btm_version < BTREE_VERSION)
+			if (metad->btm_version < BTREE_NOVAC_VERSION)
 				_bt_upgrademetapage(metapg);
 			metad->btm_fastroot = itup_blkno;
 			metad->btm_fastlevel = lpageop->btpo.level;
@@ -1114,6 +1151,8 @@ _bt_insertonpg(Relation rel,
 
 			if (BufferIsValid(metabuf))
 			{
+				Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
+				xlmeta.version = metad->btm_version;
 				xlmeta.root = metad->btm_root;
 				xlmeta.level = metad->btm_level;
 				xlmeta.fastroot = metad->btm_fastroot;
@@ -1181,17 +1220,19 @@ _bt_insertonpg(Relation rel,
  *		new right page.  newitemoff etc. tell us about the new item that
  *		must be inserted along with the data from the old page.
  *
- *		When splitting a non-leaf page, 'cbuf' is the left-sibling of the
- *		page we're inserting the downlink for.  This function will clear the
- *		INCOMPLETE_SPLIT flag on it, and release the buffer.
+ *		itup_key is used for suffix truncation on leaf pages (internal
+ *		page callers pass NULL).  When splitting a non-leaf page, 'cbuf'
+ *		is the left-sibling of the page we're inserting the downlink for.
+ *		This function will clear the INCOMPLETE_SPLIT flag on it, and
+ *		release the buffer.
  *
  *		Returns the new right sibling of buf, pinned and write-locked.
  *		The pin and lock on buf are maintained.
  */
 static Buffer
-_bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright,
-		  OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem,
-		  bool newitemonleft)
+_bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
+		  OffsetNumber firstright, OffsetNumber newitemoff, Size newitemsz,
+		  IndexTuple newitem, bool newitemonleft)
 {
 	Buffer		rbuf;
 	Page		origpage;
@@ -1286,7 +1327,8 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright,
 		itemid = PageGetItemId(origpage, P_HIKEY);
 		itemsz = ItemIdGetLength(itemid);
 		item = (IndexTuple) PageGetItem(origpage, itemid);
-		Assert(BTreeTupleGetNAtts(item, rel) == indnkeyatts);
+		Assert(BTreeTupleGetNAtts(item, rel) > 0);
+		Assert(BTreeTupleGetNAtts(item, rel) <= indnkeyatts);
 		if (PageAddItem(rightpage, (Item) item, itemsz, rightoff,
 						false, false) == InvalidOffsetNumber)
 		{
@@ -1299,9 +1341,30 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright,
 	}
 
 	/*
-	 * The "high key" for the new left page will be the first key that's going
-	 * to go into the new right page.  This might be either the existing data
+	 * The "high key" for the new left page will be the first key that's
+	 * going to go into the new right page, or possibly a truncated version
+	 * if this is a leaf page split.  This might be either the existing data
 	 * item at position firstright, or the incoming tuple.
+	 *
+	 * The high key for the left page is formed using the first item on the
+	 * right page, which may seem to be contrary to Lehman & Yao's approach
+	 * of using the left page's last item as its new high key when splitting
+	 * on the leaf level.  It isn't, though: suffix truncation will leave
+	 * the left page's high key fully equal to the last item on the left
+	 * page when two tuples with equal key values (excluding heap TID)
+	 * enclose the split point.  It isn't actually necessary for a new leaf
+	 * high key to be equal to the last item on the left for the L&Y
+	 * "subtree" invariant to hold.  It's sufficient to make sure that the
+	 * new leaf high key is strictly less than the first item on the right
+	 * leaf page, and greater than or equal to (i.e. not necessarily equal
+	 * to) the last item on the left leaf page.
+	 *
+	 * In other words, when suffix truncation isn't possible, L&Y's exact
+	 * approach to leaf splits is taken.  (Actually, even that is slightly
+	 * inaccurate.  A tuple with all the keys from firstright but the heap
+	 * TID from lastleft will be used as the new high key, since the last
+	 * left tuple could be physically larger despite being opclass-equal in
+	 * respect of all attributes prior to the heap TID attribute.)
 	 */
 	leftoff = P_HIKEY;
 	if (!newitemonleft && newitemoff == firstright)
@@ -1319,25 +1382,58 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright,
 	}
 
 	/*
-	 * Truncate non-key (INCLUDE) attributes of the high key item before
-	 * inserting it on the left page.  This only needs to happen at the leaf
+	 * Truncate unneeded key and non-key attributes of the high key item
+	 * before inserting it on the left page.  This can only happen at the leaf
 	 * level, since in general all pivot tuple values originate from leaf
 	 * level high keys.  This isn't just about avoiding unnecessary work,
-	 * though; truncating unneeded key attributes (more aggressive suffix
-	 * truncation) can only be performed at the leaf level anyway.  This is
-	 * because a pivot tuple in a grandparent page must guide a search not
-	 * only to the correct parent page, but also to the correct leaf page.
+	 * though; truncating unneeded key suffix attributes can only be
+	 * performed at the leaf level anyway.  This is because a pivot tuple in
+	 * a grandparent page must guide a search not only to the correct parent
+	 * page, but also to the correct leaf page.
 	 */
-	if (indnatts != indnkeyatts && isleaf)
+	if (isleaf && (itup_key->heapkeyspace || indnatts != indnkeyatts))
 	{
-		lefthikey = _bt_nonkey_truncate(rel, item);
+		IndexTuple	lastleft;
+
+		/*
+		 * Determine which tuple will become the last on the left page.  The
+		 * last left tuple and the first right tuple enclose the split point,
+		 * and are needed to determine how far truncation can go while still
+		 * leaving us with a high key that distinguishes the left side from
+		 * the right side.
+		 */
+		if (newitemonleft && newitemoff == firstright)
+		{
+			/* incoming tuple will become last on left page */
+			lastleft = newitem;
+		}
+		else
+		{
+			OffsetNumber lastleftoff;
+
+			/* item just before firstright will become last on left page */
+			lastleftoff = OffsetNumberPrev(firstright);
+			Assert(lastleftoff >= P_FIRSTDATAKEY(oopaque));
+			itemid = PageGetItemId(origpage, lastleftoff);
+			lastleft = (IndexTuple) PageGetItem(origpage, itemid);
+		}
+
+		/*
+		 * Truncate first item on the right side to create a new high key for
+		 * the left side.  The high key must be strictly less than all tuples
+		 * on the right side of the split, but can be equal to the last item
+		 * on the left side of the split within leaf pages.
+		 */
+		Assert(lastleft != item);
+		lefthikey = _bt_truncate(rel, lastleft, item, itup_key);
 		itemsz = IndexTupleSize(lefthikey);
 		itemsz = MAXALIGN(itemsz);
 	}
 	else
 		lefthikey = item;
 
-	Assert(BTreeTupleGetNAtts(lefthikey, rel) == indnkeyatts);
+	Assert(BTreeTupleGetNAtts(lefthikey, rel) > 0);
+	Assert(BTreeTupleGetNAtts(lefthikey, rel) <= indnkeyatts);
 	if (PageAddItem(leftpage, (Item) lefthikey, itemsz, leftoff,
 					false, false) == InvalidOffsetNumber)
 	{
@@ -1530,7 +1626,6 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright,
 		xl_btree_split xlrec;
 		uint8		xlinfo;
 		XLogRecPtr	recptr;
-		bool		loglhikey = false;
 
 		xlrec.level = ropaque->btpo.level;
 		xlrec.firstright = firstright;
@@ -1559,22 +1654,10 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright,
 		if (newitemonleft)
 			XLogRegisterBufData(0, (char *) newitem, MAXALIGN(newitemsz));
 
-		/* Log left page */
-		if (!isleaf || indnatts != indnkeyatts)
-		{
-			/*
-			 * We must also log the left page's high key.  There are two
-			 * reasons for that: right page's leftmost key is suppressed on
-			 * non-leaf levels and in covering indexes included columns are
-			 * truncated from high keys.  Show it as belonging to the left
-			 * page buffer, so that it is not stored if XLogInsert decides it
-			 * needs a full-page image of the left page.
-			 */
-			itemid = PageGetItemId(origpage, P_HIKEY);
-			item = (IndexTuple) PageGetItem(origpage, itemid);
-			XLogRegisterBufData(0, (char *) item, MAXALIGN(IndexTupleSize(item)));
-			loglhikey = true;
-		}
+		/* Log the left page's new high key */
+		itemid = PageGetItemId(origpage, P_HIKEY);
+		item = (IndexTuple) PageGetItem(origpage, itemid);
+		XLogRegisterBufData(0, (char *) item, MAXALIGN(IndexTupleSize(item)));
 
 		/*
 		 * Log the contents of the right page in the format understood by
@@ -1590,9 +1673,7 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright,
 							(char *) rightpage + ((PageHeader) rightpage)->pd_upper,
 							((PageHeader) rightpage)->pd_special - ((PageHeader) rightpage)->pd_upper);
 
-		xlinfo = newitemonleft ?
-			(loglhikey ? XLOG_BTREE_SPLIT_L_HIGHKEY : XLOG_BTREE_SPLIT_L) :
-			(loglhikey ? XLOG_BTREE_SPLIT_R_HIGHKEY : XLOG_BTREE_SPLIT_R);
+		xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L : XLOG_BTREE_SPLIT_R;
 		recptr = XLogInsert(RM_BTREE_ID, xlinfo);
 
 		PageSetLSN(origpage, recptr);
@@ -1955,7 +2036,7 @@ _bt_insert_parent(Relation rel,
 			_bt_relbuf(rel, pbuf);
 		}
 
-		/* get high key from left page == lower bound for new right page */
+		/* get high key from left, a strict lower bound for new right page */
 		ritem = (IndexTuple) PageGetItem(page,
 										 PageGetItemId(page, P_HIKEY));
 
@@ -1985,7 +2066,7 @@ _bt_insert_parent(Relation rel,
 				 RelationGetRelationName(rel), bknum, rbknum);
 
 		/* Recursively update the parent */
-		_bt_insertonpg(rel, pbuf, buf, stack->bts_parent,
+		_bt_insertonpg(rel, NULL, pbuf, buf, stack->bts_parent,
 					   new_item, stack->bts_offset + 1,
 					   is_only);
 
@@ -2246,7 +2327,7 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
 	START_CRIT_SECTION();
 
 	/* upgrade metapage if needed */
-	if (metad->btm_version < BTREE_VERSION)
+	if (metad->btm_version < BTREE_NOVAC_VERSION)
 		_bt_upgrademetapage(metapg);
 
 	/* set btree special data */
@@ -2281,7 +2362,8 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
 	/*
 	 * insert the right page pointer into the new root page.
 	 */
-	Assert(BTreeTupleGetNAtts(right_item, rel) ==
+	Assert(BTreeTupleGetNAtts(right_item, rel) > 0);
+	Assert(BTreeTupleGetNAtts(right_item, rel) <=
 		   IndexRelationGetNumberOfKeyAttributes(rel));
 	if (PageAddItem(rootpage, (Item) right_item, right_item_sz, P_FIRSTKEY,
 					false, false) == InvalidOffsetNumber)
@@ -2314,6 +2396,8 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
 		XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD);
 		XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
 
+		Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
+		md.version = metad->btm_version;
 		md.root = rootblknum;
 		md.level = metad->btm_level;
 		md.fastroot = rootblknum;
@@ -2378,6 +2462,7 @@ _bt_pgaddtup(Page page,
 	{
 		trunctuple = *itup;
 		trunctuple.t_info = sizeof(IndexTupleData);
+		/* Deliberately zero INDEX_ALT_TID_MASK bits */
 		BTreeTupleSetNAtts(&trunctuple, 0);
 		itup = &trunctuple;
 		itemsize = sizeof(IndexTupleData);
@@ -2393,8 +2478,8 @@ _bt_pgaddtup(Page page,
 /*
  * _bt_isequal - used in _bt_doinsert in check for duplicates.
  *
- * This is very similar to _bt_compare, except for NULL handling.
- * Rule is simple: NOT_NULL not equal NULL, NULL not equal NULL too.
+ * This is very similar to _bt_compare, except for NULL and negative infinity
+ * handling.  Rule is simple: NOT_NULL not equal NULL, NULL not equal NULL too.
  */
 static bool
 _bt_isequal(TupleDesc itupdesc, BTScanInsert itup_key, Page page,
@@ -2407,6 +2492,7 @@ _bt_isequal(TupleDesc itupdesc, BTScanInsert itup_key, Page page,
 	/* Better be comparing to a non-pivot item */
 	Assert(P_ISLEAF((BTPageOpaque) PageGetSpecialPointer(page)));
 	Assert(offnum >= P_FIRSTDATAKEY((BTPageOpaque) PageGetSpecialPointer(page)));
+	Assert(itup_key->scantid == NULL);
 
 	scankey = itup_key->scankeys;
 	itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index 56041c3d38..72af1ef3c1 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -34,6 +34,7 @@
 #include "utils/snapmgr.h"
 
 static void _bt_cachemetadata(Relation rel, BTMetaPageData *metad);
+static BTMetaPageData *_bt_getmeta(Relation rel, Buffer metabuf);
 static bool _bt_mark_page_halfdead(Relation rel, Buffer buf, BTStack stack);
 static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf,
 						 bool *rightsib_empty);
@@ -77,7 +78,9 @@ _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level)
 }
 
 /*
- *	_bt_upgrademetapage() -- Upgrade a meta-page from an old format to the new.
+ *	_bt_upgrademetapage() -- Upgrade a meta-page from an old format to version
+ *		3, the last version that can be upgraded to without broadly affecting
+ *		on-disk compatibility.  (A REINDEX is required to upgrade to v4.)
  *
  *		This routine does purely in-memory image upgrade.  Caller is
  *		responsible for locking, WAL-logging etc.
@@ -93,11 +96,11 @@ _bt_upgrademetapage(Page page)
 
 	/* It must be really a meta page of upgradable version */
 	Assert(metaopaque->btpo_flags & BTP_META);
-	Assert(metad->btm_version < BTREE_VERSION);
+	Assert(metad->btm_version < BTREE_NOVAC_VERSION);
 	Assert(metad->btm_version >= BTREE_MIN_VERSION);
 
 	/* Set version number and fill extra fields added into version 3 */
-	metad->btm_version = BTREE_VERSION;
+	metad->btm_version = BTREE_NOVAC_VERSION;
 	metad->btm_oldest_btpo_xact = InvalidTransactionId;
 	metad->btm_last_cleanup_num_heap_tuples = -1.0;
 
@@ -107,43 +110,79 @@ _bt_upgrademetapage(Page page)
 }
 
 /*
- * Cache metadata from meta page to rel->rd_amcache.
+ * Cache input metapage data in rel->rd_amcache (v2 input is cached as v3).
  */
 static void
-_bt_cachemetadata(Relation rel, BTMetaPageData *metad)
+_bt_cachemetadata(Relation rel, BTMetaPageData *input)
 {
+	BTMetaPageData *cached_metad;
+
 	/* We assume rel->rd_amcache was already freed by caller */
 	Assert(rel->rd_amcache == NULL);
 	rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
 										 sizeof(BTMetaPageData));
 
-	/*
-	 * Meta page should be of supported version (should be already checked by
-	 * caller).
-	 */
-	Assert(metad->btm_version >= BTREE_MIN_VERSION &&
-		   metad->btm_version <= BTREE_VERSION);
+	/* Meta page should be of supported version */
+	Assert(input->btm_version >= BTREE_MIN_VERSION &&
+		   input->btm_version <= BTREE_VERSION);
 
-	if (metad->btm_version == BTREE_VERSION)
+	cached_metad = (BTMetaPageData *) rel->rd_amcache;
+	if (input->btm_version >= BTREE_NOVAC_VERSION)
 	{
-		/* Last version of meta-data, no need to upgrade */
-		memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
+		/* Version with compatible meta-data, no need to upgrade */
+		memcpy(cached_metad, input, sizeof(BTMetaPageData));
 	}
 	else
 	{
-		BTMetaPageData *cached_metad = (BTMetaPageData *) rel->rd_amcache;
-
 		/*
 		 * Upgrade meta-data: copy available information from meta-page and
 		 * fill new fields with default values.
+		 *
+		 * Note that we cannot upgrade to version 4+ without a REINDEX, since
+		 * extensive on-disk changes are required.
 		 */
-		memcpy(rel->rd_amcache, metad, offsetof(BTMetaPageData, btm_oldest_btpo_xact));
-		cached_metad->btm_version = BTREE_VERSION;
+		memcpy(cached_metad, input, offsetof(BTMetaPageData, btm_oldest_btpo_xact));
+		cached_metad->btm_version = BTREE_NOVAC_VERSION;
 		cached_metad->btm_oldest_btpo_xact = InvalidTransactionId;
 		cached_metad->btm_last_cleanup_num_heap_tuples = -1.0;
 	}
 }
 
+/*
+ * Get metadata from share-locked buffer containing metapage.  Raises ERROR if
+ * the standard sanity checks fail.  The checks here must match _bt_getroot().
+ */
+static BTMetaPageData *
+_bt_getmeta(Relation rel, Buffer metabuf)
+{
+	Page		metapg;
+	BTPageOpaque metaopaque;
+	BTMetaPageData *metad;
+
+	metapg = BufferGetPage(metabuf);
+	metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
+	metad = BTPageGetMeta(metapg);
+
+	/* sanity-check the metapage: needs the meta flag and the btree magic */
+	if (!P_ISMETA(metaopaque) ||
+		metad->btm_magic != BTREE_MAGIC)
+		ereport(ERROR,
+				(errcode(ERRCODE_INDEX_CORRUPTED),
+				 errmsg("index \"%s\" is not a btree",
+						RelationGetRelationName(rel))));
+
+	if (metad->btm_version < BTREE_MIN_VERSION ||
+		metad->btm_version > BTREE_VERSION)
+		ereport(ERROR,
+				(errcode(ERRCODE_INDEX_CORRUPTED),
+				 errmsg("version mismatch in index \"%s\": file version %d, "
+						"current version %d, minimal supported version %d",
+						RelationGetRelationName(rel),
+						metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
+
+	return metad;
+}
+
 /*
  *	_bt_update_meta_cleanup_info() -- Update cleanup-related information in
  *									  the metapage.
@@ -167,7 +206,7 @@ _bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact,
 	metad = BTPageGetMeta(metapg);
 
 	/* outdated version of metapage always needs rewrite */
-	if (metad->btm_version < BTREE_VERSION)
+	if (metad->btm_version < BTREE_NOVAC_VERSION)
 		needsRewrite = true;
 	else if (metad->btm_oldest_btpo_xact != oldestBtpoXact ||
 			 metad->btm_last_cleanup_num_heap_tuples != numHeapTuples)
@@ -186,7 +225,7 @@ _bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact,
 	START_CRIT_SECTION();
 
 	/* upgrade meta-page if needed */
-	if (metad->btm_version < BTREE_VERSION)
+	if (metad->btm_version < BTREE_NOVAC_VERSION)
 		_bt_upgrademetapage(metapg);
 
 	/* update cleanup-related information */
@@ -202,6 +241,8 @@ _bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact,
 		XLogBeginInsert();
 		XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
 
+		Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
+		md.version = metad->btm_version;
 		md.root = metad->btm_root;
 		md.level = metad->btm_level;
 		md.fastroot = metad->btm_fastroot;
@@ -376,7 +417,7 @@ _bt_getroot(Relation rel, int access)
 		START_CRIT_SECTION();
 
 		/* upgrade metapage if needed */
-		if (metad->btm_version < BTREE_VERSION)
+		if (metad->btm_version < BTREE_NOVAC_VERSION)
 			_bt_upgrademetapage(metapg);
 
 		metad->btm_root = rootblkno;
@@ -400,6 +441,8 @@ _bt_getroot(Relation rel, int access)
 			XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
 			XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
 
+			Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
+			md.version = metad->btm_version;
 			md.root = rootblkno;
 			md.level = 0;
 			md.fastroot = rootblkno;
@@ -595,37 +638,12 @@ _bt_getrootheight(Relation rel)
 {
 	BTMetaPageData *metad;
 
-	/*
-	 * We can get what we need from the cached metapage data.  If it's not
-	 * cached yet, load it.  Sanity checks here must match _bt_getroot().
-	 */
 	if (rel->rd_amcache == NULL)
 	{
 		Buffer		metabuf;
-		Page		metapg;
-		BTPageOpaque metaopaque;
 
 		metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
-		metapg = BufferGetPage(metabuf);
-		metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
-		metad = BTPageGetMeta(metapg);
-
-		/* sanity-check the metapage */
-		if (!P_ISMETA(metaopaque) ||
-			metad->btm_magic != BTREE_MAGIC)
-			ereport(ERROR,
-					(errcode(ERRCODE_INDEX_CORRUPTED),
-					 errmsg("index \"%s\" is not a btree",
-							RelationGetRelationName(rel))));
-
-		if (metad->btm_version < BTREE_MIN_VERSION ||
-			metad->btm_version > BTREE_VERSION)
-			ereport(ERROR,
-					(errcode(ERRCODE_INDEX_CORRUPTED),
-					 errmsg("version mismatch in index \"%s\": file version %d, "
-							"current version %d, minimal supported version %d",
-							RelationGetRelationName(rel),
-							metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
+		metad = _bt_getmeta(rel, metabuf);
 
 		/*
 		 * If there's no root page yet, _bt_getroot() doesn't expect a cache
@@ -642,19 +660,70 @@ _bt_getrootheight(Relation rel)
 		 * Cache the metapage data for next time
 		 */
 		_bt_cachemetadata(rel, metad);
-
 		_bt_relbuf(rel, metabuf);
 	}
 
+	/* Get cached copy; _bt_cachemetadata() never leaves it at version 2 */
 	metad = (BTMetaPageData *) rel->rd_amcache;
 	/* We shouldn't have cached it if any of these fail */
 	Assert(metad->btm_magic == BTREE_MAGIC);
-	Assert(metad->btm_version == BTREE_VERSION);
+	Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
 	Assert(metad->btm_fastroot != P_NONE);
 
 	return metad->btm_fastlevel;
 }
 
+/*
+ *	_bt_heapkeyspace() -- is heap TID being treated as a key?
+ *
+ *		This is used to determine the rules that must be used to descend a
+ *		btree.  Version 4 indexes treat heap TID as a tie-breaker attribute.
+ *		pg_upgrade'd version 2 and 3 indexes need extra steps to preserve
+ *		reasonable performance when inserting a new BTScanInsert-wise
+ *		duplicate tuple among many leaf pages already full of such duplicates.
+ */
+bool
+_bt_heapkeyspace(Relation rel)
+{
+	BTMetaPageData *metad;
+
+	if (rel->rd_amcache == NULL)
+	{
+		Buffer		metabuf;
+
+		metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
+		metad = _bt_getmeta(rel, metabuf);
+
+		/*
+		 * If there's no root page yet, _bt_getroot() doesn't expect a cache
+		 * to be made, so just stop here.  (XXX perhaps _bt_getroot() should
+		 * be changed to allow this case.)
+		 */
+		if (metad->btm_root == P_NONE)
+		{
+			uint32		btm_version = metad->btm_version;
+
+			_bt_relbuf(rel, metabuf);
+			return btm_version > BTREE_NOVAC_VERSION;
+		}
+
+		/*
+		 * Cache the metapage data for next time
+		 */
+		_bt_cachemetadata(rel, metad);
+		_bt_relbuf(rel, metabuf);
+	}
+
+	/* Get cached copy; _bt_cachemetadata() never leaves it at version 2 */
+	metad = (BTMetaPageData *) rel->rd_amcache;
+	/* We shouldn't have cached it if any of these fail */
+	Assert(metad->btm_magic == BTREE_MAGIC);
+	Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
+	Assert(metad->btm_fastroot != P_NONE);
+
+	return metad->btm_version > BTREE_NOVAC_VERSION;
+}
+
 /*
  *	_bt_checkpage() -- Verify that a freshly-read page looks sane.
  */
@@ -1123,11 +1192,12 @@ _bt_is_page_halfdead(Relation rel, BlockNumber blk)
  * right sibling.
  *
  * "child" is the leaf page we wish to delete, and "stack" is a search stack
- * leading to it (approximately).  Note that we will update the stack
- * entry(s) to reflect current downlink positions --- this is essentially the
- * same as the corresponding step of splitting, and is not expected to affect
- * caller.  The caller should initialize *target and *rightsib to the leaf
- * page and its right sibling.
+ * leading to it (it actually leads to the leftmost leaf page with a high key
+ * matching that of the page to be deleted in !heapkeyspace indexes).  Note
+ * that we will update the stack entry(s) to reflect current downlink
+ * positions --- this is essentially the same as the corresponding step of
+ * splitting, and is not expected to affect caller.  The caller should
+ * initialize *target and *rightsib to the leaf page and its right sibling.
  *
  * Note: it's OK to release page locks on any internal pages between the leaf
  * and *topparent, because a safe deletion can't become unsafe due to
@@ -1149,8 +1219,10 @@ _bt_lock_branch_parent(Relation rel, BlockNumber child, BTStack stack,
 	BlockNumber leftsib;
 
 	/*
-	 * Locate the downlink of "child" in the parent (updating the stack entry
-	 * if needed)
+	 * Locate the downlink of "child" in the parent, updating the stack entry
+	 * if needed.  This is how !heapkeyspace indexes deal with having
+	 * non-unique high keys in leaf level pages.  Even heapkeyspace indexes
+	 * can have a stale stack due to insertions into the parent.
 	 */
 	stack->bts_btentry = child;
 	pbuf = _bt_getstackbuf(rel, stack);
@@ -1422,6 +1494,8 @@ _bt_pagedel(Relation rel, Buffer buf)
 
 				/* we need an insertion scan key for the search, so build one */
 				itup_key = _bt_mkscankey(rel, targetkey);
+				/* absent attributes need explicit minus infinity values */
+				itup_key->minusinfkey = true;
 				/* get stack to leaf page by searching index */
 				stack = _bt_search(rel, itup_key, &lbuf, BT_READ, NULL);
 				/* don't need a lock or second pin on the page */
@@ -1969,7 +2043,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty)
 	if (BufferIsValid(metabuf))
 	{
 		/* upgrade metapage if needed */
-		if (metad->btm_version < BTREE_VERSION)
+		if (metad->btm_version < BTREE_NOVAC_VERSION)
 			_bt_upgrademetapage(metapg);
 		metad->btm_fastroot = rightsib;
 		metad->btm_fastlevel = targetlevel;
@@ -2017,6 +2091,8 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty)
 		{
 			XLogRegisterBuffer(4, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
 
+			Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
+			xlmeta.version = metad->btm_version;
 			xlmeta.root = metad->btm_root;
 			xlmeta.level = metad->btm_level;
 			xlmeta.fastroot = metad->btm_fastroot;
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 98917de2ef..ec2edae850 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -794,7 +794,7 @@ _bt_vacuum_needs_cleanup(IndexVacuumInfo *info)
 	metapg = BufferGetPage(metabuf);
 	metad = BTPageGetMeta(metapg);
 
-	if (metad->btm_version < BTREE_VERSION)
+	if (metad->btm_version < BTREE_NOVAC_VERSION)
 	{
 		/*
 		 * Do cleanup if metapage needs upgrade, because we don't have
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 7940297305..2999971cfd 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -152,8 +152,12 @@ _bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access,
 		 * downlink (block) to uniquely identify the index entry, in case it
 		 * moves right while we're working lower in the tree.  See the paper
 		 * by Lehman and Yao for how this is detected and handled. (We use the
-		 * child link to disambiguate duplicate keys in the index -- Lehman
-		 * and Yao disallow duplicate keys.)
+		 * child link during the second half of a page split -- if caller ends
+		 * up splitting the child it usually ends up inserting a new pivot
+		 * tuple for child's new right sibling immediately after the original
+		 * bts_offset offset recorded here.  The downlink block will be needed
+		 * to check if bts_offset remains the position of this same pivot
+		 * tuple.)
 		 */
 		new_stack = (BTStack) palloc(sizeof(BTStackData));
 		new_stack->bts_blkno = par_blkno;
@@ -251,11 +255,13 @@ _bt_moveright(Relation rel,
 	/*
 	 * When nextkey = false (normal case): if the scan key that brought us to
 	 * this page is > the high key stored on the page, then the page has split
-	 * and we need to move right.  (If the scan key is equal to the high key,
-	 * we might or might not need to move right; have to scan the page first
-	 * anyway.)
+	 * and we need to move right.  (pg_upgrade'd !heapkeyspace indexes could
+	 * have some duplicates to the right as well as the left, but that's
+	 * something that's only ever dealt with on the leaf level, after
+	 * _bt_search has found an initial leaf page.)
 	 *
 	 * When nextkey = true: move right if the scan key is >= page's high key.
+	 * (Note that key.scantid cannot be set in this case.)
 	 *
 	 * The page could even have split more than once, so scan as far as
 	 * needed.
@@ -358,6 +364,9 @@ _bt_binsrch(Relation rel,
 	page = BufferGetPage(buf);
 	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
 
+	/* Requesting nextkey semantics while using scantid seems nonsensical */
+	Assert(!key->nextkey || key->scantid == NULL);
+
 	if (!key->restorebinsrch)
 	{
 		low = P_FIRSTDATAKEY(opaque);
@@ -367,6 +376,7 @@ _bt_binsrch(Relation rel,
 	else
 	{
 		/* Restore result of previous binary search against same page */
+		Assert(!key->heapkeyspace || key->scantid != NULL);
 		Assert(P_ISLEAF(opaque));
 		low = key->low;
 		high = key->stricthigh;
@@ -446,6 +456,7 @@ _bt_binsrch(Relation rel,
 	if (key->savebinsrch)
 	{
 		Assert(isleaf);
+		Assert(key->scantid == NULL);
 		key->low = low;
 		key->stricthigh = stricthigh;
 		key->savebinsrch = false;
@@ -492,19 +503,31 @@ _bt_compare(Relation rel,
 	TupleDesc	itupdesc = RelationGetDescr(rel);
 	BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
 	IndexTuple	itup;
+	ItemPointer heapTid;
 	ScanKey		scankey;
+	int			ncmpkey;
+	int			ntupatts;
 
-	Assert(_bt_check_natts(rel, page, offnum));
+	Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
 	Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel));
+	Assert(key->heapkeyspace || key->scantid == NULL);
+	Assert(key->minusinfkey || key->heapkeyspace);
 
 	/*
 	 * Force result ">" if target item is first data item on an internal page
 	 * --- see NOTE above.
+	 *
+	 * A minus infinity key has all attributes truncated away, so this test is
+	 * redundant with the minus infinity attribute tie-breaker.  However, the
+	 * number of attributes in minus infinity tuples is not explicitly
+	 * represented as 0 within btree version 2 indexes, so an explicit offnum
+	 * test is still required.
 	 */
 	if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
 		return 1;
 
 	itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
+	ntupatts = BTreeTupleGetNAtts(itup, rel);
 
 	/*
 	 * The scan key is set up with the attribute number associated with each
@@ -518,8 +541,10 @@ _bt_compare(Relation rel,
 	 * _bt_first).
 	 */
 
+	ncmpkey = Min(ntupatts, key->keysz);
+	Assert(key->heapkeyspace || ncmpkey == key->keysz);
 	scankey = key->scankeys;
-	for (int i = 1; i <= key->keysz; i++)
+	for (int i = 1; i <= ncmpkey; i++)
 	{
 		Datum		datum;
 		bool		isNull;
@@ -570,8 +595,65 @@ _bt_compare(Relation rel,
 		scankey++;
 	}
 
-	/* if we get here, the keys are equal */
-	return 0;
+	/*
+	 * All non-truncated attributes (other than heap TID) were found to be
+	 * equal.  Treat truncated attributes as minus infinity when scankey has a
+	 * key attribute value that would otherwise be compared directly.
+	 *
+	 * Note: it doesn't matter if ntupatts includes non-key attributes;
+	 * scankey won't, so explicitly excluding non-key attributes isn't
+	 * necessary.
+	 */
+	if (key->keysz > ntupatts)
+		return 1;
+
+	/*
+	 * Use the heap TID attribute and scantid to try to break the tie.  The
+	 * rules are the same as any other key attribute -- only the
+	 * representation differs.  (This is also a convenient point to check if
+	 * the !minusinfkey optimization can be used.)
+	 */
+	heapTid = BTreeTupleGetHeapTID(itup);
+	if (key->scantid == NULL)
+	{
+		/*
+		 * Most searches (all !minusinfkey searches) are not interested in
+		 * keys where minus infinity is explicitly represented, since that's a
+		 * sentinel value that never appears in non-pivot tuples.  It is safe
+		 * for these searches to have their scankey considered greater than a
+		 * truncated pivot tuple iff the scankey has equal values for
+		 * attributes up to and including the least significant untruncated
+		 * attribute in pivot tuple.  The only would-be "match" that will be
+		 * "missed" is a single leaf page's high key (the leaf page whose high
+		 * key the values from affected pivot tuple originate from).
+		 *
+		 * This optimization prevents an extra leaf page visit when the
+		 * insertion scankey would otherwise be equal.  If this tiebreaker
+		 * wasn't performed, code like _bt_readpage() and _bt_readnextpage()
+		 * would often end up moving right having found no matches on the leaf
+		 * page that their search lands on initially.
+		 *
+		 * Note: the heap TID part of this test ensures that scankey is being
+		 * compared to a pivot tuple with one or more truncated key attributes
+		 * (often though not necessarily just the heap TID attribute).
+		 */
+		if (!key->minusinfkey && key->keysz == ntupatts && heapTid == NULL)
+			return 1;
+
+		/* All provided scankey arguments found to be equal */
+		return 0;
+	}
+
+	/*
+	 * Treat truncated heap TID as minus infinity, since scankey has a key
+	 * attribute value (scantid) that would otherwise be compared directly
+	 */
+	Assert(key->keysz == IndexRelationGetNumberOfKeyAttributes(rel));
+	if (heapTid == NULL)
+		return 1;
+
+	Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel));
+	return ItemPointerCompare(key->scantid, heapTid);
 }
 
 /*
@@ -1088,7 +1170,10 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 	/* Initialize remaining insertion scan key fields */
 	inskey.savebinsrch = inskey.restorebinsrch = false;
 	inskey.low = inskey.stricthigh = InvalidOffsetNumber;
+	inskey.heapkeyspace = _bt_heapkeyspace(rel);
+	inskey.minusinfkey = !inskey.heapkeyspace;
 	inskey.nextkey = nextkey;
+	inskey.scantid = NULL;
 	inskey.keysz = keysCount;
 
 	/*
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c
index 759859c302..67cdb44cf5 100644
--- a/src/backend/access/nbtree/nbtsort.c
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -746,6 +746,7 @@ _bt_sortaddtup(Page page,
 	{
 		trunctuple = *itup;
 		trunctuple.t_info = sizeof(IndexTupleData);
+		/* Deliberately zero INDEX_ALT_TID_MASK bits */
 		BTreeTupleSetNAtts(&trunctuple, 0);
 		itup = &trunctuple;
 		itemsize = sizeof(IndexTupleData);
@@ -799,8 +800,6 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
 	OffsetNumber last_off;
 	Size		pgspc;
 	Size		itupsz;
-	int			indnatts = IndexRelationGetNumberOfAttributes(wstate->index);
-	int			indnkeyatts = IndexRelationGetNumberOfKeyAttributes(wstate->index);
 
 	/*
 	 * This is a handy place to check for cancel interrupts during the btree
@@ -817,27 +816,21 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
 	itupsz = MAXALIGN(itupsz);
 
 	/*
-	 * Check whether the item can fit on a btree page at all. (Eventually, we
-	 * ought to try to apply TOAST methods if not.) We actually need to be
-	 * able to fit three items on every page, so restrict any one item to 1/3
-	 * the per-page available space. Note that at this point, itupsz doesn't
-	 * include the ItemId.
+	 * Check whether the item can fit on a btree page at all.
 	 *
-	 * NOTE: similar code appears in _bt_insertonpg() to defend against
-	 * oversize items being inserted into an already-existing index. But
-	 * during creation of an index, we don't go through there.
+	 * Every newly built index will treat heap TID as part of the keyspace,
+	 * which imposes the requirement that new high keys must occasionally have
+	 * a heap TID appended within _bt_truncate().  That may leave a new pivot
+	 * tuple one or two MAXALIGN() quantums larger than the original first
+	 * right tuple it's derived from.  v4 deals with the problem by decreasing
+	 * the limit on the size of tuples inserted on the leaf level by the same
+	 * small amount.  Enforce the new v4+ limit on the leaf level, and the old
+	 * limit on internal levels, since pivot tuples may need to make use of
+	 * the reserved space.  This should never fail on internal pages.
 	 */
-	if (itupsz > BTMaxItemSize(npage))
-		ereport(ERROR,
-				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
-				 errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
-						itupsz, BTMaxItemSize(npage),
-						RelationGetRelationName(wstate->index)),
-				 errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
-						 "Consider a function index of an MD5 hash of the value, "
-						 "or use full text indexing."),
-				 errtableconstraint(wstate->heap,
-									RelationGetRelationName(wstate->index))));
+	if (unlikely(itupsz > BTMaxItemSize(npage)))
+		_bt_check_third_page(wstate->index, wstate->heap,
+							 state->btps_level == 0, npage, itup);
 
 	/*
 	 * Check to see if page is "full".  It's definitely full if the item won't
@@ -883,24 +876,35 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
 		ItemIdSetUnused(ii);	/* redundant */
 		((PageHeader) opage)->pd_lower -= sizeof(ItemIdData);
 
-		if (indnkeyatts != indnatts && P_ISLEAF(opageop))
+		if (P_ISLEAF(opageop))
 		{
+			IndexTuple	lastleft;
 			IndexTuple	truncated;
 			Size		truncsz;
 
 			/*
-			 * Truncate any non-key attributes from high key on leaf level
-			 * (i.e. truncate on leaf level if we're building an INCLUDE
-			 * index).  This is only done at the leaf level because downlinks
+			 * Truncate away any unneeded attributes from high key on leaf
+			 * level.  This is only done at the leaf level because downlinks
 			 * in internal pages are either negative infinity items, or get
 			 * their contents from copying from one level down.  See also:
 			 * _bt_split().
 			 *
+			 * We don't try to bias our choice of split point to make it more
+			 * likely that _bt_truncate() can truncate away more attributes,
+			 * whereas the split point passed to _bt_split() is chosen much
+			 * more delicately.  Suffix truncation is mostly useful because it
+			 * improves space utilization for workloads with random
+			 * insertions.  It doesn't seem worthwhile to add logic for
+			 * choosing a split point here for a benefit that is bound to be
+			 * much smaller.
+			 *
 			 * Since the truncated tuple is probably smaller than the
 			 * original, it cannot just be copied in place (besides, we want
 			 * to actually save space on the leaf page).  We delete the
 			 * original high key, and add our own truncated high key at the
-			 * same offset.
+			 * same offset.  It's okay if the truncated tuple is slightly
+			 * larger due to containing a heap TID value, since this case is
+			 * known to _bt_check_third_page(), which reserves space.
 			 *
 			 * Note that the page layout won't be changed very much.  oitup is
 			 * already located at the physical beginning of tuple space, so we
@@ -908,7 +912,11 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
 			 * the latter portion of the space occupied by the original tuple.
 			 * This is fairly cheap.
 			 */
-			truncated = _bt_nonkey_truncate(wstate->index, oitup);
+			ii = PageGetItemId(opage, OffsetNumberPrev(last_off));
+			lastleft = (IndexTuple) PageGetItem(opage, ii);
+
+			truncated = _bt_truncate(wstate->index, lastleft, oitup,
+									 wstate->inskey);
 			truncsz = IndexTupleSize(truncated);
 			PageIndexTupleDelete(opage, P_HIKEY);
 			_bt_sortaddtup(opage, truncsz, truncated, P_HIKEY);
@@ -927,8 +935,9 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
 		if (state->btps_next == NULL)
 			state->btps_next = _bt_pagestate(wstate, state->btps_level + 1);
 
-		Assert(BTreeTupleGetNAtts(state->btps_minkey, wstate->index) ==
-			   IndexRelationGetNumberOfKeyAttributes(wstate->index) ||
+		Assert((BTreeTupleGetNAtts(state->btps_minkey, wstate->index) <=
+				IndexRelationGetNumberOfKeyAttributes(wstate->index) &&
+				BTreeTupleGetNAtts(state->btps_minkey, wstate->index) > 0) ||
 			   P_LEFTMOST(opageop));
 		Assert(BTreeTupleGetNAtts(state->btps_minkey, wstate->index) == 0 ||
 			   !P_LEFTMOST(opageop));
@@ -973,7 +982,7 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
 	 * the first item for a page is copied from the prior page in the code
 	 * above.  Since the minimum key for an entire level is only used as a
 	 * minus infinity downlink, and never as a high key, there is no need to
-	 * truncate away non-key attributes at this point.
+	 * truncate away suffix attributes at this point.
 	 */
 	if (last_off == P_HIKEY)
 	{
@@ -1032,8 +1041,9 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
 		}
 		else
 		{
-			Assert(BTreeTupleGetNAtts(s->btps_minkey, wstate->index) ==
-				   IndexRelationGetNumberOfKeyAttributes(wstate->index) ||
+			Assert((BTreeTupleGetNAtts(s->btps_minkey, wstate->index) <=
+					IndexRelationGetNumberOfKeyAttributes(wstate->index) &&
+					BTreeTupleGetNAtts(s->btps_minkey, wstate->index) > 0) ||
 				   P_LEFTMOST(opaque));
 			Assert(BTreeTupleGetNAtts(s->btps_minkey, wstate->index) == 0 ||
 				   !P_LEFTMOST(opaque));
@@ -1126,6 +1136,8 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
 			}
 			else if (itup != NULL)
 			{
+				int32		compare = 0;
+
 				for (i = 1; i <= keysz; i++)
 				{
 					SortSupport entry;
@@ -1133,7 +1145,6 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
 								attrDatum2;
 					bool		isNull1,
 								isNull2;
-					int32		compare;
 
 					entry = sortKeys + i - 1;
 					attrDatum1 = index_getattr(itup, i, tupdes, &isNull1);
@@ -1150,6 +1161,20 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
 					else if (compare < 0)
 						break;
 				}
+
+				/*
+				 * If key values are equal, we sort on ItemPointer.  This is
+				 * required for btree indexes, since heap TID is treated as an
+				 * implicit last key attribute in order to ensure that all
+				 * keys in the index are physically unique.
+				 */
+				if (compare == 0)
+				{
+					compare = ItemPointerCompare(&itup->t_tid, &itup2->t_tid);
+					Assert(compare != 0);
+					if (compare > 0)
+						load1 = false;
+				}
 			}
 			else
 				load1 = false;
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index e010bcdcfa..15090b26d2 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -49,6 +49,8 @@ static void _bt_mark_scankey_required(ScanKey skey);
 static bool _bt_check_rowcompare(ScanKey skey,
 					 IndexTuple tuple, TupleDesc tupdesc,
 					 ScanDirection dir, bool *continuescan);
+static int _bt_keep_natts(Relation rel, IndexTuple lastleft,
+			   IndexTuple firstright, BTScanInsert itup_key);
 
 
 /*
@@ -56,9 +58,25 @@ static bool _bt_check_rowcompare(ScanKey skey,
  *		Build an insertion scan key that contains comparison data from itup
  *		as well as comparator routines appropriate to the key datatypes.
  *
- *		Result is intended for use with _bt_compare().  Callers that don't
- *		need to fill out the insertion scankey arguments (e.g. they use an own
- *		ad-hoc comparison routine) can pass a NULL index tuple.
+ *		When itup is a non-pivot tuple, the returned insertion scan key is
+ *		suitable for finding a place for it to go on the leaf level.  Pivot
+ *		tuples can be used to relocate leaf page with matching high key, but
+ *		then caller needs to set scan key's minusinfkey field.  This can be
+ *		thought of as explicitly representing that absent attributes in scan
+ *		key have minus infinity values.
+ *
+ *		Result is intended for use with _bt_compare() and _bt_truncate().
+ *		Callers that don't need to fill out the insertion scankey arguments
+ *		(e.g. they use their own ad-hoc comparison routine, or only need a
+ *		scankey for _bt_truncate()) can pass a NULL index tuple.  The
+ *		scankey will be initialized as if an "all truncated" pivot tuple
+ *		was passed instead.
+ *
+ *		Note that we may occasionally have to share lock the metapage to
+ *		determine whether or not the keys in the index are expected to be
+ *		unique (i.e. if this is a "heapkeyspace" index).  We assume a
+ *		heapkeyspace index when caller passes a NULL tuple, allowing index
+ *		build callers to avoid accessing the non-existent metapage.
  */
 BTScanInsert
 _bt_mkscankey(Relation rel, IndexTuple itup)
@@ -79,15 +97,38 @@ _bt_mkscankey(Relation rel, IndexTuple itup)
 	Assert(tupnatts <= IndexRelationGetNumberOfAttributes(rel));
 
 	/*
-	 * We'll execute search using scan key constructed on key columns. Non-key
-	 * (INCLUDE index) columns are always omitted from scan keys.
+	 * We'll execute search using scan key constructed on key columns.
+	 * Truncated attributes and non-key attributes are omitted from the final
+	 * scan key.
 	 */
 	key = palloc(offsetof(BTScanInsertData, scankeys) +
 				 sizeof(ScanKeyData) * indnkeyatts);
+	key->heapkeyspace = itup == NULL || _bt_heapkeyspace(rel);
+
+	/*
+	 * Only heapkeyspace indexes support the "no minus infinity keys"
+	 * optimization.  !heapkeyspace indexes don't actually have minus infinity
+	 * attributes, but this allows us to avoid checking heapkeyspace
+	 * separately (explicit representation of number of key attributes in v3
+	 * indexes shouldn't confuse tie-breaker logic).
+	 *
+	 * There is never a need to explicitly represent truncated attributes as
+	 * having minus infinity values.  The only caller that may truly need to
+	 * search for negative infinity is the page deletion code.  It is
+	 * sufficient to omit trailing truncated attributes from the scankey
+	 * returned to that caller because caller relies on the fact that there
+	 * cannot be duplicate high keys in heapkeyspace indexes.  Caller also
+	 * opts out of the "no minus infinity key" optimization, so search moves
+	 * left on scankey-equal downlink in parent, allowing VACUUM caller to
+	 * reliably relocate leaf page undergoing deletion.
+	 */
+	key->minusinfkey = !key->heapkeyspace;
 	key->savebinsrch = key->restorebinsrch = false;
 	key->low = key->stricthigh = InvalidOffsetNumber;
 	key->nextkey = false;
 	key->keysz = Min(indnkeyatts, tupnatts);
+	key->scantid = key->heapkeyspace && itup ?
+		BTreeTupleGetHeapTID(itup) : NULL;
 	skey = key->scankeys;
 	for (i = 0; i < indnkeyatts; i++)
 	{
@@ -103,9 +144,9 @@ _bt_mkscankey(Relation rel, IndexTuple itup)
 		procinfo = index_getprocinfo(rel, i + 1, BTORDER_PROC);
 
 		/*
-		 * Key arguments built when caller provides no tuple are defensively
-		 * represented as NULL values, though they should still not
-		 * participate in comparisons.
+		 * Key arguments built from truncated attributes (or when caller
+		 * provides no tuple) are defensively represented as NULL values,
+		 * though they should still not participate in comparisons.
 		 */
 		if (i < tupnatts)
 			arg = index_getattr(itup, i + 1, itupdesc, &null);
@@ -2043,38 +2084,238 @@ btproperty(Oid index_oid, int attno,
 }
 
 /*
- *	_bt_nonkey_truncate() -- create tuple without non-key suffix attributes.
+ *	_bt_truncate() -- create tuple without unneeded suffix attributes.
  *
- * Returns truncated index tuple allocated in caller's memory context, with key
- * attributes copied from caller's itup argument.  Currently, suffix truncation
- * is only performed to create pivot tuples in INCLUDE indexes, but some day it
- * could be generalized to remove suffix attributes after the first
- * distinguishing key attribute.
+ * Returns truncated pivot index tuple allocated in caller's memory context,
+ * with key attributes copied from caller's firstright argument.  If rel is
+ * an INCLUDE index, non-key attributes will definitely be truncated away,
+ * since they're not part of the key space.  More aggressive suffix
+ * truncation can take place when it's clear that the returned tuple does not
+ * need one or more suffix key attributes.  We only need to keep firstright
+ * attributes up to and including the first non-lastleft-equal attribute.
+ * Caller's insertion scankey is used to compare the tuples; the scankey's
+ * argument values are not considered here.
  *
- * Truncated tuple is guaranteed to be no larger than the original, which is
- * important for staying under the 1/3 of a page restriction on tuple size.
+ * Sometimes this routine will return a new pivot tuple that takes up more
+ * space than firstright, because a new heap TID attribute had to be added to
+ * distinguish lastleft from firstright.  This should only happen when the
+ * caller is in the process of splitting a leaf page that has many logical
+ * duplicates, where it's unavoidable.
  *
  * Note that returned tuple's t_tid offset will hold the number of attributes
  * present, so the original item pointer offset is not represented.  Caller
- * should only change truncated tuple's downlink.
+ * should only change truncated tuple's downlink.  Note also that truncated
+ * key attributes are treated as containing "minus infinity" values by
+ * _bt_compare().
+ *
+ * In the worst case (when a heap TID is appended) the size of the returned
+ * tuple is the size of the first right tuple plus an additional MAXALIGN()'d
+ * item pointer.  This guarantee is important, since callers need to stay
+ * under the 1/3 of a page restriction on tuple size.  If this routine is ever
+ * taught to truncate within an attribute/datum, it will need to avoid
+ * returning an enlarged tuple to caller when truncation + TOAST compression
+ * ends up enlarging the final datum.
  */
 IndexTuple
-_bt_nonkey_truncate(Relation rel, IndexTuple itup)
+_bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright,
+			 BTScanInsert itup_key)
 {
-	int			nkeyattrs = IndexRelationGetNumberOfKeyAttributes(rel);
-	IndexTuple	truncated;
+	TupleDesc	itupdesc = RelationGetDescr(rel);
+	int16		natts = IndexRelationGetNumberOfAttributes(rel);
+	int16		nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
+	int			keepnatts;
+	IndexTuple	pivot;
+	ItemPointer pivotheaptid;
+	Size		newsize;
 
 	/*
-	 * We should only ever truncate leaf index tuples, which must have both
-	 * key and non-key attributes.  It's never okay to truncate a second time.
+	 * We should only ever truncate leaf index tuples.  It's never okay to
+	 * truncate a second time.
 	 */
-	Assert(BTreeTupleGetNAtts(itup, rel) ==
-		   IndexRelationGetNumberOfAttributes(rel));
+	Assert(BTreeTupleGetNAtts(lastleft, rel) == natts);
+	Assert(BTreeTupleGetNAtts(firstright, rel) == natts);
 
-	truncated = index_truncate_tuple(RelationGetDescr(rel), itup, nkeyattrs);
-	BTreeTupleSetNAtts(truncated, nkeyattrs);
+	/* Determine how many attributes must be kept in truncated tuple */
+	keepnatts = _bt_keep_natts(rel, lastleft, firstright, itup_key);
 
-	return truncated;
+#ifdef DEBUG_NO_TRUNCATE
+	/* Force truncation to be ineffective for testing purposes */
+	keepnatts = nkeyatts + 1;
+#endif
+
+	if (keepnatts <= natts)
+	{
+		IndexTuple	tidpivot;
+
+		/*
+		 * Never keep non-key attributes (keepnatts may be nkeyatts + 1).
+		 * A distinguishing key attribute makes an explicit heap TID unneeded.
+		 */
+		pivot = index_truncate_tuple(itupdesc, firstright,
+									 Min(keepnatts, nkeyatts));
+		if (keepnatts <= nkeyatts)
+		{
+			BTreeTupleSetNAtts(pivot, keepnatts);
+			return pivot;
+		}
+
+		/*
+		 * Only truncation of non-key attributes was possible, since key
+		 * attributes are all equal.  It's necessary to add a heap TID
+		 * attribute to the new pivot tuple.
+		 */
+		Assert(natts != nkeyatts);
+		newsize = IndexTupleSize(pivot) + MAXALIGN(sizeof(ItemPointerData));
+		tidpivot = palloc0(newsize);
+		memcpy(tidpivot, pivot, IndexTupleSize(pivot));
+		/* cannot leak memory here */
+		pfree(pivot);
+		pivot = tidpivot;
+	}
+	else
+	{
+		/*
+		 * No truncation was possible, since key attributes are all equal.
+		 * It's necessary to add a heap TID attribute to the new pivot tuple.
+		 */
+		Assert(natts == nkeyatts);
+		newsize = IndexTupleSize(firstright) + MAXALIGN(sizeof(ItemPointerData));
+		pivot = palloc0(newsize);
+		memcpy(pivot, firstright, IndexTupleSize(firstright));
+	}
+
+	/*
+	 * We have to use heap TID as a unique-ifier in the new pivot tuple, since
+	 * no non-TID key attribute in the right item readily distinguishes the
+	 * right side of the split from the left side.  Use enlarged space that
+	 * holds a copy of first right tuple; place a heap TID value within the
+	 * extra space that remains at the end.
+	 *
+	 * nbtree conceptualizes this case as an inability to truncate away any
+	 * key attribute.  We must use an alternative representation of heap TID
+	 * within pivots because heap TID is only treated as an attribute within
+	 * nbtree (e.g., there is no explicit pg_attribute entry).
+	 */
+	Assert(itup_key->heapkeyspace);
+	pivot->t_info &= ~INDEX_SIZE_MASK;
+	pivot->t_info |= newsize;
+
+	/*
+	 * Lehman & Yao use lastleft as the leaf high key in all cases, but don't
+	 * consider suffix truncation.  It seems like a good idea to follow that
+	 * example in cases where no truncation takes place -- use lastleft's heap
+	 * TID.  (This is also the closest value to negative infinity that's
+	 * legally usable.)
+	 */
+	pivotheaptid = (ItemPointer) ((char *) pivot + newsize -
+								  sizeof(ItemPointerData));
+	ItemPointerCopy(&lastleft->t_tid, pivotheaptid);
+
+	/*
+	 * Lehman and Yao require that the downlink to the right page, which is to
+	 * be inserted into the parent page in the second phase of a page split be
+	 * a strict lower bound on items on the right page, and a non-strict upper
+	 * bound for items on the left page.  Assert that heap TIDs follow these
+	 * invariants, since a heap TID value is apparently needed as a
+	 * tiebreaker.
+	 */
+#ifndef DEBUG_NO_TRUNCATE
+	Assert(ItemPointerCompare(&lastleft->t_tid, &firstright->t_tid) < 0);
+	Assert(ItemPointerCompare(pivotheaptid, &lastleft->t_tid) >= 0);
+	Assert(ItemPointerCompare(pivotheaptid, &firstright->t_tid) < 0);
+#else
+
+	/*
+	 * Those invariants aren't guaranteed to hold for lastleft + firstright
+	 * heap TID attribute values when they're considered here only because
+	 * DEBUG_NO_TRUNCATE is defined (a heap TID is probably not actually
+	 * needed as a tiebreaker).  DEBUG_NO_TRUNCATE must therefore use a heap
+	 * TID value that always works as a strict lower bound for items to the
+	 * right.  In particular, it must avoid using firstright's leading key
+	 * attribute values along with lastleft's heap TID value when lastleft's
+	 * TID happens to be greater than firstright's TID.
+	 *
+	 * (We could just use all of lastleft instead, but that would complicate
+	 * caller's free space accounting, which makes the assumption that the new
+	 * pivot must be no larger than firstright plus a MAXALIGN()'d item
+	 * pointer.)
+	 */
+	ItemPointerCopy(&firstright->t_tid, pivotheaptid);
+
+	/*
+	 * Pivot heap TID should never be fully equal to firstright.  Note that
+	 * the pivot heap TID will still end up equal to lastleft's heap TID when
+	 * that's the only value that's legally usable.
+	 */
+	ItemPointerSetOffsetNumber(pivotheaptid,
+							   OffsetNumberPrev(ItemPointerGetOffsetNumber(pivotheaptid)));
+	Assert(ItemPointerCompare(pivotheaptid, &firstright->t_tid) < 0);
+#endif
+
+	BTreeTupleSetNAtts(pivot, nkeyatts);
+	BTreeTupleSetAltHeapTID(pivot);
+
+	return pivot;
+}
+
+/*
+ * _bt_keep_natts - count the attributes a new pivot tuple has to retain.
+ *
+ * lastleft and firstright are the two tuples on either side of a split
+ * point.  Only the comparison functions and collations stored in caller's
+ * insertion scankey are used; its argument values are ignored.
+ *
+ * The result is the position of the first key attribute that differs
+ * between the two tuples.  When every key attribute is equal the result is
+ * one greater than the number of key attributes, which tells caller that a
+ * heap TID must be added to the new pivot tuple as a unique-ifier.
+ */
+static int
+_bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright,
+			   BTScanInsert itup_key)
+{
+	TupleDesc	tupdesc = RelationGetDescr(rel);
+	int			nkeys = IndexRelationGetNumberOfKeyAttributes(rel);
+	int			att;
+
+	/*
+	 * BTREE_VERSION 3 pivot tuples must look the same across Postgres
+	 * versions, so key attributes are never truncated away there.  This
+	 * keeps things simple for tools that verify multiple versions.
+	 */
+	if (!itup_key->heapkeyspace)
+	{
+		Assert(nkeys != IndexRelationGetNumberOfAttributes(rel));
+		return nkeys;
+	}
+
+	/*
+	 * Walk the key attributes in order, stopping at the first one that
+	 * distinguishes lastleft from firstright.  att doubles as the result:
+	 * it is nkeys + 1 if the loop runs to completion.
+	 */
+	for (att = 1; att <= nkeys; att++)
+	{
+		ScanKey		cmp = &itup_key->scankeys[att - 1];
+		Datum		leftval,
+					rightval;
+		bool		leftnull,
+					rightnull;
+
+		leftval = index_getattr(lastleft, att, tupdesc, &leftnull);
+		rightval = index_getattr(firstright, att, tupdesc, &rightnull);
+		/* A NULL never equals a non-NULL value */
+		if (leftnull != rightnull)
+			break;
+
+		/* Two NULLs are treated as equal; otherwise ask the comparator */
+		if (!leftnull &&
+			DatumGetInt32(FunctionCall2Coll(&cmp->sk_func, cmp->sk_collation,
+											leftval, rightval)) != 0)
+			break;
+	}
+
+	return att;
+}
 
 /*
@@ -2088,15 +2329,17 @@ _bt_nonkey_truncate(Relation rel, IndexTuple itup)
  * preferred to calling here.  That's usually more convenient, and is always
  * more explicit.  Call here instead when offnum's tuple may be a negative
  * infinity tuple that uses the pre-v11 on-disk representation, or when a low
- * context check is appropriate.
+ * context check is appropriate.  This routine is as strict as possible about
+ * what is expected on each version of btree.
  */
 bool
-_bt_check_natts(Relation rel, Page page, OffsetNumber offnum)
+_bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
 {
 	int16		natts = IndexRelationGetNumberOfAttributes(rel);
 	int16		nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
 	BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
 	IndexTuple	itup;
+	int			tupnatts;
 
 	/*
 	 * We cannot reliably test a deleted or half-deleted page, since they have
@@ -2116,16 +2359,26 @@ _bt_check_natts(Relation rel, Page page, OffsetNumber offnum)
 					 "BT_N_KEYS_OFFSET_MASK can't fit INDEX_MAX_KEYS");
 
 	itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
+	tupnatts = BTreeTupleGetNAtts(itup, rel);
 
 	if (P_ISLEAF(opaque))
 	{
 		if (offnum >= P_FIRSTDATAKEY(opaque))
 		{
 			/*
-			 * Leaf tuples that are not the page high key (non-pivot tuples)
-			 * should never be truncated
+			 * Non-pivot tuples currently never use alternative heap TID
+			 * representation -- even those within heapkeyspace indexes
 			 */
-			return BTreeTupleGetNAtts(itup, rel) == natts;
+			if ((itup->t_info & INDEX_ALT_TID_MASK) != 0)
+				return false;
+
+			/*
+			 * Leaf tuples that are not the page high key (non-pivot tuples)
+			 * should never be truncated.  (Note that tupnatts must have been
+			 * inferred, rather than coming from an explicit on-disk
+			 * representation.)
+			 */
+			return tupnatts == natts;
 		}
 		else
 		{
@@ -2135,8 +2388,15 @@ _bt_check_natts(Relation rel, Page page, OffsetNumber offnum)
 			 */
 			Assert(!P_RIGHTMOST(opaque));
 
-			/* Page high key tuple contains only key attributes */
-			return BTreeTupleGetNAtts(itup, rel) == nkeyatts;
+			/*
+			 * !heapkeyspace high key tuple contains only key attributes. Note
+			 * that tupnatts will only have been explicitly represented in
+			 * !heapkeyspace indexes that happen to have non-key attributes.
+			 */
+			if (!heapkeyspace)
+				return tupnatts == nkeyatts;
+
+			/* Use generic heapkeyspace pivot tuple handling */
 		}
 	}
 	else						/* !P_ISLEAF(opaque) */
@@ -2148,7 +2408,11 @@ _bt_check_natts(Relation rel, Page page, OffsetNumber offnum)
 			 * its high key) is its negative infinity tuple.  Negative
 			 * infinity tuples are always truncated to zero attributes.  They
 			 * are a particular kind of pivot tuple.
-			 *
+			 */
+			if (heapkeyspace)
+				return tupnatts == 0;
+
+			/*
 			 * The number of attributes won't be explicitly represented if the
 			 * negative infinity tuple was generated during a page split that
 			 * occurred with a version of Postgres before v11.  There must be
@@ -2159,18 +2423,109 @@ _bt_check_natts(Relation rel, Page page, OffsetNumber offnum)
 			 * Prior to v11, downlinks always had P_HIKEY as their offset. Use
 			 * that to decide if the tuple is a pre-v11 tuple.
 			 */
-			return BTreeTupleGetNAtts(itup, rel) == 0 ||
+			return tupnatts == 0 ||
 				((itup->t_info & INDEX_ALT_TID_MASK) == 0 &&
 				 ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY);
 		}
 		else
 		{
 			/*
-			 * Tuple contains only key attributes despite on is it page high
-			 * key or not
+			 * !heapkeyspace downlink tuple with separator key contains only
+			 * key attributes.  Note that tupnatts will only have been
+			 * explicitly represented in !heapkeyspace indexes that happen to
+			 * have non-key attributes.
 			 */
-			return BTreeTupleGetNAtts(itup, rel) == nkeyatts;
+			if (!heapkeyspace)
+				return tupnatts == nkeyatts;
+
+			/* Use generic heapkeyspace pivot tuple handling */
 		}
 
 	}
+
+	/* Handle heapkeyspace pivot tuples (excluding minus infinity items) */
+	Assert(heapkeyspace);
+
+	/*
+	 * Explicit representation of the number of attributes is mandatory with
+	 * heapkeyspace index pivot tuples, regardless of whether or not there are
+	 * non-key attributes.
+	 */
+	if ((itup->t_info & INDEX_ALT_TID_MASK) == 0)
+		return false;
+
+	/*
+	 * Heap TID is a tie-breaker key attribute, so it cannot be untruncated
+	 * when any other key attribute is truncated
+	 */
+	if (BTreeTupleGetHeapTID(itup) != NULL && tupnatts != nkeyatts)
+		return false;
+
+	/*
+	 * Pivot tuple must have at least one untruncated key attribute (minus
+	 * infinity pivot tuples are the only exception).  Pivot tuples can never
+	 * represent that there is a value present for a key attribute that
+	 * exceeds pg_index.indnkeyatts for the index.
+	 */
+	return tupnatts > 0 && tupnatts <= nkeyatts;
+}
+
+/*
+ *	_bt_check_third_page() -- raise an error if tuple is too big for a page.
+ *
+ * Every btree page must be able to hold three items, so no single item may
+ * use more than 1/3 of the space available on a page.  The size that gets
+ * checked is newtup's MAXALIGN()'d size, which excludes ItemId overhead.
+ *
+ * A tuple that is no larger than BTMaxItemSize() always passes.  When
+ * needheaptidspace is false, BTMaxItemSizeNoHeapTid() is accepted too.
+ *
+ * It might be useful to apply TOAST methods rather than throw an error here.
+ * Using out of line storage would break assumptions made by suffix truncation
+ * and by contrib/amcheck, though.
+ */
+void
+_bt_check_third_page(Relation rel, Relation heap, bool needheaptidspace,
+					 Page page, IndexTuple newtup)
+{
+	Size		itemsz = MAXALIGN(IndexTupleSize(newtup));
+	BTPageOpaque opaque;
+
+	/*
+	 * Nothing to do when the tuple fits.  The slightly higher limit only
+	 * applies to version 2 and version 3 indexes and to internal pages,
+	 * which is what a false needheaptidspace argument indicates.
+	 */
+	if (itemsz <= BTMaxItemSize(page) ||
+		(!needheaptidspace && itemsz <= BTMaxItemSizeNoHeapTid(page)))
+		return;
+
+	/*
+	 * Internal page insertions cannot fail here, because that would mean that
+	 * an earlier leaf level insertion that should have failed didn't
+	 */
+	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+	if (!P_ISLEAF(opaque))
+		elog(ERROR, "cannot insert oversized tuple of size %zu on internal page of index \"%s\"",
+			 itemsz, RelationGetRelationName(rel));
+
+	/*
+	 * Report the btree version and the limit that apply to this caller
+	 */
+	ereport(ERROR,
+			(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+			 errmsg("index row size %zu exceeds btree version %u maximum %zu for index \"%s\"",
+					itemsz,
+					needheaptidspace ? BTREE_VERSION : BTREE_NOVAC_VERSION,
+					needheaptidspace ? BTMaxItemSize(page) :
+					BTMaxItemSizeNoHeapTid(page),
+					RelationGetRelationName(rel)),
+			 errdetail("Index row references tuple (%u,%u) in relation \"%s\".",
+					   ItemPointerGetBlockNumber(&newtup->t_tid),
+					   ItemPointerGetOffsetNumber(&newtup->t_tid),
+					   RelationGetRelationName(heap)),
+			 errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
+					 "Consider a function index of an MD5 hash of the value, "
+					 "or use full text indexing."),
+			 errtableconstraint(heap, RelationGetRelationName(rel))));
+}
diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c
index b0666b42df..876ff0c40f 100644
--- a/src/backend/access/nbtree/nbtxlog.c
+++ b/src/backend/access/nbtree/nbtxlog.c
@@ -103,7 +103,7 @@ _bt_restore_meta(XLogReaderState *record, uint8 block_id)
 
 	md = BTPageGetMeta(metapg);
 	md->btm_magic = BTREE_MAGIC;
-	md->btm_version = BTREE_VERSION;
+	md->btm_version = xlrec->version;
 	md->btm_root = xlrec->root;
 	md->btm_level = xlrec->level;
 	md->btm_fastroot = xlrec->fastroot;
@@ -202,7 +202,7 @@ btree_xlog_insert(bool isleaf, bool ismeta, XLogReaderState *record)
 }
 
 static void
-btree_xlog_split(bool onleft, bool lhighkey, XLogReaderState *record)
+btree_xlog_split(bool onleft, XLogReaderState *record)
 {
 	XLogRecPtr	lsn = record->EndRecPtr;
 	xl_btree_split *xlrec = (xl_btree_split *) XLogRecGetData(record);
@@ -213,8 +213,6 @@ btree_xlog_split(bool onleft, bool lhighkey, XLogReaderState *record)
 	BTPageOpaque ropaque;
 	char	   *datapos;
 	Size		datalen;
-	IndexTuple	left_hikey = NULL;
-	Size		left_hikeysz = 0;
 	BlockNumber leftsib;
 	BlockNumber rightsib;
 	BlockNumber rnext;
@@ -248,20 +246,6 @@ btree_xlog_split(bool onleft, bool lhighkey, XLogReaderState *record)
 
 	_bt_restore_page(rpage, datapos, datalen);
 
-	/*
-	 * When the high key isn't present is the wal record, then we assume it to
-	 * be equal to the first key on the right page.  It must be from the leaf
-	 * level.
-	 */
-	if (!lhighkey)
-	{
-		ItemId		hiItemId = PageGetItemId(rpage, P_FIRSTDATAKEY(ropaque));
-
-		Assert(isleaf);
-		left_hikey = (IndexTuple) PageGetItem(rpage, hiItemId);
-		left_hikeysz = ItemIdGetLength(hiItemId);
-	}
-
 	PageSetLSN(rpage, lsn);
 	MarkBufferDirty(rbuf);
 
@@ -284,6 +268,8 @@ btree_xlog_split(bool onleft, bool lhighkey, XLogReaderState *record)
 		OffsetNumber off;
 		IndexTuple	newitem = NULL;
 		Size		newitemsz = 0;
+		IndexTuple	left_hikey = NULL;
+		Size		left_hikeysz = 0;
 		Page		newlpage;
 		OffsetNumber leftoff;
 
@@ -298,13 +284,10 @@ btree_xlog_split(bool onleft, bool lhighkey, XLogReaderState *record)
 		}
 
 		/* Extract left hikey and its size (assuming 16-bit alignment) */
-		if (lhighkey)
-		{
-			left_hikey = (IndexTuple) datapos;
-			left_hikeysz = MAXALIGN(IndexTupleSize(left_hikey));
-			datapos += left_hikeysz;
-			datalen -= left_hikeysz;
-		}
+		left_hikey = (IndexTuple) datapos;
+		left_hikeysz = MAXALIGN(IndexTupleSize(left_hikey));
+		datapos += left_hikeysz;
+		datalen -= left_hikeysz;
 
 		Assert(datalen == 0);
 
@@ -1003,16 +986,10 @@ btree_redo(XLogReaderState *record)
 			btree_xlog_insert(false, true, record);
 			break;
 		case XLOG_BTREE_SPLIT_L:
-			btree_xlog_split(true, false, record);
-			break;
-		case XLOG_BTREE_SPLIT_L_HIGHKEY:
-			btree_xlog_split(true, true, record);
+			btree_xlog_split(true, record);
 			break;
 		case XLOG_BTREE_SPLIT_R:
-			btree_xlog_split(false, false, record);
-			break;
-		case XLOG_BTREE_SPLIT_R_HIGHKEY:
-			btree_xlog_split(false, true, record);
+			btree_xlog_split(false, record);
 			break;
 		case XLOG_BTREE_VACUUM:
 			btree_xlog_vacuum(record);
diff --git a/src/backend/access/rmgrdesc/nbtdesc.c b/src/backend/access/rmgrdesc/nbtdesc.c
index 8d5c6ae0ab..fcac0cd8a9 100644
--- a/src/backend/access/rmgrdesc/nbtdesc.c
+++ b/src/backend/access/rmgrdesc/nbtdesc.c
@@ -35,8 +35,6 @@ btree_desc(StringInfo buf, XLogReaderState *record)
 			}
 		case XLOG_BTREE_SPLIT_L:
 		case XLOG_BTREE_SPLIT_R:
-		case XLOG_BTREE_SPLIT_L_HIGHKEY:
-		case XLOG_BTREE_SPLIT_R_HIGHKEY:
 			{
 				xl_btree_split *xlrec = (xl_btree_split *) rec;
 
@@ -130,12 +128,6 @@ btree_identify(uint8 info)
 		case XLOG_BTREE_SPLIT_R:
 			id = "SPLIT_R";
 			break;
-		case XLOG_BTREE_SPLIT_L_HIGHKEY:
-			id = "SPLIT_L_HIGHKEY";
-			break;
-		case XLOG_BTREE_SPLIT_R_HIGHKEY:
-			id = "SPLIT_R_HIGHKEY";
-			break;
 		case XLOG_BTREE_VACUUM:
 			id = "VACUUM";
 			break;
diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c
index f97a82ae7b..5b7637883e 100644
--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
@@ -4057,9 +4057,10 @@ comparetup_index_btree(const SortTuple *a, const SortTuple *b,
 	}
 
 	/*
-	 * If key values are equal, we sort on ItemPointer.  This does not affect
-	 * validity of the finished index, but it may be useful to have index
-	 * scans in physical order.
+	 * If key values are equal, we sort on ItemPointer.  This is required for
+	 * btree indexes, since heap TID is treated as an implicit last key
+	 * attribute in order to ensure that all keys in the index are physically
+	 * unique.
 	 */
 	{
 		BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid);
@@ -4076,6 +4077,9 @@ comparetup_index_btree(const SortTuple *a, const SortTuple *b,
 			return (pos1 < pos2) ? -1 : 1;
 	}
 
+	/* ItemPointer values should never be equal */
+	Assert(false);
+
 	return 0;
 }
 
@@ -4128,6 +4132,9 @@ comparetup_index_hash(const SortTuple *a, const SortTuple *b,
 			return (pos1 < pos2) ? -1 : 1;
 	}
 
+	/* ItemPointer values should never be equal */
+	Assert(false);
+
 	return 0;
 }
 
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 950a61958d..9332bf4086 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -112,18 +112,44 @@ typedef struct BTMetaPageData
 #define BTPageGetMeta(p) \
 	((BTMetaPageData *) PageGetContents(p))
 
+/*
+ * The current Btree version is 4.  That's what you'll get when you create
+ * a new index.
+ *
+ * Btree version 3 was used in PostgreSQL v11.  It is mostly the same as
+ * version 4, but heap TIDs were not part of the keyspace.  Index tuples
+ * with duplicate keys could be stored in any order.  We continue to
+ * support reading and writing Btree version 3, so that such indexes
+ * don't need to be immediately re-indexed at pg_upgrade.  In order to get
+ * the new heapkeyspace semantics, however, a REINDEX is needed.
+ *
+ * Btree version 2 is the same as version 3, except for two new fields in
+ * the metapage that were introduced in version 3.  A version 2 metapage
+ * will be automatically upgraded to version 3 on the first insert to it.
+ */
 #define BTREE_METAPAGE	0		/* first page is meta */
-#define BTREE_MAGIC		0x053162	/* magic number of btree pages */
-#define BTREE_VERSION	3		/* current version number */
+#define BTREE_MAGIC		0x053162	/* magic number in metapage */
+#define BTREE_VERSION	4		/* current version number */
 #define BTREE_MIN_VERSION	2	/* minimal supported version number */
+#define BTREE_NOVAC_VERSION	3	/* minimal version with all meta fields */
 
 /*
  * Maximum size of a btree index entry, including its tuple header.
  *
  * We actually need to be able to fit three items on every page,
  * so restrict any one item to 1/3 the per-page available space.
+ *
+ * There are rare cases where _bt_truncate() will need to enlarge
+ * a heap index tuple to make space for a tie-breaker heap TID
+ * attribute, which we account for here.
  */
 #define BTMaxItemSize(page) \
+	MAXALIGN_DOWN((PageGetPageSize(page) - \
+				   MAXALIGN(SizeOfPageHeaderData + \
+							3*sizeof(ItemIdData)  + \
+							3*sizeof(ItemPointerData)) - \
+				   MAXALIGN(sizeof(BTPageOpaqueData))) / 3)
+#define BTMaxItemSizeNoHeapTid(page) \
 	MAXALIGN_DOWN((PageGetPageSize(page) - \
 				   MAXALIGN(SizeOfPageHeaderData + 3*sizeof(ItemIdData)) - \
 				   MAXALIGN(sizeof(BTPageOpaqueData))) / 3)
@@ -187,38 +213,71 @@ typedef struct BTMetaPageData
 #define P_FIRSTDATAKEY(opaque)	(P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY)
 
 /*
+ *
+ * Notes on B-Tree tuple format, and key and non-key attributes:
+ *
  * INCLUDE B-Tree indexes have non-key attributes.  These are extra
  * attributes that may be returned by index-only scans, but do not influence
  * the order of items in the index (formally, non-key attributes are not
  * considered to be part of the key space).  Non-key attributes are only
  * present in leaf index tuples whose item pointers actually point to heap
- * tuples.  All other types of index tuples (collectively, "pivot" tuples)
- * only have key attributes, since pivot tuples only ever need to represent
- * how the key space is separated.  In general, any B-Tree index that has
- * more than one level (i.e. any index that does not just consist of a
- * metapage and a single leaf root page) must have some number of pivot
- * tuples, since pivot tuples are used for traversing the tree.
+ * tuples (non-pivot tuples).
  *
- * We store the number of attributes present inside pivot tuples by abusing
- * their item pointer offset field, since pivot tuples never need to store a
- * real offset (downlinks only need to store a block number).  The offset
- * field only stores the number of attributes when the INDEX_ALT_TID_MASK
- * bit is set (we never assume that pivot tuples must explicitly store the
- * number of attributes, and currently do not bother storing the number of
- * attributes unless indnkeyatts actually differs from indnatts).
- * INDEX_ALT_TID_MASK is only used for pivot tuples at present, though it's
- * possible that it will be used within non-pivot tuples in the future.  Do
- * not assume that a tuple with INDEX_ALT_TID_MASK set must be a pivot
- * tuple.
+ * Non-pivot tuple format:
  *
- * The 12 least significant offset bits are used to represent the number of
- * attributes in INDEX_ALT_TID_MASK tuples, leaving 4 bits that are reserved
- * for future use (BT_RESERVED_OFFSET_MASK bits). BT_N_KEYS_OFFSET_MASK should
- * be large enough to store any number <= INDEX_MAX_KEYS.
+ *  t_tid | t_info | key values | INCLUDE columns, if any
+ *
+ * t_tid points to the heap TID, which is a tie-breaker key column as of
+ * BTREE_VERSION 4.  Currently, the INDEX_ALT_TID_MASK status bit is never
+ * set.
+ *
+ * All other types of index tuples (collectively, "pivot" tuples) only
+ * have key columns, since pivot tuples only ever need to represent how
+ * the key space is separated.  In general, any B-Tree index that has more
+ * than one level (i.e. any index that does not just consist of a metapage
+ * and a single leaf root page) must have some number of pivot tuples,
+ * since pivot tuples are used for traversing the tree.  Suffix truncation
+ * can omit trailing key columns when a new pivot is formed, which makes
+ * minus infinity their logical value.  Since BTREE_VERSION 4 indexes
+ * treat heap TID as a trailing key column that ensures that all index
+ * tuples are unique, it is necessary to represent heap TID as a trailing
+ * key column in pivot tuples, though very often this can be truncated
+ * away, just like any other key column. (Actually, the heap TID is
+ * omitted rather than truncated, since its representation is different to
+ * the non-pivot representation.)
+ *
+ * Pivot tuple format:
+ *
+ *  t_tid | t_info | key values | [heap TID]
+ *
+ * We store the number of columns present inside pivot tuples by abusing
+ * their t_tid offset field, since pivot tuples never need to store a real
+ * offset (downlinks only need to store a block number in t_tid).  The
+ * offset field only stores the number of columns/attributes when the
+ * INDEX_ALT_TID_MASK bit is set, which doesn't count the trailing heap
+ * TID column sometimes stored in pivot tuples -- that's represented by
+ * the presence of BT_HEAP_TID_ATTR.  The INDEX_ALT_TID_MASK bit in t_info
+ * is always set on BTREE_VERSION 4.  BT_HEAP_TID_ATTR can only be set on
+ * BTREE_VERSION 4.
+ *
+ * In version 3 indexes, the INDEX_ALT_TID_MASK flag might not be set.  In
+ * that case, the number of key columns is implicitly the same as the number
+ * of key columns in the index.  It is never set on version 2 indexes,
+ * which predate the introduction of INCLUDE indexes. (INCLUDE indexes are
+ * the only indexes that use INDEX_ALT_TID_MASK on version 3.)
+ *
+ * The 12 least significant offset bits from t_tid are used to represent
+ * the number of columns in INDEX_ALT_TID_MASK tuples, leaving 4 status
+ * bits (BT_RESERVED_OFFSET_MASK bits), 3 of which are reserved for
+ * future use.  BT_N_KEYS_OFFSET_MASK should be large enough to store any
+ * number of columns/attributes <= INDEX_MAX_KEYS.
  */
 #define INDEX_ALT_TID_MASK			INDEX_AM_RESERVED_BIT
+
+/* Item pointer offset bits */
 #define BT_RESERVED_OFFSET_MASK		0xF000
 #define BT_N_KEYS_OFFSET_MASK		0x0FFF
+#define BT_HEAP_TID_ATTR			0x1000
 
 /* Get/set downlink block number */
 #define BTreeInnerTupleGetDownLink(itup) \
@@ -241,14 +300,16 @@ typedef struct BTMetaPageData
 	} while(0)
 
 /*
- * Get/set number of attributes within B-tree index tuple. Asserts should be
- * removed when BT_RESERVED_OFFSET_MASK bits will be used.
+ * Get/set number of attributes within B-tree index tuple.
+ *
+ * Note that this does not include an implicit tie-breaker heap-TID
+ * attribute, if any.  Note also that the number of key attributes must be
+ * explicitly represented in heapkeyspace pivot tuples.
  */
 #define BTreeTupleGetNAtts(itup, rel)	\
 	( \
 		(itup)->t_info & INDEX_ALT_TID_MASK ? \
 		( \
-			AssertMacro((ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_RESERVED_OFFSET_MASK) == 0), \
 			ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_N_KEYS_OFFSET_MASK \
 		) \
 		: \
@@ -257,10 +318,52 @@ typedef struct BTMetaPageData
 #define BTreeTupleSetNAtts(itup, n) \
 	do { \
 		(itup)->t_info |= INDEX_ALT_TID_MASK; \
-		Assert(((n) & BT_RESERVED_OFFSET_MASK) == 0); \
 		ItemPointerSetOffsetNumber(&(itup)->t_tid, (n) & BT_N_KEYS_OFFSET_MASK); \
 	} while(0)
 
+/*
+ * Get tie-breaker heap TID attribute, if any.  Macro works with both pivot
+ * and non-pivot tuples, despite differences in how heap TID is represented.
+ *
+ * Only BTREE_VERSION 4 indexes treat heap TID as a tie-breaker key attribute.
+ * This macro can be used with tuples from indexes that use earlier versions,
+ * even though the result won't be meaningful.  The expectation is that higher
+ * level code will ensure that the result is never used, for example by never
+ * providing a scantid that the result is compared against.
+ *
+ * Assumes that any tuple without INDEX_ALT_TID_MASK set has a t_tid that
+ * points to the heap, and that all pivot tuples have INDEX_ALT_TID_MASK set
+ * (since all pivot tuples must as of BTREE_VERSION 4).  When non-pivot
+ * tuples use the INDEX_ALT_TID_MASK representation in the future, they'll
+ * probably also contain a heap TID at the end of the tuple.  We currently
+ * assume that a tuple with INDEX_ALT_TID_MASK set is a pivot tuple within
+ * heapkeyspace indexes (and that a tuple without it set must be a non-pivot
+ * tuple), but it might also be used by non-pivot tuples in the future.
+ * pg_upgrade'd !heapkeyspace indexes only set INDEX_ALT_TID_MASK in pivot
+ * tuples that actually originated with the truncation of one or more
+ * attributes.
+ */
+#define BTreeTupleGetHeapTID(itup) \
+	( \
+	  (itup)->t_info & INDEX_ALT_TID_MASK && \
+	  (ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_HEAP_TID_ATTR) != 0 ? \
+	  ( \
+		(ItemPointer) (((char *) (itup) + IndexTupleSize(itup)) - \
+					   sizeof(ItemPointerData)) \
+	  ) \
+	  : (itup)->t_info & INDEX_ALT_TID_MASK ? NULL : (ItemPointer) &((itup)->t_tid) \
+	)
+/*
+ * Set the heap TID attribute for a tuple that uses the INDEX_ALT_TID_MASK
+ * representation (currently limited to pivot tuples)
+ */
+#define BTreeTupleSetAltHeapTID(itup) \
+	do { \
+		Assert((itup)->t_info & INDEX_ALT_TID_MASK); \
+		ItemPointerSetOffsetNumber(&(itup)->t_tid, \
+								   ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) | BT_HEAP_TID_ATTR); \
+	} while(0)
+
 /*
  *	Operator strategy numbers for B-tree have been moved to access/stratnum.h,
  *	because many places need to use them in ScanKeyInit() calls.
@@ -326,25 +429,55 @@ typedef BTStackData *BTStack;
  * _bt_search.  For details on its mutable state, see _bt_binsrch and
  * _bt_findinsertloc.
  *
+ * heapkeyspace indicates if we expect all keys in the index to be unique by
+ * treating heap TID as a tie-breaker attribute (i.e. the index is
+ * BTREE_VERSION 4+).  scantid should never be set when index is not a
+ * heapkeyspace index.
+ *
+ * minusinfkey controls an optimization used by heapkeyspace indexes.
+ * Searches that are not specifically interested in keys with the value minus
+ * infinity (all searches bar those performed by VACUUM for page deletion)
+ * apply the optimization by setting the field to false.  The optimization
+ * avoids unnecessarily reading the left sibling of the leaf page that
+ * matching tuples can appear on first.  Work is saved when the insertion
+ * scankey happens to search on all the untruncated "separator" key attributes
+ * for some pivot tuple, without also providing a key value for a remaining
+ * truncated-in-pivot-tuple attribute.  Reasoning about minus infinity values
+ * specifically allows this case to use a special tie-breaker, guiding search
+ * right instead of left on the next level down.  This is particularly likely
+ * to help in the common case where insertion scankey has no scantid but has
+ * values for all other attributes, especially with indexes that happen to
+ * have few distinct values (once heap TID is excluded) on each leaf page.
+ *
  * When nextkey is false (the usual case), _bt_search and _bt_binsrch will
  * locate the first item >= scankey.  When nextkey is true, they will locate
  * the first item > scan key.
  *
- * keysz is the number of insertion scankeys present.
+ * scantid is the heap TID that is used as a final tie-breaker attribute,
+ * which may be set to NULL to indicate its absence.  When inserting new
+ * tuples, it must be set, since every tuple in the tree unambiguously belongs
+ * in one exact position, even when there are entries in the tree that are
+ * considered duplicates by external code.  Unique insertions set scantid only
+ * after unique checking indicates that it's safe to insert.  Despite the
+ * representational difference, scantid is just another insertion scankey to
+ * routines like _bt_search.
  *
- * scankeys is an array of scan key entries for attributes that are compared.
- * During insertion, there must be a scan key for every attribute, but when
- * starting a regular index scan some can be omitted.  The array is used as a
- * flexible array member, though it's sized in a way that makes it possible to
- * use stack allocations.  See nbtree/README for full details.
+ * keysz is the number of insertion scankeys present, not including scantid.
+ *
+ * scankeys is an array of scan key entries for attributes that are compared
+ * before scantid (user-visible attributes).  During insertion, there must be
+ * a scan key for every attribute, but when starting a regular index scan some
+ * can be omitted.  The array is used as a flexible array member, though it's
+ * sized in a way that makes it possible to use stack allocations.  See
+ * nbtree/README for full details.
  */
 
 typedef struct BTScanInsertData
 {
 	/*
 	 * Mutable state used by _bt_binsrch to inexpensively repeat a binary
-	 * search on the leaf level.  Only used for insertions where
-	 * _bt_check_unique is called.
+	 * search on the leaf level when only scantid has changed.  Only used for
+	 * insertions where _bt_check_unique is called.
 	 */
 	bool		savebinsrch;
 	bool		restorebinsrch;
@@ -352,7 +485,10 @@ typedef struct BTScanInsertData
 	OffsetNumber stricthigh;
 
 	/* State used to locate a position at the leaf level */
+	bool		heapkeyspace;
+	bool		minusinfkey;
 	bool		nextkey;
+	ItemPointer scantid;		/* tiebreaker for scankeys */
 	int			keysz;			/* Size of scankeys */
 	ScanKeyData scankeys[INDEX_MAX_KEYS];	/* Must appear last */
 } BTScanInsertData;
@@ -582,6 +718,7 @@ extern void _bt_upgrademetapage(Page page);
 extern Buffer _bt_getroot(Relation rel, int access);
 extern Buffer _bt_gettrueroot(Relation rel);
 extern int	_bt_getrootheight(Relation rel);
+extern bool _bt_heapkeyspace(Relation rel);
 extern void _bt_checkpage(Relation rel, Buffer buf);
 extern Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access);
 extern Buffer _bt_relandgetbuf(Relation rel, Buffer obuf,
@@ -635,8 +772,12 @@ extern bytea *btoptions(Datum reloptions, bool validate);
 extern bool btproperty(Oid index_oid, int attno,
 		   IndexAMProperty prop, const char *propname,
 		   bool *res, bool *isnull);
-extern IndexTuple _bt_nonkey_truncate(Relation rel, IndexTuple itup);
-extern bool _bt_check_natts(Relation rel, Page page, OffsetNumber offnum);
+extern IndexTuple _bt_truncate(Relation rel, IndexTuple lastleft,
+			 IndexTuple firstright, BTScanInsert itup_key);
+extern bool _bt_check_natts(Relation rel, bool heapkeyspace, Page page,
+				OffsetNumber offnum);
+extern void _bt_check_third_page(Relation rel, Relation heap,
+					 bool needheaptidspace, Page page, IndexTuple newtup);
 
 /*
  * prototypes for functions in nbtvalidate.c
diff --git a/src/include/access/nbtxlog.h b/src/include/access/nbtxlog.h
index a605851c98..6320a0098f 100644
--- a/src/include/access/nbtxlog.h
+++ b/src/include/access/nbtxlog.h
@@ -28,8 +28,7 @@
 #define XLOG_BTREE_INSERT_META	0x20	/* same, plus update metapage */
 #define XLOG_BTREE_SPLIT_L		0x30	/* add index tuple with split */
 #define XLOG_BTREE_SPLIT_R		0x40	/* as above, new item on right */
-#define XLOG_BTREE_SPLIT_L_HIGHKEY 0x50 /* as above, include truncated highkey */
-#define XLOG_BTREE_SPLIT_R_HIGHKEY 0x60 /* as above, include truncated highkey */
+/* 0x50 and 0x60 are unused */
 #define XLOG_BTREE_DELETE		0x70	/* delete leaf index tuples for a page */
 #define XLOG_BTREE_UNLINK_PAGE	0x80	/* delete a half-dead page */
 #define XLOG_BTREE_UNLINK_PAGE_META 0x90	/* same, and update metapage */
@@ -47,6 +46,7 @@
  */
 typedef struct xl_btree_metadata
 {
+	uint32		version;
 	BlockNumber root;
 	uint32		level;
 	BlockNumber fastroot;
@@ -80,27 +80,30 @@ typedef struct xl_btree_insert
  * whole page image.  The left page, however, is handled in the normal
  * incremental-update fashion.
  *
- * Note: the four XLOG_BTREE_SPLIT xl_info codes all use this data record.
- * The _L and _R variants indicate whether the inserted tuple went into the
- * left or right split page (and thus, whether newitemoff and the new item
- * are stored or not).  The _HIGHKEY variants indicate that we've logged
- * explicitly left page high key value, otherwise redo should use right page
- * leftmost key as a left page high key.  _HIGHKEY is specified for internal
- * pages where right page leftmost key is suppressed, and for leaf pages
- * of covering indexes where high key have non-key attributes truncated.
+ * Note: XLOG_BTREE_SPLIT_L and XLOG_BTREE_SPLIT_R share this data record.
+ * There are two variants to indicate whether the inserted tuple went into the
+ * left or right split page (and thus, whether newitemoff and the new item are
+ * stored or not).  We always log the left page high key because suffix
+ * truncation can generate a new leaf high key using user-defined code.  This
+ * is also necessary on internal pages, since the first right item that the
+ * left page's high key was based on will have been truncated to zero
+ * attributes in the right page (the original is unavailable from the right
+ * page).
  *
  * Backup Blk 0: original page / new left page
  *
  * The left page's data portion contains the new item, if it's the _L variant.
- * (In the _R variants, the new item is one of the right page's tuples.)
- * If level > 0, an IndexTuple representing the HIKEY of the left page
- * follows.  We don't need this on leaf pages, because it's the same as the
- * leftmost key in the new right page.
+ * An IndexTuple representing the high key of the left page must follow with
+ * either variant.
  *
  * Backup Blk 1: new right page
  *
- * The right page's data portion contains the right page's tuples in the
- * form used by _bt_restore_page.
+ * The right page's data portion contains the right page's tuples in the form
+ * used by _bt_restore_page.  This includes the new item, if it's the _R
+ * variant.  The right page's tuples also include the right page's high key
+ * with either variant (moved from the left/original page during the split),
+ * unless the split happened to be of the rightmost page on its level, where
+ * there is no high key for new right page.
  *
  * Backup Blk 2: next block (orig page's rightlink), if any
  * Backup Blk 3: child's left sibling, if non-leaf split
diff --git a/src/test/regress/expected/btree_index.out b/src/test/regress/expected/btree_index.out
index b21298a2a6..ff443a476c 100644
--- a/src/test/regress/expected/btree_index.out
+++ b/src/test/regress/expected/btree_index.out
@@ -199,28 +199,22 @@ reset enable_seqscan;
 reset enable_indexscan;
 reset enable_bitmapscan;
 --
--- Test B-tree page deletion. In particular, deleting a non-leaf page.
+-- Test B-tree fast path (cache rightmost leaf page) optimization.
 --
--- First create a tree that's at least four levels deep. The text inserted
--- is long and poorly compressible. That way only a few index tuples fit on
--- each page, allowing us to get a tall tree with fewer pages.
+-- First create a tree that's at least three levels deep (i.e. has one level
+-- between the root and leaf levels). The text inserted is long.  It won't be
+-- compressed because we use plain storage in the table.  Only a few index
+-- tuples fit on each internal page, allowing us to get a tall tree with few
+-- pages.  (A tall tree is required to trigger caching.)
+--
+-- The text column must be the leading column in the index, since suffix
+-- truncation would otherwise truncate tuples on internal pages, leaving us
+-- with a short tree.
 create table btree_tall_tbl(id int4, t text);
-create index btree_tall_idx on btree_tall_tbl (id, t) with (fillfactor = 10);
-insert into btree_tall_tbl
-  select g, g::text || '_' ||
-          (select string_agg(md5(i::text), '_') from generate_series(1, 50) i)
-from generate_series(1, 100) g;
--- Delete most entries, and vacuum. This causes page deletions.
-delete from btree_tall_tbl where id < 950;
-vacuum btree_tall_tbl;
---
--- Test B-tree insertion with a metapage update (XLOG_BTREE_INSERT_META
--- WAL record type). This happens when a "fast root" page is split.
---
--- The vacuum above should've turned the leaf page into a fast root. We just
--- need to insert some rows to cause the fast root page to split.
-insert into btree_tall_tbl (id, t)
-  select g, repeat('x', 100) from generate_series(1, 500) g;
+alter table btree_tall_tbl alter COLUMN t set storage plain;
+create index btree_tall_idx on btree_tall_tbl (t, id) with (fillfactor = 10);
+insert into btree_tall_tbl select g, repeat('x', 250)
+from generate_series(1, 130) g;
 --
 -- Test vacuum_cleanup_index_scale_factor
 --
diff --git a/src/test/regress/expected/create_index.out b/src/test/regress/expected/create_index.out
index 5d4eb59a0c..54d3eee197 100644
--- a/src/test/regress/expected/create_index.out
+++ b/src/test/regress/expected/create_index.out
@@ -3225,11 +3225,22 @@ explain (costs off)
 CREATE TABLE delete_test_table (a bigint, b bigint, c bigint, d bigint);
 INSERT INTO delete_test_table SELECT i, 1, 2, 3 FROM generate_series(1,80000) i;
 ALTER TABLE delete_test_table ADD PRIMARY KEY (a,b,c,d);
+-- Delete many entries, and vacuum. This causes page deletions.
 DELETE FROM delete_test_table WHERE a > 40000;
 VACUUM delete_test_table;
-DELETE FROM delete_test_table WHERE a > 10;
+-- Delete most entries, and vacuum, deleting internal pages and creating "fast
+-- root"
+DELETE FROM delete_test_table WHERE a < 79990;
 VACUUM delete_test_table;
 --
+-- Test B-tree insertion with a metapage update (XLOG_BTREE_INSERT_META
+-- WAL record type). This happens when a "fast root" page is split.  This
+-- also creates coverage for nbtree FSM page recycling.
+--
+-- The vacuum above should've turned the leaf page into a fast root. We just
+-- need to insert some rows to cause the fast root page to split.
+INSERT INTO delete_test_table SELECT i, 1, 2, 3 FROM generate_series(1,1000) i;
+--
 -- REINDEX (VERBOSE)
 --
 CREATE TABLE reindex_verbose(id integer primary key);
diff --git a/src/test/regress/expected/dependency.out b/src/test/regress/expected/dependency.out
index 8e50f8ffbb..8d31110b87 100644
--- a/src/test/regress/expected/dependency.out
+++ b/src/test/regress/expected/dependency.out
@@ -128,9 +128,9 @@ FROM pg_type JOIN pg_class c ON typrelid = c.oid WHERE typname = 'deptest_t';
 -- doesn't work: grant still exists
 DROP USER regress_dep_user1;
 ERROR:  role "regress_dep_user1" cannot be dropped because some objects depend on it
-DETAIL:  owner of default privileges on new relations belonging to role regress_dep_user1 in schema deptest
+DETAIL:  privileges for table deptest1
 privileges for database regression
-privileges for table deptest1
+owner of default privileges on new relations belonging to role regress_dep_user1 in schema deptest
 DROP OWNED BY regress_dep_user1;
 DROP USER regress_dep_user1;
 \set VERBOSITY terse
diff --git a/src/test/regress/expected/event_trigger.out b/src/test/regress/expected/event_trigger.out
index 0e32d5c427..ac41419c7b 100644
--- a/src/test/regress/expected/event_trigger.out
+++ b/src/test/regress/expected/event_trigger.out
@@ -187,9 +187,9 @@ ERROR:  event trigger "regress_event_trigger" does not exist
 -- should fail, regress_evt_user owns some objects
 drop role regress_evt_user;
 ERROR:  role "regress_evt_user" cannot be dropped because some objects depend on it
-DETAIL:  owner of event trigger regress_event_trigger3
+DETAIL:  owner of user mapping for regress_evt_user on server useless_server
 owner of default privileges on new relations belonging to role regress_evt_user
-owner of user mapping for regress_evt_user on server useless_server
+owner of event trigger regress_event_trigger3
 -- cleanup before next test
 -- these are all OK; the second one should emit a NOTICE
 drop event trigger if exists regress_event_trigger2;
diff --git a/src/test/regress/expected/foreign_data.out b/src/test/regress/expected/foreign_data.out
index 4d82d3a7e8..9c763ec184 100644
--- a/src/test/regress/expected/foreign_data.out
+++ b/src/test/regress/expected/foreign_data.out
@@ -441,8 +441,8 @@ ALTER SERVER s1 OWNER TO regress_test_indirect;
 RESET ROLE;
 DROP ROLE regress_test_indirect;                            -- ERROR
 ERROR:  role "regress_test_indirect" cannot be dropped because some objects depend on it
-DETAIL:  owner of server s1
-privileges for foreign-data wrapper foo
+DETAIL:  privileges for foreign-data wrapper foo
+owner of server s1
 \des+
                                                                                  List of foreign servers
  Name |           Owner           | Foreign-data wrapper |                   Access privileges                   |  Type  | Version |             FDW options              | Description 
@@ -1998,9 +1998,9 @@ DROP TABLE temp_parted;
 DROP SCHEMA foreign_schema CASCADE;
 DROP ROLE regress_test_role;                                -- ERROR
 ERROR:  role "regress_test_role" cannot be dropped because some objects depend on it
-DETAIL:  privileges for server s4
+DETAIL:  owner of user mapping for regress_test_role on server s6
 privileges for foreign-data wrapper foo
-owner of user mapping for regress_test_role on server s6
+privileges for server s4
 DROP SERVER t1 CASCADE;
 NOTICE:  drop cascades to user mapping for public on server t1
 DROP USER MAPPING FOR regress_test_role SERVER s6;
diff --git a/src/test/regress/expected/rowsecurity.out b/src/test/regress/expected/rowsecurity.out
index 2e170497c9..bad5199d9e 100644
--- a/src/test/regress/expected/rowsecurity.out
+++ b/src/test/regress/expected/rowsecurity.out
@@ -3503,8 +3503,8 @@ SELECT refclassid::regclass, deptype
 SAVEPOINT q;
 DROP ROLE regress_rls_eve; --fails due to dependency on POLICY p
 ERROR:  role "regress_rls_eve" cannot be dropped because some objects depend on it
-DETAIL:  target of policy p on table tbl1
-privileges for table tbl1
+DETAIL:  privileges for table tbl1
+target of policy p on table tbl1
 ROLLBACK TO q;
 ALTER POLICY p ON tbl1 TO regress_rls_frank USING (true);
 SAVEPOINT q;
diff --git a/src/test/regress/sql/btree_index.sql b/src/test/regress/sql/btree_index.sql
index 2b087be796..19fbfa8b72 100644
--- a/src/test/regress/sql/btree_index.sql
+++ b/src/test/regress/sql/btree_index.sql
@@ -84,32 +84,23 @@ reset enable_indexscan;
 reset enable_bitmapscan;
 
 --
--- Test B-tree page deletion. In particular, deleting a non-leaf page.
+-- Test B-tree fast path (cache rightmost leaf page) optimization.
 --
 
--- First create a tree that's at least four levels deep. The text inserted
--- is long and poorly compressible. That way only a few index tuples fit on
--- each page, allowing us to get a tall tree with fewer pages.
+-- First create a tree that's at least three levels deep (i.e. has one level
+-- between the root and leaf levels). The text inserted is long.  It won't be
+-- compressed because we use plain storage in the table.  Only a few index
+-- tuples fit on each internal page, allowing us to get a tall tree with few
+-- pages.  (A tall tree is required to trigger caching.)
+--
+-- The text column must be the leading column in the index, since suffix
+-- truncation would otherwise truncate tuples on internal pages, leaving us
+-- with a short tree.
 create table btree_tall_tbl(id int4, t text);
-create index btree_tall_idx on btree_tall_tbl (id, t) with (fillfactor = 10);
-insert into btree_tall_tbl
-  select g, g::text || '_' ||
-          (select string_agg(md5(i::text), '_') from generate_series(1, 50) i)
-from generate_series(1, 100) g;
-
--- Delete most entries, and vacuum. This causes page deletions.
-delete from btree_tall_tbl where id < 950;
-vacuum btree_tall_tbl;
-
---
--- Test B-tree insertion with a metapage update (XLOG_BTREE_INSERT_META
--- WAL record type). This happens when a "fast root" page is split.
---
-
--- The vacuum above should've turned the leaf page into a fast root. We just
--- need to insert some rows to cause the fast root page to split.
-insert into btree_tall_tbl (id, t)
-  select g, repeat('x', 100) from generate_series(1, 500) g;
+alter table btree_tall_tbl alter COLUMN t set storage plain;
+create index btree_tall_idx on btree_tall_tbl (t, id) with (fillfactor = 10);
+insert into btree_tall_tbl select g, repeat('x', 250)
+from generate_series(1, 130) g;
 
 --
 -- Test vacuum_cleanup_index_scale_factor
diff --git a/src/test/regress/sql/create_index.sql b/src/test/regress/sql/create_index.sql
index 67ecad8dd5..4487421ef3 100644
--- a/src/test/regress/sql/create_index.sql
+++ b/src/test/regress/sql/create_index.sql
@@ -1146,11 +1146,23 @@ explain (costs off)
 CREATE TABLE delete_test_table (a bigint, b bigint, c bigint, d bigint);
 INSERT INTO delete_test_table SELECT i, 1, 2, 3 FROM generate_series(1,80000) i;
 ALTER TABLE delete_test_table ADD PRIMARY KEY (a,b,c,d);
+-- Delete many entries, and vacuum. This causes page deletions.
 DELETE FROM delete_test_table WHERE a > 40000;
 VACUUM delete_test_table;
-DELETE FROM delete_test_table WHERE a > 10;
+-- Delete most entries, and vacuum, deleting internal pages and creating "fast
+-- root"
+DELETE FROM delete_test_table WHERE a < 79990;
 VACUUM delete_test_table;
 
+--
+-- Test B-tree insertion with a metapage update (XLOG_BTREE_INSERT_META
+-- WAL record type). This happens when a "fast root" page is split.  This
+-- also creates coverage for nbtree FSM page recycling.
+--
+-- The vacuum above should've turned the leaf page into a fast root. We just
+-- need to insert some rows to cause the fast root page to split.
+INSERT INTO delete_test_table SELECT i, 1, 2, 3 FROM generate_series(1,1000) i;
+
 --
 -- REINDEX (VERBOSE)
 --
-- 
2.17.1