From 0efdbf168fc324b0173cbc1a2019c4748d5f312a Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Wed, 25 Sep 2019 10:08:53 -0700 Subject: [PATCH v25 2/4] Add deduplication to nbtree --- src/include/access/nbtree.h | 329 +++++++- src/include/access/nbtxlog.h | 71 +- src/include/access/rmgrlist.h | 2 +- src/backend/access/common/reloptions.c | 9 + src/backend/access/index/genam.c | 4 + src/backend/access/nbtree/Makefile | 1 + src/backend/access/nbtree/README | 74 +- src/backend/access/nbtree/nbtdedup.c | 715 ++++++++++++++++++ src/backend/access/nbtree/nbtinsert.c | 343 ++++++++- src/backend/access/nbtree/nbtpage.c | 238 +++++- src/backend/access/nbtree/nbtree.c | 167 +++- src/backend/access/nbtree/nbtsearch.c | 250 +++++- src/backend/access/nbtree/nbtsort.c | 204 ++++- src/backend/access/nbtree/nbtsplitloc.c | 36 +- src/backend/access/nbtree/nbtutils.c | 204 ++++- src/backend/access/nbtree/nbtxlog.c | 236 +++++- src/backend/access/rmgrdesc/nbtdesc.c | 25 +- src/backend/utils/misc/guc.c | 28 + src/backend/utils/misc/postgresql.conf.sample | 1 + src/bin/psql/tab-complete.c | 4 +- contrib/amcheck/verify_nbtree.c | 180 ++++- doc/src/sgml/btree.sgml | 123 ++- doc/src/sgml/charset.sgml | 9 +- doc/src/sgml/config.sgml | 33 + doc/src/sgml/maintenance.sgml | 8 + doc/src/sgml/ref/create_index.sgml | 44 +- doc/src/sgml/ref/reindex.sgml | 5 +- src/test/regress/expected/btree_index.out | 16 + src/test/regress/sql/btree_index.sql | 17 + 29 files changed, 3131 insertions(+), 245 deletions(-) create mode 100644 src/backend/access/nbtree/nbtdedup.c diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 9833cc10bd..1482d5ab1a 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -24,6 +24,17 @@ #include "storage/bufmgr.h" #include "storage/shm_toc.h" +/* deduplication GUC modes */ +typedef enum DeduplicationMode +{ + DEDUP_OFF = 0, /* disabled */ + DEDUP_ON, /* enabled generally */ + DEDUP_NONUNIQUE /* enabled with non-unique indexes only (default) */ +} DeduplicationMode; + +/* GUC parameter */ +extern int btree_deduplication; + /* There's room for a 16-bit vacuum cycle ID in BTPageOpaqueData */ typedef uint16 BTCycleId; @@ -108,6 +119,7 @@ typedef struct BTMetaPageData * pages */ float8 btm_last_cleanup_num_heap_tuples; /* number of heap tuples * during last cleanup */ + bool btm_safededup; /* deduplication known to be safe? */ } BTMetaPageData; #define BTPageGetMeta(p) \ @@ -115,7 +127,8 @@ typedef struct BTMetaPageData /* * The current Btree version is 4. That's what you'll get when you create - * a new index. + * a new index. The btm_safededup field can only be set if this happened + * on Postgres 13, but it's safe to read with version 3 indexes. * * Btree version 3 was used in PostgreSQL v11. It is mostly the same as * version 4, but heap TIDs were not part of the keyspace. Index tuples @@ -132,8 +145,8 @@ typedef struct BTMetaPageData #define BTREE_METAPAGE 0 /* first page is meta */ #define BTREE_MAGIC 0x053162 /* magic number in metapage */ #define BTREE_VERSION 4 /* current version number */ -#define BTREE_MIN_VERSION 2 /* minimal supported version number */ -#define BTREE_NOVAC_VERSION 3 /* minimal version with all meta fields */ +#define BTREE_MIN_VERSION 2 /* minimum supported version */ +#define BTREE_NOVAC_VERSION 3 /* version with all meta fields set */ /* * Maximum size of a btree index entry, including its tuple header. 
@@ -155,6 +168,26 @@ typedef struct BTMetaPageData MAXALIGN_DOWN((PageGetPageSize(page) - \ MAXALIGN(SizeOfPageHeaderData + 3*sizeof(ItemIdData)) - \ MAXALIGN(sizeof(BTPageOpaqueData))) / 3) +/* + * MaxBTreeIndexTuplesPerPage is an upper bound on the number of "logical" + * tuples that may be stored on a btree leaf page. This is comparable to + * the generic/physical MaxIndexTuplesPerPage upper bound. A separate + * upper bound is needed in certain contexts due to posting list tuples, + * which only use a single physical page entry to store many logical + * tuples. (MaxBTreeIndexTuplesPerPage is used to size the per-page + * temporary buffers used by index scans.) + * + * Note: we don't bother considering per-physical-tuple overheads here to + * keep things simple (value is based on how many elements a single array + * of heap TIDs must have to fill the space between the page header and + * special area). The value is slightly higher (i.e. more conservative) + * than necessary as a result, which is considered acceptable. There will + * only be three (very large) physical posting list tuples in leaf pages + * that have the largest possible number of heap TIDs/logical tuples. + */ +#define MaxBTreeIndexTuplesPerPage \ + (int) ((BLCKSZ - SizeOfPageHeaderData - sizeof(BTPageOpaqueData)) / \ + sizeof(ItemPointerData)) /* * The leaf-page fillfactor defaults to 90% but is user-adjustable. @@ -230,16 +263,15 @@ typedef struct BTMetaPageData * tuples (non-pivot tuples). _bt_check_natts() enforces the rules * described here. * - * Non-pivot tuple format: + * Non-pivot tuple format (plain/non-posting variant): * * t_tid | t_info | key values | INCLUDE columns, if any * * t_tid points to the heap TID, which is a tiebreaker key column as of - * BTREE_VERSION 4. Currently, the INDEX_ALT_TID_MASK status bit is never - * set for non-pivot tuples. + * BTREE_VERSION 4. * - * All other types of index tuples ("pivot" tuples) only have key columns, - * since pivot tuples only exist to represent how the key space is + * Non-pivot tuples complement pivot tuples, which only have key columns. + * The sole purpose of pivot tuples is to represent how the key space is * separated. In general, any B-Tree index that has more than one level * (i.e. any index that does not just consist of a metapage and a single * leaf root page) must have some number of pivot tuples, since pivot @@ -283,20 +315,103 @@ typedef struct BTMetaPageData * future use. BT_N_KEYS_OFFSET_MASK should be large enough to store any * number of columns/attributes <= INDEX_MAX_KEYS. * + * Sometimes non-pivot tuples also use a representation that repurposes + * t_tid to store metadata rather than a TID. Postgres 13 introduced a new + * non-pivot tuple format to support deduplication: posting list tuples. + * Deduplication folds together multiple equal non-pivot tuples into a + * logically equivalent, space efficient representation. A posting list is + * an array of ItemPointerData elements. Regular non-pivot tuples are + * merged together to form posting list tuples lazily, at the point where + * we'd otherwise have to split a leaf page. + * + * Posting tuple format (alternative non-pivot tuple representation): + * + * t_tid | t_info | key values | posting list (TID array) + * + * Posting list tuples are recognized as such by having the + * INDEX_ALT_TID_MASK status bit set in t_info and the BT_IS_POSTING status + * bit set in t_tid. 
These flags redefine the content of the posting
+ * tuple's t_tid to store an offset to the posting list, as well as the
+ * total number of posting list array elements.
+ *
+ * The 12 least significant offset bits from t_tid are used to represent
+ * the number of posting items present in the tuple, leaving 4 status
+ * bits (BT_RESERVED_OFFSET_MASK bits), 3 of which are reserved for
+ * future use.  Like any non-pivot tuple, the number of columns stored is
+ * always implicitly the total number in the index (in practice there can
+ * never be non-key columns stored, since deduplication is not supported
+ * with INCLUDE indexes).
+ *
  * Note well: The macros that deal with the number of attributes in tuples
- * assume that a tuple with INDEX_ALT_TID_MASK set must be a pivot tuple,
- * and that a tuple without INDEX_ALT_TID_MASK set must be a non-pivot
- * tuple (or must have the same number of attributes as the index has
- * generally in the case of !heapkeyspace indexes).  They will need to be
- * updated if non-pivot tuples ever get taught to use INDEX_ALT_TID_MASK
- * for something else.
+ * assume that a tuple with INDEX_ALT_TID_MASK set must be a pivot tuple or
+ * non-pivot posting tuple, and that a tuple without INDEX_ALT_TID_MASK set
+ * must be a non-pivot tuple (or must have the same number of attributes as
+ * the index has generally in the case of !heapkeyspace indexes).
  */
 #define INDEX_ALT_TID_MASK	INDEX_AM_RESERVED_BIT

 /* Item pointer offset bits */
 #define BT_RESERVED_OFFSET_MASK	0xF000
 #define BT_N_KEYS_OFFSET_MASK	0x0FFF
+#define BT_N_POSTING_OFFSET_MASK 0x0FFF
 #define BT_HEAP_TID_ATTR	0x1000
+#define BT_IS_POSTING		0x2000
+
+/*
+ * N.B.: BTreeTupleIsPivot() should only be used in code that deals with
+ * heapkeyspace indexes specifically.  BTreeTupleIsPosting() works with all
+ * nbtree indexes, though.
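+ * Note that in a !heapkeyspace index, a pivot tuple may lack
+ * INDEX_ALT_TID_MASK entirely, so BTreeTupleIsPivot() can have false
+ * negatives there (though never false positives).  BT_IS_POSTING is never
+ * set in !heapkeyspace indexes, which is why BTreeTupleIsPosting() remains
+ * reliable.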
+ */ +#define BTreeTupleIsPivot(itup) \ + ( \ + ((itup)->t_info & INDEX_ALT_TID_MASK && \ + ((ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_IS_POSTING) == 0))\ + ) +#define BTreeTupleIsPosting(itup) \ + ( \ + ((itup)->t_info & INDEX_ALT_TID_MASK && \ + ((ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_IS_POSTING) != 0))\ + ) + +#define BTreeTupleClearBtIsPosting(itup) \ + do { \ + ItemPointerSetOffsetNumber(&(itup)->t_tid, \ + ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & ~BT_IS_POSTING); \ + } while(0) + +#define BTreeTupleGetNPosting(itup) \ + ( \ + AssertMacro(BTreeTupleIsPosting(itup)), \ + ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_N_POSTING_OFFSET_MASK \ + ) +#define BTreeTupleSetNPosting(itup, n) \ + do { \ + ItemPointerSetOffsetNumber(&(itup)->t_tid, (n) & BT_N_POSTING_OFFSET_MASK); \ + Assert((itup)->t_info & INDEX_ALT_TID_MASK); \ + Assert(!((ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_IS_POSTING) != 0)); \ + ItemPointerSetOffsetNumber(&(itup)->t_tid, \ + ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) | BT_IS_POSTING); \ + } while(0) + +/* + * If tuple is posting, t_tid.ip_blkid contains offset of the posting list + */ +#define BTreeTupleGetPostingOffset(itup) \ + ( \ + AssertMacro(BTreeTupleIsPosting(itup)), \ + ItemPointerGetBlockNumberNoCheck(&((itup)->t_tid)) \ + ) +#define BTreeSetPostingMeta(itup, nposting, off) \ + do { \ + BTreeTupleSetNPosting(itup, nposting); \ + Assert(BTreeTupleIsPosting(itup)); \ + ItemPointerSetBlockNumber(&((itup)->t_tid), (off)); \ + } while(0) + +#define BTreeTupleGetPosting(itup) \ + (ItemPointer) ((char*) (itup) + BTreeTupleGetPostingOffset(itup)) +#define BTreeTupleGetPostingN(itup,n) \ + (BTreeTupleGetPosting(itup) + (n)) /* Get/set downlink block number */ #define BTreeInnerTupleGetDownLink(itup) \ @@ -327,40 +442,69 @@ typedef struct BTMetaPageData */ #define BTreeTupleGetNAtts(itup, rel) \ ( \ - (itup)->t_info & INDEX_ALT_TID_MASK ? \ + (BTreeTupleIsPivot(itup)) ? \ ( \ ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_N_KEYS_OFFSET_MASK \ ) \ : \ IndexRelationGetNumberOfAttributes(rel) \ ) -#define BTreeTupleSetNAtts(itup, n) \ - do { \ - (itup)->t_info |= INDEX_ALT_TID_MASK; \ - ItemPointerSetOffsetNumber(&(itup)->t_tid, (n) & BT_N_KEYS_OFFSET_MASK); \ - } while(0) + +static inline void +BTreeTupleSetNAtts(IndexTuple itup, int n) +{ + Assert(!BTreeTupleIsPosting(itup)); + itup->t_info |= INDEX_ALT_TID_MASK; + ItemPointerSetOffsetNumber(&itup->t_tid, n & BT_N_KEYS_OFFSET_MASK); +} /* - * Get tiebreaker heap TID attribute, if any. Macro works with both pivot - * and non-pivot tuples, despite differences in how heap TID is represented. + * Get tiebreaker heap TID attribute, if any. + * + * This returns the first/lowest heap TID in the case of a posting list tuple. */ -#define BTreeTupleGetHeapTID(itup) \ - ( \ - (itup)->t_info & INDEX_ALT_TID_MASK && \ - (ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_HEAP_TID_ATTR) != 0 ? \ - ( \ - (ItemPointer) (((char *) (itup) + IndexTupleSize(itup)) - \ - sizeof(ItemPointerData)) \ - ) \ - : (itup)->t_info & INDEX_ALT_TID_MASK ? NULL : (ItemPointer) &((itup)->t_tid) \ - ) +static inline ItemPointer +BTreeTupleGetHeapTID(IndexTuple itup) +{ + if (BTreeTupleIsPivot(itup)) + { + /* Pivot tuple heap TID representation? 
*/ + if ((ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) & + BT_HEAP_TID_ATTR) != 0) + return (ItemPointer) ((char *) itup + IndexTupleSize(itup) - + sizeof(ItemPointerData)); + + /* Heap TID attribute was truncated */ + return NULL; + } + else if (BTreeTupleIsPosting(itup)) + return BTreeTupleGetPosting(itup); + + return &(itup->t_tid); +} + /* - * Set the heap TID attribute for a tuple that uses the INDEX_ALT_TID_MASK - * representation (currently limited to pivot tuples) + * Get maximum heap TID attribute, which could be the only TID in the case of + * a non-pivot tuple that does not have a posting list tuple. Works with + * non-pivot tuples only. + */ +static inline ItemPointer +BTreeTupleGetMaxHeapTID(IndexTuple itup) +{ + Assert(!BTreeTupleIsPivot(itup)); + + if (BTreeTupleIsPosting(itup)) + return BTreeTupleGetPosting(itup) + (BTreeTupleGetNPosting(itup) - 1); + + return &(itup->t_tid); +} + +/* + * Set the heap TID attribute for a pivot tuple */ #define BTreeTupleSetAltHeapTID(itup) \ do { \ - Assert((itup)->t_info & INDEX_ALT_TID_MASK); \ + Assert(BTreeTupleIsPivot(itup)); \ ItemPointerSetOffsetNumber(&(itup)->t_tid, \ ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) | BT_HEAP_TID_ATTR); \ } while(0) @@ -435,6 +579,11 @@ typedef BTStackData *BTStack; * indexes whose version is >= version 4. It's convenient to keep this close * by, rather than accessing the metapage repeatedly. * + * safededup is set to indicate that index may use dynamic deduplication + * safely (index storage parameter separately indicates if deduplication is + * currently in use). This is also a property of the index relation rather + * than an indexscan that is kept around for convenience. + * * anynullkeys indicates if any of the keys had NULL value when scankey was * built from index tuple (note that already-truncated tuple key attributes * set NULL as a placeholder key value, which also affects value of @@ -470,6 +619,7 @@ typedef BTStackData *BTStack; typedef struct BTScanInsertData { bool heapkeyspace; + bool safededup; bool anynullkeys; bool nextkey; bool pivotsearch; @@ -508,10 +658,70 @@ typedef struct BTInsertStateData bool bounds_valid; OffsetNumber low; OffsetNumber stricthigh; + + /* + * if _bt_binsrch_insert found the location inside existing posting list, + * save the position inside the list. This will be -1 in rare cases where + * the overlapping posting list is LP_DEAD. + */ + int postingoff; } BTInsertStateData; typedef BTInsertStateData *BTInsertState; +/* + * State used to representing a pending posting list during deduplication. + * + * Each entry represents a group of consecutive items from the page, starting + * from page offset number 'baseoff', which is the offset number of the "base" + * tuple on the page undergoing deduplication. 'nitems' is the total number + * of items from the page that will be merged to make a new posting tuple. + * + * Note: 'nitems' means the number of physical index tuples/line pointers on + * the page, starting with and including the item at offset number 'baseoff' + * (so nitems should be at least 2 when interval is used). These existing + * tuples may be posting list tuples or regular tuples. 
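+ *
+ * For example (with purely hypothetical offsets): baseoff = 10 and
+ * nitems = 3 describe merging the items at offsets 10, 11 and 12 into a
+ * single posting list tuple that replaces them at offset 10.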
+ */ +typedef struct BTDedupInterval +{ + OffsetNumber baseoff; + uint16 nitems; +} BTDedupInterval; + +/* + * Btree-private state used to deduplicate items on a leaf page + */ +typedef struct BTDedupStateData +{ + Relation rel; + /* Deduplication status info for entire page/operation */ + Size maxitemsize; /* Limit on size of final tuple */ + IndexTuple newitem; + bool checkingunique; /* Use unique index strategy? */ + OffsetNumber skippedbase; /* First offset skipped by checkingunique */ + + /* Metadata about current pending posting list */ + ItemPointer htids; /* Heap TIDs in pending posting list */ + int nhtids; /* # heap TIDs in nhtids array */ + int nitems; /* See BTDedupInterval definition */ + Size alltupsize; /* Includes line pointer overhead */ + bool overlap; /* Avoid overlapping posting lists? */ + + /* Metadata about base tuple of current pending posting list */ + IndexTuple base; /* Use to form new posting list */ + OffsetNumber baseoff; /* page offset of base */ + Size basetupsize; /* base size without posting list */ + + /* + * Pending posting list. Contains information about a group of + * consecutive items that will be deduplicated by creating a new posting + * list tuple. + */ + BTDedupInterval interval; +} BTDedupStateData; + +typedef BTDedupStateData *BTDedupState; + /* * BTScanOpaqueData is the btree-private state needed for an indexscan. * This consists of preprocessed scan keys (see _bt_preprocess_keys() for @@ -535,7 +745,10 @@ typedef BTInsertStateData *BTInsertState; * If we are doing an index-only scan, we save the entire IndexTuple for each * matched item, otherwise only its heap TID and offset. The IndexTuples go * into a separate workspace array; each BTScanPosItem stores its tuple's - * offset within that array. + * offset within that array. Posting list tuples store a "base" tuple once, + * allowing the same key to be returned for each logical tuple associated + * with the physical posting list tuple (i.e. for each TID from the posting + * list). */ typedef struct BTScanPosItem /* what we remember about each match */ @@ -568,6 +781,12 @@ typedef struct BTScanPosData */ int nextTupleOffset; + /* + * Posting list tuples use postingTupleOffset to store the current + * location of the tuple that is returned multiple times. + */ + int postingTupleOffset; + /* * The items array is always ordered in index order (ie, increasing * indexoffset). When scanning backwards it is convenient to fill the @@ -579,7 +798,7 @@ typedef struct BTScanPosData int lastItem; /* last valid index in items[] */ int itemIndex; /* current index in items[] */ - BTScanPosItem items[MaxIndexTuplesPerPage]; /* MUST BE LAST */ + BTScanPosItem items[MaxBTreeIndexTuplesPerPage]; /* MUST BE LAST */ } BTScanPosData; typedef BTScanPosData *BTScanPos; @@ -687,6 +906,7 @@ typedef struct BTOptions int fillfactor; /* page fill factor in percent (0..100) */ /* fraction of newly inserted tuples prior to trigger index cleanup */ float8 vacuum_cleanup_index_scale_factor; + bool deduplication; /* Use deduplication where safe? */ } BTOptions; #define BTGetFillFactor(relation) \ @@ -695,8 +915,18 @@ typedef struct BTOptions (relation)->rd_options ? \ ((BTOptions *) (relation)->rd_options)->fillfactor : \ BTREE_DEFAULT_FILLFACTOR) +#define BTGetUseDedup(relation) \ + (AssertMacro(relation->rd_rel->relkind == RELKIND_INDEX && \ + relation->rd_rel->relam == BTREE_AM_OID), \ + ((relation)->rd_options ? 
\ + ((BTOptions *) (relation)->rd_options)->deduplication : \ + BTGetUseDedupGUC(relation))) #define BTGetTargetPageFreeSpace(relation) \ (BLCKSZ * (100 - BTGetFillFactor(relation)) / 100) +#define BTGetUseDedupGUC(relation) \ + (relation->rd_index->indisunique ? \ + btree_deduplication == DEDUP_ON : \ + btree_deduplication != DEDUP_OFF) /* * Constant definition for progress reporting. Phase numbers must match @@ -743,6 +973,22 @@ extern void _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page); extern void _bt_parallel_done(IndexScanDesc scan); extern void _bt_parallel_advance_array_keys(IndexScanDesc scan); +/* + * prototypes for functions in nbtdedup.c + */ +extern void _bt_dedup_one_page(Relation rel, Buffer buffer, Relation heapRel, + IndexTuple newitem, Size newitemsz, + bool checkingunique); +extern void _bt_dedup_start_pending(BTDedupState state, IndexTuple base, + OffsetNumber base_off); +extern bool _bt_dedup_save_htid(BTDedupState state, IndexTuple itup); +extern Size _bt_dedup_finish_pending(Buffer buffer, BTDedupState state, + bool need_wal); +extern IndexTuple _bt_form_posting(IndexTuple tuple, ItemPointer htids, + int nhtids); +extern IndexTuple _bt_swap_posting(IndexTuple newitem, IndexTuple oposting, + int postingoff); + /* * prototypes for functions in nbtinsert.c */ @@ -761,7 +1007,8 @@ extern OffsetNumber _bt_findsplitloc(Relation rel, Page page, /* * prototypes for functions in nbtpage.c */ -extern void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level); +extern void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level, + bool safededup); extern void _bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact, float8 numHeapTuples); extern void _bt_upgrademetapage(Page page); @@ -769,6 +1016,7 @@ extern Buffer _bt_getroot(Relation rel, int access); extern Buffer _bt_gettrueroot(Relation rel); extern int _bt_getrootheight(Relation rel); extern bool _bt_heapkeyspace(Relation rel); +extern bool _bt_safededup(Relation rel); extern void _bt_checkpage(Relation rel, Buffer buf); extern Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access); extern Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, @@ -779,7 +1027,9 @@ extern bool _bt_page_recyclable(Page page); extern void _bt_delitems_delete(Relation rel, Buffer buf, OffsetNumber *itemnos, int nitems, Relation heapRel); extern void _bt_delitems_vacuum(Relation rel, Buffer buf, - OffsetNumber *deletable, int ndeletable); + OffsetNumber *deletable, int ndeletable, + OffsetNumber *updateitemnos, + IndexTuple *updated, int nupdateable); extern int _bt_pagedel(Relation rel, Buffer buf); /* @@ -829,6 +1079,7 @@ extern bool _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum); extern void _bt_check_third_page(Relation rel, Relation heap, bool needheaptidspace, Page page, IndexTuple newtup); +extern bool _bt_opclasses_support_dedup(Relation index); /* * prototypes for functions in nbtvalidate.c diff --git a/src/include/access/nbtxlog.h b/src/include/access/nbtxlog.h index 71435a13b3..d387905cc0 100644 --- a/src/include/access/nbtxlog.h +++ b/src/include/access/nbtxlog.h @@ -28,7 +28,8 @@ #define XLOG_BTREE_INSERT_META 0x20 /* same, plus update metapage */ #define XLOG_BTREE_SPLIT_L 0x30 /* add index tuple with split */ #define XLOG_BTREE_SPLIT_R 0x40 /* as above, new item on right */ -/* 0x50 and 0x60 are unused */ +#define XLOG_BTREE_DEDUP_PAGE 0x50 /* deduplicate tuples on leaf page */ +#define XLOG_BTREE_INSERT_POST 0x60 /* add index tuple with 
posting split */
 #define XLOG_BTREE_DELETE	0x70	/* delete leaf index tuples for a page */
 #define XLOG_BTREE_UNLINK_PAGE	0x80	/* delete a half-dead page */
 #define XLOG_BTREE_UNLINK_PAGE_META 0x90	/* same, and update metapage */
@@ -53,21 +54,32 @@ typedef struct xl_btree_metadata
 	uint32		fastlevel;
 	TransactionId oldest_btpo_xact;
 	float8		last_cleanup_num_heap_tuples;
+	bool		btm_safededup;
 } xl_btree_metadata;

 /*
  * This is what we need to know about simple (without split) insert.
  *
- * This data record is used for INSERT_LEAF, INSERT_UPPER, INSERT_META.
- * Note that INSERT_META implies it's not a leaf page.
+ * This data record is used for INSERT_LEAF, INSERT_UPPER, INSERT_META, and
+ * INSERT_POST.  Note that INSERT_META and INSERT_UPPER imply it's not a
+ * leaf page, while INSERT_POST and INSERT_LEAF imply that it is.
  *
- * Backup Blk 0: original page (data contains the inserted tuple)
+ * Backup Blk 0: original page
  * Backup Blk 1: child's left sibling, if INSERT_UPPER or INSERT_META
  * Backup Blk 2: xl_btree_metadata, if INSERT_META
+ *
+ * Note: The new tuple is actually the "original" new item in the posting
+ * list split insert case (i.e. the INSERT_POST case).  A split offset for
+ * the posting list is logged before the original new item.  Recovery needs
+ * both, since it must do an in-place update of the existing posting list
+ * that was split as an extra step.  Also, recovery generates a "final"
+ * newitem.  See _bt_swap_posting().
  */
 typedef struct xl_btree_insert
 {
 	OffsetNumber offnum;
+
+	/* posting split offset (INSERT_POST only) */
+	/* new tuple that was inserted (or orignewitem in INSERT_POST case) */
 } xl_btree_insert;

 #define SizeOfBtreeInsert	(offsetof(xl_btree_insert, offnum) + sizeof(OffsetNumber))
@@ -91,9 +103,18 @@ typedef struct xl_btree_insert
  *
  * Backup Blk 0: original page / new left page
  *
- * The left page's data portion contains the new item, if it's the _L variant.
- * An IndexTuple representing the high key of the left page must follow with
- * either variant.
+ * The left page's data portion contains the new item, if it's the _L variant
+ * (though _R variant page split records with a posting list split sometimes
+ * need to include newitem).  An IndexTuple representing the high key of the
+ * left page must follow in all cases.
+ *
+ * The newitem is actually an "original" newitem when a posting list split
+ * occurs that requires that the original posting list be updated in passing.
+ * Recovery recognizes this case when postingoff is set.  This corresponds to
+ * the xl_btree_insert INSERT_POST case.  Note that postingoff will be set to
+ * zero (no posting split) when a posting list split occurs where both the
+ * original posting list and newitem go on the right page, since recovery
+ * doesn't need to consider the posting list split at all.
  *
  * Backup Blk 1: new right page
  *
@@ -111,10 +132,26 @@ typedef struct xl_btree_split
 {
 	uint32		level;			/* tree level of page being split */
 	OffsetNumber firstright;	/* first item moved to right page */
-	OffsetNumber newitemoff;	/* new item's offset (useful for _L variant) */
+	OffsetNumber newitemoff;	/* new item's offset */
+	uint16		postingoff;		/* offset inside orig posting tuple */
 } xl_btree_split;

-#define SizeOfBtreeSplit	(offsetof(xl_btree_split, newitemoff) + sizeof(OffsetNumber))
+#define SizeOfBtreeSplit	(offsetof(xl_btree_split, postingoff) + sizeof(uint16))
+
+/*
+ * When a page is deduplicated, consecutive groups of tuples with equal keys
+ * are merged together into posting list tuples.
+ *
+ * The WAL record represents the interval that describes the posting tuple
+ * that should be added to the page.
+ */
+typedef struct xl_btree_dedup
+{
+	OffsetNumber baseoff;
+	uint16		nitems;
+} xl_btree_dedup;
+
+#define SizeOfBtreeDedup 	(offsetof(xl_btree_dedup, nitems) + sizeof(uint16))

 /*
  * This is what we need to know about delete of individual leaf index tuples.
@@ -148,19 +185,25 @@ typedef struct xl_btree_reuse_page
 /*
  * This is what we need to know about vacuum of individual leaf index tuples.
  * The WAL record can represent deletion of any number of index tuples on a
- * single index page when executed by VACUUM.
+ * single index page when executed by VACUUM.  It can also support "updates"
+ * of index tuples, which are actually deletions of "logical" tuples contained
+ * in an existing posting list tuple that will still have some remaining
+ * logical tuples once VACUUM finishes.
  *
  * Note that the WAL record in any vacuum of an index must have at least one
- * item to delete.
+ * item to delete or update.
  */
 typedef struct xl_btree_vacuum
 {
-	uint32		ndeleted;
+	uint16		ndeleted;
+	uint16		nupdated;

 	/* DELETED TARGET OFFSET NUMBERS FOLLOW */
+	/* UPDATED TARGET OFFSET NUMBERS FOLLOW */
+	/* UPDATED TUPLES TO ADD BACK FOLLOW */
 } xl_btree_vacuum;

-#define SizeOfBtreeVacuum	(offsetof(xl_btree_vacuum, ndeleted) + sizeof(uint32))
+#define SizeOfBtreeVacuum	(offsetof(xl_btree_vacuum, nupdated) + sizeof(uint16))

 /*
  * This is what we need to know about marking an empty branch for deletion.
@@ -241,6 +284,8 @@ typedef struct xl_btree_newroot
 extern void btree_redo(XLogReaderState *record);
 extern void btree_desc(StringInfo buf, XLogReaderState *record);
 extern const char *btree_identify(uint8 info);
+extern void btree_xlog_startup(void);
+extern void btree_xlog_cleanup(void);
 extern void btree_mask(char *pagedata, BlockNumber blkno);

 #endif							/* NBTXLOG_H */
diff --git a/src/include/access/rmgrlist.h b/src/include/access/rmgrlist.h
index 3c0db2ccf5..2b8c6c7fc8 100644
--- a/src/include/access/rmgrlist.h
+++ b/src/include/access/rmgrlist.h
@@ -36,7 +36,7 @@ PG_RMGR(RM_RELMAP_ID, "RelMap", relmap_redo, relmap_desc, relmap_identify, NULL,
 PG_RMGR(RM_STANDBY_ID, "Standby", standby_redo, standby_desc, standby_identify, NULL, NULL, NULL)
 PG_RMGR(RM_HEAP2_ID, "Heap2", heap2_redo, heap2_desc, heap2_identify, NULL, NULL, heap_mask)
 PG_RMGR(RM_HEAP_ID, "Heap", heap_redo, heap_desc, heap_identify, NULL, NULL, heap_mask)
-PG_RMGR(RM_BTREE_ID, "Btree", btree_redo, btree_desc, btree_identify, NULL, NULL, btree_mask)
+PG_RMGR(RM_BTREE_ID, "Btree", btree_redo, btree_desc, btree_identify, btree_xlog_startup, btree_xlog_cleanup, btree_mask)
 PG_RMGR(RM_HASH_ID, "Hash", hash_redo, hash_desc, hash_identify, NULL, NULL, hash_mask)
 PG_RMGR(RM_GIN_ID, "Gin", gin_redo, gin_desc, gin_identify, gin_xlog_startup, gin_xlog_cleanup, gin_mask)
 PG_RMGR(RM_GIST_ID, "Gist", gist_redo, gist_desc, gist_identify, gist_xlog_startup, gist_xlog_cleanup, gist_mask)
diff --git a/src/backend/access/common/reloptions.c b/src/backend/access/common/reloptions.c
index 48377ace24..2b37afd9e5 100644
--- a/src/backend/access/common/reloptions.c
+++ b/src/backend/access/common/reloptions.c
@@ -158,6 +158,15 @@ static relopt_bool boolRelOpts[] =
 		},
 		true
 	},
+	{
+		{
+			"deduplication",
+			"Enables deduplication on btree index leaf pages",
+			RELOPT_KIND_BTREE,
+			ShareUpdateExclusiveLock
+		},
+		true
+	},
 	/* list terminator */
 	{{NULL}}
 };
diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c
index 2599b5d342..6e1dc596e1
100644 --- a/src/backend/access/index/genam.c +++ b/src/backend/access/index/genam.c @@ -276,6 +276,10 @@ BuildIndexValueDescription(Relation indexRelation, /* * Get the latestRemovedXid from the table entries pointed at by the index * tuples being deleted. + * + * Note: index access methods that don't consistently use the standard + * IndexTuple + heap TID item pointer representation will need to provide + * their own version of this function. */ TransactionId index_compute_xid_horizon_for_tuples(Relation irel, diff --git a/src/backend/access/nbtree/Makefile b/src/backend/access/nbtree/Makefile index bf245f5dab..d69808e78c 100644 --- a/src/backend/access/nbtree/Makefile +++ b/src/backend/access/nbtree/Makefile @@ -14,6 +14,7 @@ include $(top_builddir)/src/Makefile.global OBJS = \ nbtcompare.o \ + nbtdedup.o \ nbtinsert.o \ nbtpage.o \ nbtree.o \ diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README index 6db203e75c..54cb9db49d 100644 --- a/src/backend/access/nbtree/README +++ b/src/backend/access/nbtree/README @@ -432,7 +432,10 @@ because we allow LP_DEAD to be set with only a share lock (it's exactly like a hint bit for a heap tuple), but physically removing tuples requires exclusive lock. In the current code we try to remove LP_DEAD tuples when we are otherwise faced with having to split a page to do an insertion (and -hence have exclusive lock on it already). +hence have exclusive lock on it already). Deduplication can also prevent +a page split, but removing LP_DEAD tuples is the preferred approach. +(Note that posting list tuples can only have their LP_DEAD bit set when +every "logical" tuple represented within the posting list is known dead.) This leaves the index in a state where it has no entry for a dead tuple that still exists in the heap. This is not a problem for the current @@ -710,6 +713,75 @@ the fallback strategy assumes that duplicates are mostly inserted in ascending heap TID order. The page is split in a way that leaves the left half of the page mostly full, and the right half of the page mostly empty. +Notes about deduplication +------------------------- + +We deduplicate non-pivot tuples in non-unique indexes to reduce storage +overhead, and to avoid or at least delay page splits. Deduplication alters +the physical representation of tuples without changing the logical contents +of the index, and without adding overhead to read queries. Non-pivot +tuples are folded together into a single physical tuple with a posting list +(a simple array of heap TIDs with the standard item pointer format). +Deduplication is always applied lazily, at the point where it would +otherwise be necessary to perform a page split. It occurs only when +LP_DEAD items have been removed, as our last line of defense against +splitting a leaf page. We can set the LP_DEAD bit with posting list +tuples, though only when all table tuples are known dead. (Bitmap scans +cannot perform LP_DEAD bit setting, and are the common case with indexes +that contain lots of duplicates, so this downside is considered +acceptable.) + +Large groups of logical duplicates tend to appear together on the same leaf +page due to the special duplicate logic used when choosing a split point. +This facilitates lazy/dynamic deduplication. Deduplication can reliably +deduplicate a large localized group of duplicates before it can span +multiple leaf pages. Posting list tuples are subject to the same 1/3 of a +page restriction as any other tuple. 
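+
+To get a rough feel for the space savings, consider a sketch (this is not
+code from the patch; sizes assume a single int4 key column, 8-byte
+MAXALIGN, and 6-byte heap TIDs):
+
+    int		n = 100;		/* number of equal index entries */
+    Size	plain;
+    Size	posting;
+
+    /* n keys stored as separate tuples: each costs a MAXALIGN'd tuple
+     * (8 byte header + 4 byte key) plus a line pointer */
+    plain = n * (MAXALIGN(8 + 4) + sizeof(ItemIdData));
+    /* the same keys as one posting tuple: the key is stored once,
+     * followed by a sorted array of n heap TIDs */
+    posting = MAXALIGN(SHORTALIGN(8 + 4) + n * sizeof(ItemPointerData)) +
+        sizeof(ItemIdData);
+
+With n = 100, plain works out to 2000 bytes, while posting is roughly 620
+bytes.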
+ +Lazy deduplication allows the page space accounting used during page splits +to have absolutely minimal special case logic for posting lists. A posting +list can be thought of as extra payload that suffix truncation will +reliably truncate away as needed during page splits, just like non-key +columns from an INCLUDE index tuple. An incoming tuple (which might cause +a page split) can always be thought of as a non-posting-list tuple that +must be inserted alongside existing items, without needing to consider +deduplication. Most of the time, that's what actually happens: incoming +tuples are either not duplicates, or are duplicates with a heap TID that +doesn't overlap with any existing posting list tuple. When the incoming +tuple really does overlap with an existing posting list, a posting list +split is performed. Posting list splits work in a way that more or less +preserves the illusion that all incoming tuples do not need to be merged +with any existing posting list tuple. + +Posting list splits work by "overriding" the details of the incoming tuple. +The heap TID of the incoming tuple is altered to make it match the +rightmost heap TID from the existing/originally overlapping posting list. +The offset number that the new/incoming tuple is to be inserted at is +incremented so that it will be inserted to the right of the existing +posting list. The insertion (or page split) operation that completes the +insert does one extra step: an in-place update of the posting list. The +update changes the posting list such that the "true" heap TID from the +original incoming tuple is now contained in the posting list. We make +space in the posting list by removing the heap TID that became the new +item. The size of the posting list won't change, and so the page split +space accounting does not need to care about posting lists. Also, overall +space utilization is improved by keeping existing posting lists large. + +The representation of posting lists is identical to the posting lists used +by GIN, so it would be straightforward to apply GIN's varbyte encoding +compression scheme to individual posting lists. Posting list compression +would break the assumptions made by posting list splits about page space +accounting, though, so it's not clear how compression could be integrated +with nbtree. Besides, posting list compression does not offer a compelling +trade-off for nbtree, since in general nbtree is optimized for consistent +performance with many concurrent readers and writers. A major goal of +nbtree's lazy approach to deduplication is to limit the performance impact +of deduplication with random updates. Even concurrent append-only inserts +of the same key value will tend to have inserts of individual index tuples +in an order that doesn't quite match heap TID order. In general, delaying +deduplication avoids many unnecessary posting list splits, and minimizes +page level fragmentation. + Notes About Data Representation ------------------------------- diff --git a/src/backend/access/nbtree/nbtdedup.c b/src/backend/access/nbtree/nbtdedup.c new file mode 100644 index 0000000000..1dbc32b70a --- /dev/null +++ b/src/backend/access/nbtree/nbtdedup.c @@ -0,0 +1,715 @@ +/*------------------------------------------------------------------------- + * + * nbtdedup.c + * Deduplicate items in Lehman and Yao btrees for Postgres. 
+ * + * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/nbtree/nbtdedup.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/nbtree.h" +#include "access/nbtxlog.h" +#include "miscadmin.h" +#include "utils/rel.h" + + +/* + * Try to deduplicate items to free at least enough space to avoid a page + * split. This function should be called during insertion, only after LP_DEAD + * items were removed by _bt_vacuum_one_page() to prevent a page split. + * (We'll have to kill LP_DEAD items here when the page's BTP_HAS_GARBAGE hint + * was not set, but that should be rare.) + * + * The strategy for !checkingunique callers is to perform as much + * deduplication as possible to free as much space as possible now, since + * making it harder to set LP_DEAD bits is considered an acceptable price for + * not having to deduplicate the same page many times. It is unlikely that + * the items on the page will have their LP_DEAD bit set in the future, since + * that hasn't happened before now (besides, entire posting lists can still + * have their LP_DEAD bit set). + * + * The strategy for checkingunique callers is rather different, since the + * overall goal is different. Deduplication cooperates with and enhances + * garbage collection, especially the LP_DEAD bit setting that takes place in + * _bt_check_unique(). Deduplication does as little as possible while still + * preventing a page split for caller, since it's less likely that posting + * lists will have their LP_DEAD bit set. Deduplication avoids creating new + * posting lists with only two heap TIDs, and also avoids creating new posting + * lists from an existing posting list. Deduplication is only useful when it + * delays a page split long enough for garbage collection to prevent the page + * split altogether. checkingunique deduplication can make all the difference + * in cases where VACUUM keeps up with dead index tuples, but "recently dead" + * index tuples are still numerous enough to cause page splits that are truly + * unnecessary. + * + * Note: If newitem contains NULL values in key attributes, caller will be + * !checkingunique even when rel is a unique index. The page in question will + * usually have many existing items with NULLs. 
+ */
+void
+_bt_dedup_one_page(Relation rel, Buffer buffer, Relation heapRel,
+				   IndexTuple newitem, Size newitemsz, bool checkingunique)
+{
+	OffsetNumber offnum,
+				minoff,
+				maxoff;
+	Page		page = BufferGetPage(buffer);
+	BTPageOpaque oopaque;
+	BTDedupState state = NULL;
+	int			natts = IndexRelationGetNumberOfAttributes(rel);
+	OffsetNumber deletable[MaxIndexTuplesPerPage];
+	bool		minimal = checkingunique;
+	int			ndeletable = 0;
+	Size		pagesaving = 0;
+	int			count = 0;
+	bool		singlevalue = false;
+
+	oopaque = (BTPageOpaque) PageGetSpecialPointer(page);
+	/* init deduplication state needed to build posting tuples */
+	state = (BTDedupState) palloc(sizeof(BTDedupStateData));
+	state->rel = rel;
+
+	state->maxitemsize = BTMaxItemSize(page);
+	state->newitem = newitem;
+	state->checkingunique = checkingunique;
+	state->skippedbase = InvalidOffsetNumber;
+	/* Metadata about current pending posting list */
+	state->htids = NULL;
+	state->nhtids = 0;
+	state->nitems = 0;
+	state->alltupsize = 0;
+	state->overlap = false;
+	/* Metadata about base tuple of current pending posting list */
+	state->base = NULL;
+	state->baseoff = InvalidOffsetNumber;
+	state->basetupsize = 0;
+
+	minoff = P_FIRSTDATAKEY(oopaque);
+	maxoff = PageGetMaxOffsetNumber(page);
+
+	/*
+	 * Delete dead tuples, if any.  We cannot simply skip them in the loop
+	 * below, because it's necessary to generate a special XLog record
+	 * containing such tuples to compute latestRemovedXid on a standby server
+	 * later.
+	 *
+	 * This should not affect performance, since it can only happen in the
+	 * rare case where the BTP_HAS_GARBAGE flag was not set and
+	 * _bt_vacuum_one_page was not called, or _bt_vacuum_one_page didn't
+	 * remove all dead items.
+	 */
+	for (offnum = minoff;
+		 offnum <= maxoff;
+		 offnum = OffsetNumberNext(offnum))
+	{
+		ItemId		itemid = PageGetItemId(page, offnum);
+
+		if (ItemIdIsDead(itemid))
+			deletable[ndeletable++] = offnum;
+	}
+
+	if (ndeletable > 0)
+	{
+		/*
+		 * Skip deduplication in the rare case where removing the LP_DEAD
+		 * items encountered here frees sufficient space for caller to avoid
+		 * a page split
+		 */
+		_bt_delitems_delete(rel, buffer, deletable, ndeletable, heapRel);
+		if (PageGetFreeSpace(page) >= newitemsz)
+		{
+			pfree(state);
+			return;
+		}
+
+		/* Continue with deduplication */
+		minoff = P_FIRSTDATAKEY(oopaque);
+		maxoff = PageGetMaxOffsetNumber(page);
+	}
+
+	/* Make sure that the page won't have the garbage flag set */
+	oopaque->btpo_flags &= ~BTP_HAS_GARBAGE;
+
+	/* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
+	newitemsz += sizeof(ItemIdData);
+	/* Conservatively size array */
+	state->htids = palloc(state->maxitemsize);
+
+	/*
+	 * Determine if a "single value" strategy page split is likely to occur
+	 * shortly after deduplication finishes.  It should be possible for the
+	 * single value split to find a split point that packs the left half of
+	 * the split BTREE_SINGLEVAL_FILLFACTOR% full.
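+	 * (With the default BTREE_SINGLEVAL_FILLFACTOR of 96, that means
+	 * leaving only about 4% of the left half free.)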
+ */
+	if (!checkingunique)
+	{
+		ItemId		itemid;
+		IndexTuple	itup;
+
+		itemid = PageGetItemId(page, minoff);
+		itup = (IndexTuple) PageGetItem(page, itemid);
+
+		if (_bt_keep_natts_fast(rel, newitem, itup) > natts)
+		{
+			itemid = PageGetItemId(page, PageGetMaxOffsetNumber(page));
+			itup = (IndexTuple) PageGetItem(page, itemid);
+
+			/*
+			 * Use a different strategy when a future page split seems likely
+			 * to need the "single value" strategy
+			 */
+			if (_bt_keep_natts_fast(rel, newitem, itup) > natts)
+				singlevalue = true;
+		}
+	}
+
+	/*
+	 * Iterate over tuples on the page, trying to deduplicate them into
+	 * posting lists and updating the page in place.  NOTE: It's essential to
+	 * reassess the max offset on each iteration, since it will change as
+	 * items are deduplicated.
+	 */
+	offnum = minoff;
+retry:
+	while (offnum <= PageGetMaxOffsetNumber(page))
+	{
+		ItemId		itemid = PageGetItemId(page, offnum);
+		IndexTuple	itup = (IndexTuple) PageGetItem(page, itemid);
+
+		Assert(!ItemIdIsDead(itemid));
+
+		if (state->nitems == 0)
+		{
+			/*
+			 * No previous/base tuple for the data item -- use the data item
+			 * as base tuple of pending posting list
+			 */
+			_bt_dedup_start_pending(state, itup, offnum);
+		}
+		else if (_bt_keep_natts_fast(rel, state->base, itup) > natts &&
+				 _bt_dedup_save_htid(state, itup))
+		{
+			/*
+			 * Tuple is equal to base tuple of pending posting list.  Heap
+			 * TID(s) for itup have been saved in state.  The next iteration
+			 * will also end up here if it's possible to merge the next tuple
+			 * into the same pending posting list.
+			 */
+		}
+		else
+		{
+			/*
+			 * Tuple is not equal to pending posting list tuple, or
+			 * _bt_dedup_save_htid() opted to not merge current item into
+			 * pending posting list for some other reason (e.g., adding more
+			 * TIDs would have caused posting list to exceed BTMaxItemSize()
+			 * limit).
+			 *
+			 * If state contains pending posting list with more than one item,
+			 * form new posting tuple, and update the page.  Otherwise, reset
+			 * the state and move on.
+			 */
+			pagesaving += _bt_dedup_finish_pending(buffer, state,
+												   RelationNeedsWAL(rel));
+
+			count++;
+
+			/*
+			 * When caller is checkingunique and we have deduplicated enough
+			 * to avoid a page split, do minimal deduplication in case the
+			 * remaining items are about to be marked dead within
+			 * _bt_check_unique().
+			 */
+			if (minimal && pagesaving >= newitemsz)
+				break;
+
+			/*
+			 * Consider special steps when a future page split of the leaf
+			 * page is likely to occur using nbtsplitloc.c's "single value"
+			 * strategy
+			 */
+			if (singlevalue)
+			{
+				/*
+				 * Adjust maxitemsize so that there isn't a third and final
+				 * 1/3 of a page width tuple that fills the page to capacity.
+				 * The third tuple produced should be smaller than the first
+				 * two by an amount equal to the free space that nbtsplitloc.c
+				 * is likely to want to leave behind when the page is split.
+				 * When there are 3 posting lists on the page, we end
+				 * deduplication.  Remaining tuples on the page can be
+				 * deduplicated later, when they're on the new right sibling
+				 * of this page, and the new sibling page needs to be split in
+				 * turn.
+				 *
+				 * Note that it doesn't matter if there are items on the page
+				 * that were already 1/3 of a page during the current pass;
+				 * they'll still count as the first two posting list tuples.
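+				 *
+				 * As an illustration (assuming the default 8KB block size):
+				 * leftfree starts out at 8192 - 24 - 16 = 8152 bytes below,
+				 * and after subtracting the predicted high key size,
+				 * maxitemsize is reduced by about 4% of what remains
+				 * (roughly 320 bytes with BTREE_SINGLEVAL_FILLFACTOR at 96).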
+ */
+				if (count == 2)
+				{
+					Size		leftfree;
+
+					/* This calculation needs to match nbtsplitloc.c */
+					leftfree = PageGetPageSize(page) - SizeOfPageHeaderData -
+						MAXALIGN(sizeof(BTPageOpaqueData));
+					/* Subtract predicted size of new high key */
+					leftfree -= newitemsz + MAXALIGN(sizeof(ItemPointerData));
+
+					/*
+					 * Reduce maxitemsize by an amount equal to target free
+					 * space on left half of page
+					 */
+					state->maxitemsize -= leftfree *
+						((100 - BTREE_SINGLEVAL_FILLFACTOR) / 100.0);
+				}
+				else if (count == 3)
+					break;
+			}
+
+			/*
+			 * Next iteration starts immediately after base tuple offset (this
+			 * will be the next offset on the page when we didn't modify the
+			 * page)
+			 */
+			offnum = state->baseoff;
+		}
+
+		offnum = OffsetNumberNext(offnum);
+	}
+
+	/* Handle the last item when pending posting list is not empty */
+	if (state->nitems != 0)
+	{
+		pagesaving += _bt_dedup_finish_pending(buffer, state,
+											   RelationNeedsWAL(rel));
+		count++;
+	}
+
+	if (pagesaving < newitemsz && state->skippedbase != InvalidOffsetNumber)
+	{
+		/*
+		 * Didn't free enough space for new item in first checkingunique
+		 * pass.  Try making a second pass over the page, this time starting
+		 * from the first candidate posting list base offset that was skipped
+		 * over in the first pass (only do a second pass when this actually
+		 * happened).
+		 *
+		 * The second pass over the page may deduplicate items that were
+		 * initially passed over due to concerns about limiting the
+		 * effectiveness of LP_DEAD bit setting within _bt_check_unique().
+		 * Note that the second pass will still stop deduplicating as soon as
+		 * enough space has been freed to avoid an immediate page split.
+		 */
+		Assert(state->checkingunique);
+		offnum = state->skippedbase;
+
+		state->checkingunique = false;
+		state->skippedbase = InvalidOffsetNumber;
+		state->alltupsize = 0;
+		state->nitems = 0;
+		state->base = NULL;
+		state->baseoff = InvalidOffsetNumber;
+		state->basetupsize = 0;
+		goto retry;
+	}
+
+	/* Local space accounting should agree with page accounting */
+	Assert(pagesaving < newitemsz || PageGetExactFreeSpace(page) >= newitemsz);
+
+	/* be tidy */
+	pfree(state->htids);
+	pfree(state);
+}
+
+/*
+ * Create a new pending posting list tuple based on caller's tuple.
+ *
+ * Every tuple processed by the deduplication routines either becomes the base
+ * tuple for a posting list, or gets its heap TID(s) accepted into a pending
+ * posting list.  A tuple that starts out as the base tuple for a posting list
+ * will only actually be rewritten within _bt_dedup_finish_pending() when
+ * there was at least one successful call to _bt_dedup_save_htid().
+ */
+void
+_bt_dedup_start_pending(BTDedupState state, IndexTuple base,
+						OffsetNumber baseoff)
+{
+	Assert(state->nhtids == 0);
+	Assert(state->nitems == 0);
+
+	/*
+	 * Copy heap TIDs from new base tuple for new candidate posting list into
+	 * the htids array.  Assume that we'll eventually create a new posting
+	 * tuple by merging later tuples with this existing one, though we may
+	 * not.
+ */ + if (!BTreeTupleIsPosting(base)) + { + memcpy(state->htids, base, sizeof(ItemPointerData)); + state->nhtids = 1; + /* Save size of tuple without any posting list */ + state->basetupsize = IndexTupleSize(base); + } + else + { + int nposting; + + nposting = BTreeTupleGetNPosting(base); + memcpy(state->htids, BTreeTupleGetPosting(base), + sizeof(ItemPointerData) * nposting); + state->nhtids = nposting; + /* Save size of tuple without any posting list */ + state->basetupsize = BTreeTupleGetPostingOffset(base); + } + + /* + * Save new base tuple itself -- it'll be needed if we actually create a + * new posting list from new pending posting list. + * + * Must maintain size of all tuples (including line pointer overhead) to + * calculate space savings on page within _bt_dedup_finish_pending(). + * Also, save number of base tuple logical tuples so that we can save + * cycles in the common case where an existing posting list can't or won't + * be merged with other tuples on the page. + */ + state->nitems = 1; + state->base = base; + state->baseoff = baseoff; + state->alltupsize = MAXALIGN(IndexTupleSize(base)) + sizeof(ItemIdData); + /* Also save baseoff in pending state for interval */ + state->interval.baseoff = state->baseoff; + state->overlap = false; + if (state->newitem) + { + /* Might overlap with new item -- mark it as possible if it is */ + if (BTreeTupleGetHeapTID(base) < BTreeTupleGetHeapTID(state->newitem)) + state->overlap = true; + } +} + +/* + * Save itup heap TID(s) into pending posting list where possible. + * + * Returns bool indicating if the pending posting list managed by state has + * itup's heap TID(s) saved. When this is false, enlarging the pending + * posting list by the required amount would exceed the maxitemsize limit, so + * caller must finish the pending posting list tuple. (Generally itup becomes + * the base tuple of caller's new pending posting list). + */ +bool +_bt_dedup_save_htid(BTDedupState state, IndexTuple itup) +{ + int nhtids; + ItemPointer htids; + Size mergedtupsz; + + if (!BTreeTupleIsPosting(itup)) + { + nhtids = 1; + htids = &itup->t_tid; + } + else + { + nhtids = BTreeTupleGetNPosting(itup); + htids = BTreeTupleGetPosting(itup); + } + + /* + * Don't append (have caller finish pending posting list as-is) if + * appending heap TID(s) from itup would put us over limit + */ + mergedtupsz = MAXALIGN(state->basetupsize + + (state->nhtids + nhtids) * + sizeof(ItemPointerData)); + + if (mergedtupsz > state->maxitemsize) + return false; + + /* Don't merge existing posting lists with checkingunique */ + if (state->checkingunique && + (BTreeTupleIsPosting(state->base) || nhtids > 1)) + { + /* May begin here if second pass over page is required */ + if (state->skippedbase == InvalidOffsetNumber) + state->skippedbase = state->baseoff; + return false; + } + + if (state->overlap) + { + if (BTreeTupleGetMaxHeapTID(itup) > BTreeTupleGetHeapTID(state->newitem)) + { + /* + * newitem has heap TID in the range of the would-be new posting + * list. Avoid an immediate posting list split for caller. 
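+			 * Finishing the pending posting list now (by returning false)
+			 * keeps newitem's heap TID outside the new posting list's TID
+			 * range, so caller can insert newitem as a plain tuple without
+			 * first splitting a posting list.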
+ */
+			if (_bt_keep_natts_fast(state->rel, state->newitem, itup) >
+				IndexRelationGetNumberOfAttributes(state->rel))
+			{
+				state->newitem = NULL;	/* avoid unnecessary comparisons */
+				return false;
+			}
+		}
+	}
+
+	/*
+	 * Save heap TIDs to pending posting list tuple -- itup can be merged into
+	 * pending posting list
+	 */
+	state->nitems++;
+	memcpy(state->htids + state->nhtids, htids,
+		   sizeof(ItemPointerData) * nhtids);
+	state->nhtids += nhtids;
+	state->alltupsize += MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData);
+
+	return true;
+}
+
+/*
+ * Finalize pending posting list tuple, and add it to the page.  Final tuple
+ * is based on saved base tuple, and saved list of heap TIDs.
+ *
+ * Returns space saving from deduplicating to make a new posting list tuple.
+ * Note that this includes line pointer overhead.  This is zero in the case
+ * where no deduplication was possible.
+ */
+Size
+_bt_dedup_finish_pending(Buffer buffer, BTDedupState state, bool need_wal)
+{
+	Size		spacesaving = 0;
+	Page		page = BufferGetPage(buffer);
+	int			minimum = 2;
+
+	Assert(state->nitems > 0);
+	Assert(state->nitems <= state->nhtids);
+	Assert(state->interval.baseoff == state->baseoff);
+
+	/*
+	 * Only create a posting list when at least 3 heap TIDs will appear in the
+	 * checkingunique case (checkingunique strategy won't merge existing
+	 * posting list tuples, so we know that the number of items here must also
+	 * be the total number of heap TIDs).  Creating a new posting list with
+	 * only two heap TIDs won't even save enough space to fit another
+	 * duplicate with the same key as the posting list.  This is a bad
+	 * trade-off if there is a chance that the LP_DEAD bit can be set for
+	 * either existing tuple by putting off deduplication.
+	 *
+	 * (Note that a second pass over the page can deduplicate the item if that
+	 * is truly the only way to avoid a page split for the checkingunique
+	 * caller)
+	 */
+	Assert(!state->checkingunique || state->nitems == 1 ||
+		   state->nhtids == state->nitems);
+	if (state->checkingunique)
+	{
+		minimum = 3;
+		/* May begin here if second pass over page is required */
+		if (state->nitems == 2 && state->skippedbase == InvalidOffsetNumber)
+			state->skippedbase = state->baseoff;
+	}
+
+	if (state->nitems >= minimum)
+	{
+		IndexTuple	final;
+		Size		finalsz;
+		OffsetNumber offnum;
+		OffsetNumber deletable[MaxOffsetNumber];
+		int			ndeletable = 0;
+
+		/* find all tuples that will be replaced with this new posting tuple */
+		for (offnum = state->baseoff;
+			 offnum < state->baseoff + state->nitems;
+			 offnum = OffsetNumberNext(offnum))
+			deletable[ndeletable++] = offnum;
+
+		/* Form a tuple with a posting list */
+		final = _bt_form_posting(state->base, state->htids, state->nhtids);
+		finalsz = IndexTupleSize(final);
+		spacesaving = state->alltupsize - (finalsz + sizeof(ItemIdData));
+		/* Must have saved some space */
+		Assert(spacesaving > 0 && spacesaving < BLCKSZ);
+
+		/* Save final number of items for posting list */
+		state->interval.nitems = state->nitems;
+
+		Assert(finalsz <= state->maxitemsize);
+		Assert(finalsz == MAXALIGN(IndexTupleSize(final)));
+
+		START_CRIT_SECTION();
+
+		/* Delete items to replace */
+		PageIndexMultiDelete(page, deletable, ndeletable);
+		/* Insert posting tuple */
+		if (PageAddItem(page, (Item) final, finalsz, state->baseoff, false,
+						false) == InvalidOffsetNumber)
+			elog(ERROR, "deduplication failed to add tuple to page");
+
+		MarkBufferDirty(buffer);
+
+		/* Log deduplicated items */
+		if (need_wal)
+		{
+			XLogRecPtr	recptr;
+			xl_btree_dedup xlrec_dedup;
+
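+			/*
+			 * Only the interval (baseoff and nitems) is logged; the REDO
+			 * routine re-reads the affected items from the page and rebuilds
+			 * the posting list tuple itself.
+			 */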
xlrec_dedup.baseoff = state->interval.baseoff; + xlrec_dedup.nitems = state->interval.nitems; + + XLogBeginInsert(); + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); + XLogRegisterData((char *) &xlrec_dedup, SizeOfBtreeDedup); + + recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DEDUP_PAGE); + + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); + + pfree(final); + } + + /* Reset state for next pending posting list */ + state->nhtids = 0; + state->nitems = 0; + state->alltupsize = 0; + + return spacesaving; +} + +/* + * Build a posting list tuple from a "base" index tuple and a list of heap + * TIDs for posting list. + * + * Caller's "htids" array must be sorted in ascending order. Any heap TIDs + * from caller's base tuple will not appear in returned posting list. + * + * If nhtids == 1, builds a non-posting tuple (posting list tuples can never + * have a single heap TID). + */ +IndexTuple +_bt_form_posting(IndexTuple tuple, ItemPointer htids, int nhtids) +{ + uint32 keysize, + newsize = 0; + IndexTuple itup; + + /* We only need key part of the tuple */ + if (BTreeTupleIsPosting(tuple)) + keysize = BTreeTupleGetPostingOffset(tuple); + else + keysize = IndexTupleSize(tuple); + + Assert(nhtids > 0 && nhtids <= PG_UINT16_MAX); + + /* Add space needed for posting list */ + if (nhtids > 1) + newsize = SHORTALIGN(keysize) + sizeof(ItemPointerData) * nhtids; + else + newsize = keysize; + + newsize = MAXALIGN(newsize); + itup = palloc0(newsize); + memcpy(itup, tuple, keysize); + itup->t_info &= ~INDEX_SIZE_MASK; + itup->t_info |= newsize; + + if (nhtids > 1) + { + /* Form posting tuple, fill posting fields */ + + itup->t_info |= INDEX_ALT_TID_MASK; + BTreeSetPostingMeta(itup, nhtids, SHORTALIGN(keysize)); + /* Copy posting list into the posting tuple */ + memcpy(BTreeTupleGetPosting(itup), htids, + sizeof(ItemPointerData) * nhtids); + +#ifdef USE_ASSERT_CHECKING + { + /* Assert that htid array is sorted and has unique TIDs */ + ItemPointerData last; + ItemPointer current; + + ItemPointerCopy(BTreeTupleGetHeapTID(itup), &last); + + for (int i = 1; i < BTreeTupleGetNPosting(itup); i++) + { + current = BTreeTupleGetPostingN(itup, i); + Assert(ItemPointerCompare(current, &last) > 0); + ItemPointerCopy(current, &last); + } + } +#endif + } + else + { + /* To finish building of a non-posting tuple, copy TID from htids */ + itup->t_info &= ~INDEX_ALT_TID_MASK; + ItemPointerCopy(htids, &itup->t_tid); + } + + return itup; +} + +/* + * Prepare for a posting list split by swapping heap TID in newitem with heap + * TID from original posting list (the 'oposting' heap TID located at offset + * 'postingoff'). + * + * Returns new posting list tuple, which is palloc()'d in caller's context. + * This is guaranteed to be the same size as 'oposting'. Modified version of + * newitem is what caller actually inserts inside the critical section that + * also performs an in-place update of posting list. + * + * Explicit WAL-logging of newitem must use the original version of newitem in + * order to make it possible for our nbtxlog.c callers to correctly REDO + * original steps. This approach avoids any explicit WAL-logging of a posting + * list tuple. This is important because posting lists are often much larger + * than plain tuples. + * + * Caller should avoid assuming that the IndexTuple-wise key representation in + * newitem is bitwise equal to the representation used within oposting. Note, + * in particular, that one may even be larger than the other. 
This could + * occur due to differences in TOAST input state, for example. + */ +IndexTuple +_bt_swap_posting(IndexTuple newitem, IndexTuple oposting, int postingoff) +{ + int nhtids; + char *replacepos; + char *rightpos; + Size nbytes; + IndexTuple nposting; + + nhtids = BTreeTupleGetNPosting(oposting); + Assert(postingoff > 0 && postingoff < nhtids); + + nposting = CopyIndexTuple(oposting); + replacepos = (char *) BTreeTupleGetPostingN(nposting, postingoff); + rightpos = replacepos + sizeof(ItemPointerData); + nbytes = (nhtids - postingoff - 1) * sizeof(ItemPointerData); + + /* + * Move item pointers in posting list to make a gap for the new item's + * heap TID (shift TIDs one place to the right, losing original rightmost + * TID) + */ + memmove(rightpos, replacepos, nbytes); + + /* Fill the gap with the TID of the new item */ + ItemPointerCopy(&newitem->t_tid, (ItemPointer) replacepos); + + /* Copy original posting list's rightmost TID into new item */ + ItemPointerCopy(BTreeTupleGetPostingN(oposting, nhtids - 1), + &newitem->t_tid); + Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(nposting), + BTreeTupleGetHeapTID(newitem)) < 0); + Assert(BTreeTupleGetNPosting(oposting) == BTreeTupleGetNPosting(nposting)); + + return nposting; +} diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index b93b2a0ffd..d816c45f2c 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -28,6 +28,8 @@ /* Minimum tree height for application of fastpath optimization */ #define BTREE_FASTPATH_MIN_LEVEL 2 +/* GUC parameter */ +int btree_deduplication = DEDUP_NONUNIQUE; static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf); @@ -47,10 +49,12 @@ static void _bt_insertonpg(Relation rel, BTScanInsert itup_key, BTStack stack, IndexTuple itup, OffsetNumber newitemoff, + int postingoff, bool split_only_page); static Buffer _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, OffsetNumber newitemoff, Size newitemsz, - IndexTuple newitem); + IndexTuple newitem, IndexTuple orignewitem, + IndexTuple nposting, uint16 postingoff); static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf, BTStack stack, bool is_root, bool is_only); static bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup, @@ -61,7 +65,8 @@ static void _bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel); * _bt_doinsert() -- Handle insertion of a single index tuple in the tree. * * This routine is called by the public interface routine, btinsert. - * By here, itup is filled in, including the TID. + * By here, itup is filled in, including the TID. Caller should be + * prepared for us to scribble on 'itup'. * * If checkUnique is UNIQUE_CHECK_NO or UNIQUE_CHECK_PARTIAL, this * will allow duplicates. 
Otherwise (UNIQUE_CHECK_YES or @@ -125,6 +130,7 @@ _bt_doinsert(Relation rel, IndexTuple itup, insertstate.itup_key = itup_key; insertstate.bounds_valid = false; insertstate.buf = InvalidBuffer; + insertstate.postingoff = 0; /* * It's very common to have an index on an auto-incremented or @@ -300,7 +306,7 @@ top: newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique, stack, heapRel); _bt_insertonpg(rel, itup_key, insertstate.buf, InvalidBuffer, stack, - itup, newitemoff, false); + itup, newitemoff, insertstate.postingoff, false); } else { @@ -353,6 +359,9 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, BTPageOpaque opaque; Buffer nbuf = InvalidBuffer; bool found = false; + bool inposting = false; + bool prev_all_dead = true; + int curposti = 0; /* Assume unique until we find a duplicate */ *is_unique = true; @@ -374,6 +383,11 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, /* * Scan over all equal tuples, looking for live conflicts. + * + * Note that each iteration of the loop processes one heap TID, not one + * index tuple. The page offset number won't be advanced for iterations + * which process heap TIDs from posting list tuples until the last such + * heap TID for the posting list (curposti will be advanced instead). */ Assert(!insertstate->bounds_valid || insertstate->low == offset); Assert(!itup_key->anynullkeys); @@ -435,7 +449,27 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, /* okay, we gotta fetch the heap tuple ... */ curitup = (IndexTuple) PageGetItem(page, curitemid); - htid = curitup->t_tid; + + /* + * decide if this is the first heap TID in tuple we'll + * process, or if we should continue to process current + * posting list + */ + if (!BTreeTupleIsPosting(curitup)) + { + htid = curitup->t_tid; + inposting = false; + } + else if (!inposting) + { + /* First heap TID in posting list */ + inposting = true; + prev_all_dead = true; + curposti = 0; + } + + if (inposting) + htid = *BTreeTupleGetPostingN(curitup, curposti); /* * If we are doing a recheck, we expect to find the tuple we @@ -511,8 +545,7 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, * not part of this chain because it had a different index * entry. */ - htid = itup->t_tid; - if (table_index_fetch_tuple_check(heapRel, &htid, + if (table_index_fetch_tuple_check(heapRel, &itup->t_tid, SnapshotSelf, NULL)) { /* Normal case --- it's still live */ @@ -570,12 +603,14 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, RelationGetRelationName(rel)))); } } - else if (all_dead) + else if (all_dead && (!inposting || + (prev_all_dead && + curposti == BTreeTupleGetNPosting(curitup) - 1))) { /* - * The conflicting tuple (or whole HOT chain) is dead to - * everyone, so we may as well mark the index entry - * killed. + * The conflicting tuple (or all HOT chains pointed to by + * all posting list TIDs) is dead to everyone, so mark the + * index entry killed. */ ItemIdMarkDead(curitemid); opaque->btpo_flags |= BTP_HAS_GARBAGE; @@ -589,14 +624,29 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, else MarkBufferDirtyHint(insertstate->buf, true); } + + /* + * Remember if posting list tuple has even a single HOT chain + * whose members are not all dead + */ + if (!all_dead && inposting) + prev_all_dead = false; } } - /* - * Advance to next tuple to continue checking. 
- */ - if (offset < maxoff) + if (inposting && curposti < BTreeTupleGetNPosting(curitup) - 1) + { + /* Advance to next TID in same posting list */ + curposti++; + continue; + } + else if (offset < maxoff) + { + /* Advance to next tuple */ + curposti = 0; + inposting = false; offset = OffsetNumberNext(offset); + } else { int highkeycmp; @@ -621,6 +671,8 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, elog(ERROR, "fell off the end of index \"%s\"", RelationGetRelationName(rel)); } + curposti = 0; + inposting = false; maxoff = PageGetMaxOffsetNumber(page); offset = P_FIRSTDATAKEY(opaque); /* Don't invalidate binary search bounds */ @@ -689,6 +741,7 @@ _bt_findinsertloc(Relation rel, BTScanInsert itup_key = insertstate->itup_key; Page page = BufferGetPage(insertstate->buf); BTPageOpaque lpageop; + OffsetNumber location; lpageop = (BTPageOpaque) PageGetSpecialPointer(page); @@ -751,13 +804,25 @@ _bt_findinsertloc(Relation rel, /* * If the target page is full, see if we can obtain enough space by - * erasing LP_DEAD items + * erasing LP_DEAD items. If that doesn't work out, and if the index + * deduplication is both possible and enabled, try deduplication. */ - if (PageGetFreeSpace(page) < insertstate->itemsz && - P_HAS_GARBAGE(lpageop)) + if (PageGetFreeSpace(page) < insertstate->itemsz) { - _bt_vacuum_one_page(rel, insertstate->buf, heapRel); - insertstate->bounds_valid = false; + if (P_HAS_GARBAGE(lpageop)) + { + _bt_vacuum_one_page(rel, insertstate->buf, heapRel); + insertstate->bounds_valid = false; + } + + if (insertstate->itup_key->safededup && BTGetUseDedup(rel) && + PageGetFreeSpace(page) < insertstate->itemsz) + { + _bt_dedup_one_page(rel, insertstate->buf, heapRel, + insertstate->itup, insertstate->itemsz, + checkingunique); + insertstate->bounds_valid = false; + } } } else @@ -839,7 +904,38 @@ _bt_findinsertloc(Relation rel, Assert(P_RIGHTMOST(lpageop) || _bt_compare(rel, itup_key, page, P_HIKEY) <= 0); - return _bt_binsrch_insert(rel, insertstate); + location = _bt_binsrch_insert(rel, insertstate); + + /* + * Insertion is not prepared for the case where an LP_DEAD posting list + * tuple must be split. In the unlikely event that this happens, call + * _bt_dedup_one_page() to force it to kill all LP_DEAD items. + */ + if (unlikely(insertstate->postingoff == -1)) + { + Assert(insertstate->itup_key->safededup); + + /* + * Don't check if the option is enabled, since no actual deduplication + * will be done, just cleanup. + */ + _bt_dedup_one_page(rel, insertstate->buf, heapRel, insertstate->itup, + 0, checkingunique); + Assert(!P_HAS_GARBAGE(lpageop)); + + /* Must reset insertstate ahead of new _bt_binsrch_insert() call */ + insertstate->bounds_valid = false; + insertstate->postingoff = 0; + location = _bt_binsrch_insert(rel, insertstate); + + /* + * Might still have to split some other posting list now, but that + * should never be LP_DEAD + */ + Assert(insertstate->postingoff >= 0); + } + + return location; } /* @@ -905,10 +1001,12 @@ _bt_stepright(Relation rel, BTInsertState insertstate, BTStack stack) * * This recursive procedure does the following things: * + * + if necessary, splits an existing posting list on page. + * This is only needed when 'postingoff' is non-zero. * + if necessary, splits the target page, using 'itup_key' for * suffix truncation on leaf pages (caller passes NULL for * non-leaf pages). - * + inserts the tuple. + * + inserts the new tuple (could be from split posting list). 
* + if the page was split, pops the parent stack, and finds the * right place to insert the new child pointer (by walking * right using information stored in the parent stack). @@ -918,7 +1016,8 @@ _bt_stepright(Relation rel, BTInsertState insertstate, BTStack stack) * * On entry, we must have the correct buffer in which to do the * insertion, and the buffer must be pinned and write-locked. On return, - * we will have dropped both the pin and the lock on the buffer. + * we will have dropped both the pin and the lock on the buffer. Caller + * should be prepared for us to scribble on 'itup'. * * This routine only performs retail tuple insertions. 'itup' should * always be either a non-highkey leaf item, or a downlink (new high @@ -936,11 +1035,15 @@ _bt_insertonpg(Relation rel, BTStack stack, IndexTuple itup, OffsetNumber newitemoff, + int postingoff, bool split_only_page) { Page page; BTPageOpaque lpageop; Size itemsz; + IndexTuple oposting; + IndexTuple origitup = NULL; + IndexTuple nposting = NULL; page = BufferGetPage(buf); lpageop = (BTPageOpaque) PageGetSpecialPointer(page); @@ -954,6 +1057,8 @@ _bt_insertonpg(Relation rel, Assert(P_ISLEAF(lpageop) || BTreeTupleGetNAtts(itup, rel) <= IndexRelationGetNumberOfKeyAttributes(rel)); + /* retail insertions of posting list tuples are disallowed */ + Assert(!BTreeTupleIsPosting(itup)); /* The caller should've finished any incomplete splits already. */ if (P_INCOMPLETE_SPLIT(lpageop)) @@ -964,6 +1069,39 @@ _bt_insertonpg(Relation rel, itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we * need to be consistent */ + /* + * Do we need to split an existing posting list item? + */ + if (postingoff != 0) + { + ItemId itemid = PageGetItemId(page, newitemoff); + + /* + * The new tuple is a duplicate with a heap TID that falls inside the + * range of an existing posting list tuple on a leaf page. Prepare to + * split an existing posting list by swapping new item's heap TID with + * the rightmost heap TID from original posting list, and generating a + * new version of the posting list that has new item's heap TID. + * + * Posting list splits work by modifying the overlapping posting list + * as part of the same atomic operation that inserts the "new item". + * The space accounting is kept simple, since it does not need to + * consider posting list splits at all (this is particularly important + * for the case where we also have to split the page). Overwriting + * the posting list with its post-split version is treated as an extra + * step in either the insert or page split critical section. + */ + Assert(P_ISLEAF(lpageop) && !ItemIdIsDead(itemid)); + oposting = (IndexTuple) PageGetItem(page, itemid); + + /* save a copy of itup with unchanged TID for xlog record */ + origitup = CopyIndexTuple(itup); + nposting = _bt_swap_posting(itup, oposting, postingoff); + + /* Alter offset so that it goes after existing posting list */ + newitemoff = OffsetNumberNext(newitemoff); + } + /* * Do we need to split the page to fit the item on it? 
* @@ -996,7 +1134,8 @@ _bt_insertonpg(Relation rel, BlockNumberIsValid(RelationGetTargetBlock(rel)))); /* split the buffer into left and right halves */ - rbuf = _bt_split(rel, itup_key, buf, cbuf, newitemoff, itemsz, itup); + rbuf = _bt_split(rel, itup_key, buf, cbuf, newitemoff, itemsz, itup, + origitup, nposting, postingoff); PredicateLockPageSplit(rel, BufferGetBlockNumber(buf), BufferGetBlockNumber(rbuf)); @@ -1075,6 +1214,13 @@ _bt_insertonpg(Relation rel, elog(PANIC, "failed to add new item to block %u in index \"%s\"", itup_blkno, RelationGetRelationName(rel)); + /* + * Posting list split requires an in-place update of the existing + * posting list + */ + if (nposting) + memcpy(oposting, nposting, MAXALIGN(IndexTupleSize(nposting))); + MarkBufferDirty(buf); if (BufferIsValid(metabuf)) @@ -1120,8 +1266,19 @@ _bt_insertonpg(Relation rel, XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfBtreeInsert); - if (P_ISLEAF(lpageop)) + if (P_ISLEAF(lpageop) && postingoff == 0) + { + /* Simple leaf insert */ xlinfo = XLOG_BTREE_INSERT_LEAF; + } + else if (postingoff != 0) + { + /* + * Leaf insert with posting list split. Must include + * postingoff field before newitem/orignewitem. + */ + xlinfo = XLOG_BTREE_INSERT_POST; + } else { /* @@ -1144,6 +1301,7 @@ _bt_insertonpg(Relation rel, xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact; xlmeta.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples; + xlmeta.btm_safededup = metad->btm_safededup; XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD); XLogRegisterBufData(2, (char *) &xlmeta, sizeof(xl_btree_metadata)); @@ -1152,7 +1310,28 @@ _bt_insertonpg(Relation rel, } XLogRegisterBuffer(0, buf, REGBUF_STANDARD); - XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup)); + + /* + * We always write newitem to the page, but when there is an + * original newitem due to a posting list split then we log the + * original item instead. REDO routine must reconstruct the final + * newitem at the same time it reconstructs nposting. + */ + if (postingoff == 0) + XLogRegisterBufData(0, (char *) itup, + IndexTupleSize(itup)); + else + { + /* + * Must explicitly log posting off before newitem in case of + * posting list split. + */ + uint16 upostingoff = postingoff; + + XLogRegisterBufData(0, (char *) &upostingoff, sizeof(uint16)); + XLogRegisterBufData(0, (char *) origitup, + IndexTupleSize(origitup)); + } recptr = XLogInsert(RM_BTREE_ID, xlinfo); @@ -1194,6 +1373,13 @@ _bt_insertonpg(Relation rel, _bt_getrootheight(rel) >= BTREE_FASTPATH_MIN_LEVEL) RelationSetTargetBlock(rel, cachedBlock); } + + /* be tidy */ + if (postingoff != 0) + { + pfree(nposting); + pfree(origitup); + } } /* @@ -1209,12 +1395,25 @@ _bt_insertonpg(Relation rel, * This function will clear the INCOMPLETE_SPLIT flag on it, and * release the buffer. * + * orignewitem, nposting, and postingoff are needed when an insert of + * orignewitem results in both a posting list split and a page split. + * newitem and nposting are replacements for orignewitem and the + * existing posting list on the page respectively. These extra + * posting list split details are used here in the same way as they + * are used in the more common case where a posting list split does + * not coincide with a page split. 
We need to deal with posting list + * splits directly in order to ensure that everything that follows + * from the insert of orignewitem is handled as a single atomic + * operation (though caller's insert of a new pivot/downlink into + * parent page will still be a separate operation). + * * Returns the new right sibling of buf, pinned and write-locked. * The pin and lock on buf are maintained. */ static Buffer _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, - OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem) + OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem, + IndexTuple orignewitem, IndexTuple nposting, uint16 postingoff) { Buffer rbuf; Page origpage; @@ -1236,12 +1435,23 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, OffsetNumber firstright; OffsetNumber maxoff; OffsetNumber i; + OffsetNumber replacepostingoff = InvalidOffsetNumber; bool newitemonleft, isleaf; IndexTuple lefthikey; int indnatts = IndexRelationGetNumberOfAttributes(rel); int indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); + /* + * Determine offset number of existing posting list on page when a split + * of a posting list needs to take place as the page is split + */ + if (nposting != NULL) + { + Assert(itup_key->heapkeyspace); + replacepostingoff = OffsetNumberPrev(newitemoff); + } + /* * origpage is the original page to be split. leftpage is a temporary * buffer that receives the left-sibling data, which will be copied back @@ -1273,6 +1483,13 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, * newitemoff == firstright. In all other cases it's clear which side of * the split every tuple goes on from context. newitemonleft is usually * (but not always) redundant information. + * + * Note: In theory, the split point choice logic should operate against a + * version of the page that already replaced the posting list at offset + * replacepostingoff with nposting where applicable. We don't bother with + * that, though. Both versions of the posting list must be the same size, + * and both will have the same base tuple key values, so split point + * choice is never affected. */ firstright = _bt_findsplitloc(rel, origpage, newitemoff, newitemsz, newitem, &newitemonleft); @@ -1340,6 +1557,9 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, itemid = PageGetItemId(origpage, firstright); itemsz = ItemIdGetLength(itemid); item = (IndexTuple) PageGetItem(origpage, itemid); + /* Behave as if origpage posting list has already been swapped */ + if (firstright == replacepostingoff) + item = nposting; } /* @@ -1373,6 +1593,9 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, Assert(lastleftoff >= P_FIRSTDATAKEY(oopaque)); itemid = PageGetItemId(origpage, lastleftoff); lastleft = (IndexTuple) PageGetItem(origpage, itemid); + /* Behave as if origpage posting list has already been swapped */ + if (lastleftoff == replacepostingoff) + lastleft = nposting; } Assert(lastleft != item); @@ -1480,8 +1703,23 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, itemsz = ItemIdGetLength(itemid); item = (IndexTuple) PageGetItem(origpage, itemid); + /* + * did caller pass new replacement posting list tuple due to posting + * list split? 
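+         * (This is checked ahead of the newitemoff case below: the
+         * swapped-in posting list occupies the offset just before
+         * newitemoff, so it can never be confused with the new item.)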
+ */ + if (i == replacepostingoff) + { + /* + * swap origpage posting list with post-posting-list-split version + * from caller + */ + Assert(isleaf); + Assert(itemsz == MAXALIGN(IndexTupleSize(nposting))); + item = nposting; + } + /* does new item belong before this one? */ - if (i == newitemoff) + else if (i == newitemoff) { if (newitemonleft) { @@ -1650,8 +1888,12 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, XLogRecPtr recptr; xlrec.level = ropaque->btpo.level; + /* See comments below on newitem, orignewitem, and posting lists */ xlrec.firstright = firstright; xlrec.newitemoff = newitemoff; + xlrec.postingoff = 0; + if (replacepostingoff < firstright) + xlrec.postingoff = postingoff; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfBtreeSplit); @@ -1670,11 +1912,45 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, * because it's included with all the other items on the right page.) * Show the new item as belonging to the left page buffer, so that it * is not stored if XLogInsert decides it needs a full-page image of - * the left page. We store the offset anyway, though, to support - * archive compression of these records. + * the left page. We always store newitemoff in the record, though. + * + * The details are sometimes slightly different for page splits that + * coincide with a posting list split. If both the replacement + * posting list and newitem go on the right page, then we don't need + * to log anything extra, just like the simple !newitemonleft + * no-posting-split case (postingoff is set to zero in the WAL record, + * so recovery doesn't need to process a posting list split at all). + * Otherwise, we set postingoff and log orignewitem instead of + * newitem, despite having actually inserted newitem. Recovery must + * reconstruct nposting and newitem using _bt_swap_posting(). + * + * Note: It's possible that our page split point is the point that + * makes the posting list lastleft and newitem firstright. This is + * the only case where we log orignewitem despite newitem going on the + * right page. If XLogInsert decides that it can omit orignewitem due + * to logging a full-page image of the left page, everything still + * works out, since recovery only needs to log orignewitem for items + * on the left page (just like the regular newitem-logged case). 
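+         *
+         * In short: xlrec.postingoff is nonzero (and orignewitem rather
+         * than newitem is logged) exactly when the replacement posting
+         * list ended up on the left half, i.e. replacepostingoff <
+         * firstright; REDO then repeats _bt_swap_posting() to recreate
+         * both nposting and the final newitem.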
*/ - if (newitemonleft) - XLogRegisterBufData(0, (char *) newitem, MAXALIGN(newitemsz)); + if (newitemonleft || xlrec.postingoff != 0) + { + if (xlrec.postingoff == 0) + { + /* Must WAL-log newitem, since it's on left page */ + Assert(newitemonleft); + Assert(orignewitem == NULL && nposting == NULL); + XLogRegisterBufData(0, (char *) newitem, MAXALIGN(newitemsz)); + } + else + { + /* Must WAL-log orignewitem following posting list split */ + Assert(newitemonleft || firstright == newitemoff); + Assert(ItemPointerCompare(&orignewitem->t_tid, + &newitem->t_tid) < 0); + XLogRegisterBufData(0, (char *) orignewitem, + MAXALIGN(IndexTupleSize(orignewitem))); + } + } /* Log the left page's new high key */ itemid = PageGetItemId(origpage, P_HIKEY); @@ -1834,7 +2110,7 @@ _bt_insert_parent(Relation rel, /* Recursively insert into the parent */ _bt_insertonpg(rel, NULL, pbuf, buf, stack->bts_parent, - new_item, stack->bts_offset + 1, + new_item, stack->bts_offset + 1, 0, is_only); /* be tidy */ @@ -2190,6 +2466,7 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) md.fastlevel = metad->btm_level; md.oldest_btpo_xact = metad->btm_oldest_btpo_xact; md.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples; + md.btm_safededup = metad->btm_safededup; XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata)); @@ -2303,6 +2580,6 @@ _bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel) * Note: if we didn't find any LP_DEAD items, then the page's * BTP_HAS_GARBAGE hint bit is falsely set. We do not bother expending a * separate write to clear it, however. We will clear it when we split - * the page. + * the page (or when deduplication runs). */ } diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index 66c79623cf..3b49eb0762 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -24,6 +24,7 @@ #include "access/nbtree.h" #include "access/nbtxlog.h" +#include "access/tableam.h" #include "access/transam.h" #include "access/xlog.h" #include "access/xloginsert.h" @@ -42,12 +43,18 @@ static bool _bt_lock_branch_parent(Relation rel, BlockNumber child, BlockNumber *target, BlockNumber *rightsib); static void _bt_log_reuse_page(Relation rel, BlockNumber blkno, TransactionId latestRemovedXid); +static TransactionId _bt_compute_xid_horizon_for_tuples(Relation rel, + Relation heapRel, + Buffer buf, + OffsetNumber *itemnos, + int nitems); /* * _bt_initmetapage() -- Fill a page buffer with a correct metapage image */ void -_bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level) +_bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level, + bool safededup) { BTMetaPageData *metad; BTPageOpaque metaopaque; @@ -63,6 +70,7 @@ _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level) metad->btm_fastlevel = level; metad->btm_oldest_btpo_xact = InvalidTransactionId; metad->btm_last_cleanup_num_heap_tuples = -1.0; + metad->btm_safededup = safededup; metaopaque = (BTPageOpaque) PageGetSpecialPointer(page); metaopaque->btpo_flags = BTP_META; @@ -102,6 +110,9 @@ _bt_upgrademetapage(Page page) metad->btm_version = BTREE_NOVAC_VERSION; metad->btm_oldest_btpo_xact = InvalidTransactionId; metad->btm_last_cleanup_num_heap_tuples = -1.0; + /* Only a REINDEX can set this field */ + Assert(!metad->btm_safededup); + metad->btm_safededup = false; /* Adjust pd_lower (see _bt_initmetapage() for details) */ ((PageHeader) page)->pd_lower = @@ -213,6 +224,7 @@ _bt_update_meta_cleanup_info(Relation rel, TransactionId 
oldestBtpoXact,
 	md.fastlevel = metad->btm_fastlevel;
 	md.oldest_btpo_xact = oldestBtpoXact;
 	md.last_cleanup_num_heap_tuples = numHeapTuples;
+	md.btm_safededup = metad->btm_safededup;
 
 	XLogRegisterBufData(0, (char *) &md, sizeof(xl_btree_metadata));
 
@@ -274,6 +286,8 @@ _bt_getroot(Relation rel, int access)
 		Assert(metad->btm_magic == BTREE_MAGIC);
 		Assert(metad->btm_version >= BTREE_MIN_VERSION);
 		Assert(metad->btm_version <= BTREE_VERSION);
+		Assert(!metad->btm_safededup ||
+			   metad->btm_version > BTREE_NOVAC_VERSION);
 		Assert(metad->btm_root != P_NONE);
 
 		rootblkno = metad->btm_fastroot;
@@ -394,6 +408,7 @@ _bt_getroot(Relation rel, int access)
 			md.fastlevel = 0;
 			md.oldest_btpo_xact = InvalidTransactionId;
 			md.last_cleanup_num_heap_tuples = -1.0;
+			md.btm_safededup = metad->btm_safededup;
 
 			XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));
 
@@ -618,6 +633,7 @@ _bt_getrootheight(Relation rel)
 	Assert(metad->btm_magic == BTREE_MAGIC);
 	Assert(metad->btm_version >= BTREE_MIN_VERSION);
 	Assert(metad->btm_version <= BTREE_VERSION);
+	Assert(!metad->btm_safededup || metad->btm_version > BTREE_NOVAC_VERSION);
 	Assert(metad->btm_fastroot != P_NONE);
 
 	return metad->btm_fastlevel;
@@ -683,6 +699,56 @@ _bt_heapkeyspace(Relation rel)
 	return metad->btm_version > BTREE_NOVAC_VERSION;
 }
 
+/*
+ * _bt_safededup() -- can deduplication safely be used by index?
+ *
+ * Uses field from index relation's metapage/cached metapage.
+ */
+bool
+_bt_safededup(Relation rel)
+{
+	BTMetaPageData *metad;
+
+	if (rel->rd_amcache == NULL)
+	{
+		Buffer		metabuf;
+
+		metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
+		metad = _bt_getmeta(rel, metabuf);
+
+		/*
+		 * If there's no root page yet, _bt_getroot() doesn't expect a cache
+		 * to be made, so just stop here. (XXX perhaps _bt_getroot() should
+		 * be changed to allow this case.)
+		 *
+		 * Note that we rely on the assumption that this field will be zero'ed
+		 * on indexes that were pg_upgrade'd.
+		 */
+		if (metad->btm_root == P_NONE)
+		{
+			_bt_relbuf(rel, metabuf);
+			return metad->btm_safededup;
+		}
+
+		/* Cache the metapage data for next time */
+		rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
+											 sizeof(BTMetaPageData));
+		memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
+		_bt_relbuf(rel, metabuf);
+	}
+
+	/* Get cached page */
+	metad = (BTMetaPageData *) rel->rd_amcache;
+	/* We shouldn't have cached it if any of these fail */
+	Assert(metad->btm_magic == BTREE_MAGIC);
+	Assert(metad->btm_version >= BTREE_MIN_VERSION);
+	Assert(metad->btm_version <= BTREE_VERSION);
+	Assert(!metad->btm_safededup || metad->btm_version > BTREE_NOVAC_VERSION);
+	Assert(metad->btm_fastroot != P_NONE);
+
+	return metad->btm_safededup;
+}
+
 /*
  * _bt_checkpage() -- Verify that a freshly-read page looks sane.
  */
@@ -968,27 +1034,73 @@ _bt_page_recyclable(Page page)
  * deleting the page it points to.
  *
  * This routine assumes that the caller has pinned and locked the buffer.
- * Also, the given deletable array *must* be sorted in ascending order.
+ * Also, the given deletable and updateitemnos arrays *must* be sorted in
+ * ascending order.
 *
 * We record VACUUMs and b-tree deletes differently in WAL. Deletes must
 * generate recovery conflicts by accessing the heap inline, whereas VACUUMs
 * can rely on the initial heap scan taking care of the problem (pruning would
- * have generated the conflicts needed for hot standby already).
Also, + * VACUUMs must deal with the case where posting list tuples have some dead + * TIDs, and some remaining TIDs that must not be killed. */ void -_bt_delitems_vacuum(Relation rel, Buffer buf, OffsetNumber *deletable, - int ndeletable) +_bt_delitems_vacuum(Relation rel, Buffer buf, + OffsetNumber *deletable, int ndeletable, + OffsetNumber *updateitemnos, + IndexTuple *updated, int nupdatable) { Page page = BufferGetPage(buf); BTPageOpaque opaque; + Size itemsz; + Size updated_sz = 0; + char *updated_buf = NULL; - Assert(ndeletable > 0); + Assert(ndeletable > 0 || nupdatable > 0); + + /* XLOG stuff, buffer for updated */ + if (nupdatable > 0 && RelationNeedsWAL(rel)) + { + Size offset = 0; + + for (int i = 0; i < nupdatable; i++) + updated_sz += MAXALIGN(IndexTupleSize(updated[i])); + + updated_buf = palloc(updated_sz); + for (int i = 0; i < nupdatable; i++) + { + itemsz = IndexTupleSize(updated[i]); + memcpy(updated_buf + offset, (char *) updated[i], itemsz); + offset += MAXALIGN(itemsz); + } + Assert(offset == updated_sz); + } /* No ereport(ERROR) until changes are logged */ START_CRIT_SECTION(); + /* Handle posting tuple updates */ + for (int i = 0; i < nupdatable; i++) + { + /* + * Delete the old posting tuple first. This will also clear the + * LP_DEAD bit. (It would be correct to leave it set, but we're going + * to unset the BTP_HAS_GARBAGE bit anyway.) + */ + PageIndexTupleDelete(page, updateitemnos[i]); + + itemsz = IndexTupleSize(updated[i]); + itemsz = MAXALIGN(itemsz); + + /* Add tuple with updated ItemPointers to the page */ + if (PageAddItem(page, (Item) updated[i], itemsz, updateitemnos[i], + false, false) == InvalidOffsetNumber) + elog(ERROR, "failed to rewrite posting list item in index while doing vacuum"); + } + /* Fix the page */ - PageIndexMultiDelete(page, deletable, ndeletable); + if (ndeletable > 0) + PageIndexMultiDelete(page, deletable, ndeletable); /* * We can clear the vacuum cycle ID since this page has certainly been @@ -1015,6 +1127,7 @@ _bt_delitems_vacuum(Relation rel, Buffer buf, OffsetNumber *deletable, xl_btree_vacuum xlrec_vacuum; xlrec_vacuum.ndeleted = ndeletable; + xlrec_vacuum.nupdated = nupdatable; XLogBeginInsert(); XLogRegisterBuffer(0, buf, REGBUF_STANDARD); @@ -1025,8 +1138,22 @@ _bt_delitems_vacuum(Relation rel, Buffer buf, OffsetNumber *deletable, * is. When XLogInsert stores the whole buffer, the offsets array * need not be stored too. */ - XLogRegisterBufData(0, (char *) deletable, ndeletable * - sizeof(OffsetNumber)); + if (ndeletable > 0) + XLogRegisterBufData(0, (char *) deletable, + ndeletable * sizeof(OffsetNumber)); + + /* + * Here we should save offnums and updated tuples themselves. It's + * important to restore them in correct order. At first, we must + * handle updated tuples and only after that other deleted items. + */ + if (nupdatable > 0) + { + Assert(updated_buf != NULL); + XLogRegisterBufData(0, (char *) updateitemnos, + nupdatable * sizeof(OffsetNumber)); + XLogRegisterBufData(0, updated_buf, updated_sz); + } recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM); @@ -1036,6 +1163,91 @@ _bt_delitems_vacuum(Relation rel, Buffer buf, OffsetNumber *deletable, END_CRIT_SECTION(); } +/* + * Get the latestRemovedXid from the table entries pointed at by the index + * tuples being deleted. + * + * This is a version of index_compute_xid_horizon_for_tuples() specialized to + * nbtree, which can handle posting lists. 
+ */
+static TransactionId
+_bt_compute_xid_horizon_for_tuples(Relation rel, Relation heapRel,
+								   Buffer buf, OffsetNumber *itemnos,
+								   int nitems)
+{
+	ItemPointer htids;
+	TransactionId latestRemovedXid = InvalidTransactionId;
+	Page		page = BufferGetPage(buf);
+	int			arraynitems;
+	int			finalnitems;
+
+	/*
+	 * Initial size of the array can fit everything when it turns out that
+	 * there are no posting lists
+	 */
+	arraynitems = nitems;
+	htids = (ItemPointer) palloc(sizeof(ItemPointerData) * arraynitems);
+
+	finalnitems = 0;
+	/* identify what the index tuples about to be deleted point to */
+	for (int i = 0; i < nitems; i++)
+	{
+		ItemId		itemid;
+		IndexTuple	itup;
+
+		itemid = PageGetItemId(page, itemnos[i]);
+		itup = (IndexTuple) PageGetItem(page, itemid);
+
+		Assert(ItemIdIsDead(itemid));
+
+		if (!BTreeTupleIsPosting(itup))
+		{
+			/* Make sure that we have space for additional heap TID */
+			if (finalnitems + 1 > arraynitems)
+			{
+				arraynitems = arraynitems * 2;
+				htids = (ItemPointer)
+					repalloc(htids, sizeof(ItemPointerData) * arraynitems);
+			}
+
+			Assert(ItemPointerIsValid(&itup->t_tid));
+			ItemPointerCopy(&itup->t_tid, &htids[finalnitems]);
+			finalnitems++;
+		}
+		else
+		{
+			int			nposting = BTreeTupleGetNPosting(itup);
+
+			/* Make sure that we have space for additional heap TIDs */
+			if (finalnitems + nposting > arraynitems)
+			{
+				arraynitems = Max(arraynitems * 2, finalnitems + nposting);
+				htids = (ItemPointer)
+					repalloc(htids, sizeof(ItemPointerData) * arraynitems);
+			}
+
+			for (int j = 0; j < nposting; j++)
+			{
+				ItemPointer htid = BTreeTupleGetPostingN(itup, j);
+
+				Assert(ItemPointerIsValid(htid));
+				ItemPointerCopy(htid, &htids[finalnitems]);
+				finalnitems++;
+			}
+		}
+	}
+
+	Assert(finalnitems >= nitems);
+
+	/* determine the actual xid horizon */
+	latestRemovedXid =
+		table_compute_xid_horizon_for_tuples(heapRel, htids, finalnitems);
+
+	pfree(htids);
+
+	return latestRemovedXid;
+}
+
 /*
  * Delete item(s) from a btree page during single-page cleanup.
  *
@@ -1046,7 +1258,8 @@ _bt_delitems_vacuum(Relation rel, Buffer buf, OffsetNumber *deletable,
  *
  * This is nearly the same as _bt_delitems_vacuum as far as what it does to
  * the page, but it needs to generate its own recovery conflicts by accessing
- * the heap. See comments for _bt_delitems_vacuum.
+ * the heap, and doesn't handle updating posting list tuples. See comments
+ * for _bt_delitems_vacuum.
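+ *
+ * (Not handling posting list updates is fine here: LP_DEAD can only be set
+ * on a posting list tuple once every one of its heap TIDs is dead, so this
+ * path only ever removes whole physical tuples.)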
*/ void _bt_delitems_delete(Relation rel, Buffer buf, @@ -1062,8 +1275,8 @@ _bt_delitems_delete(Relation rel, Buffer buf, if (XLogStandbyInfoActive() && RelationNeedsWAL(rel)) latestRemovedXid = - index_compute_xid_horizon_for_tuples(rel, heapRel, buf, - itemnos, nitems); + _bt_compute_xid_horizon_for_tuples(rel, heapRel, buf, + itemnos, nitems); /* No ereport(ERROR) until changes are logged */ START_CRIT_SECTION(); @@ -2061,6 +2274,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty) xlmeta.fastlevel = metad->btm_fastlevel; xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact; xlmeta.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples; + xlmeta.btm_safededup = metad->btm_safededup; XLogRegisterBufData(4, (char *) &xlmeta, sizeof(xl_btree_metadata)); xlinfo = XLOG_BTREE_UNLINK_PAGE_META; diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index bbc1376b0a..8a67193152 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -95,6 +95,8 @@ static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, BTCycleId cycleid, TransactionId *oldestBtpoXact); static void btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno); +static ItemPointer btreevacuumposting(BTVacState *vstate, IndexTuple itup, + int *nremaining); /* @@ -158,7 +160,7 @@ btbuildempty(Relation index) /* Construct metapage. */ metapage = (Page) palloc(BLCKSZ); - _bt_initmetapage(metapage, P_NONE, 0); + _bt_initmetapage(metapage, P_NONE, 0, _bt_opclasses_support_dedup(index)); /* * Write the page and log it. It might seem that an immediate sync would @@ -261,8 +263,8 @@ btgettuple(IndexScanDesc scan, ScanDirection dir) */ if (so->killedItems == NULL) so->killedItems = (int *) - palloc(MaxIndexTuplesPerPage * sizeof(int)); - if (so->numKilled < MaxIndexTuplesPerPage) + palloc(MaxBTreeIndexTuplesPerPage * sizeof(int)); + if (so->numKilled < MaxBTreeIndexTuplesPerPage) so->killedItems[so->numKilled++] = so->currPos.itemIndex; } @@ -1151,8 +1153,17 @@ restart: } else if (P_ISLEAF(opaque)) { + /* Deletable item state */ OffsetNumber deletable[MaxOffsetNumber]; int ndeletable; + int nhtidsdead; + int nhtidslive; + + /* Updatable item state (for posting lists) */ + IndexTuple updated[MaxOffsetNumber]; + OffsetNumber updatable[MaxOffsetNumber]; + int nupdatable; + OffsetNumber offnum, minoff, maxoff; @@ -1185,6 +1196,10 @@ restart: * callback function. */ ndeletable = 0; + nupdatable = 0; + /* Maintain stats counters for index tuple versions/heap TIDs */ + nhtidsdead = 0; + nhtidslive = 0; minoff = P_FIRSTDATAKEY(opaque); maxoff = PageGetMaxOffsetNumber(page); if (callback) @@ -1194,11 +1209,9 @@ restart: offnum = OffsetNumberNext(offnum)) { IndexTuple itup; - ItemPointer htup; itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); - htup = &(itup->t_tid); /* * During Hot Standby we currently assume that it's okay that @@ -1221,8 +1234,71 @@ restart: * applies to *any* type of index that marks index tuples as * killed. 
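+                 *
+                 * (For a posting list tuple there are three possible
+                 * outcomes below: all of its heap TIDs survive, and the
+                 * tuple is left alone; some survive, and the tuple is
+                 * replaced by a smaller posting list built by
+                 * btreevacuumposting(); or none survive, and the whole
+                 * item is deleted outright.)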
*/ - if (callback(htup, callback_state)) - deletable[ndeletable++] = offnum; + if (!BTreeTupleIsPosting(itup)) + { + /* Regular tuple, standard heap TID representation */ + ItemPointer htid = &(itup->t_tid); + + if (callback(htid, callback_state)) + { + deletable[ndeletable++] = offnum; + nhtidsdead++; + } + else + nhtidslive++; + } + else + { + ItemPointer newhtids; + int nremaining; + + /* + * Posting list tuple, a physical tuple that represents + * two or more logical tuples, any of which could be an + * index row version that must be removed + */ + newhtids = btreevacuumposting(vstate, itup, &nremaining); + if (newhtids == NULL) + { + /* + * All TIDs/logical tuples from the posting tuple + * remain, so no update or delete required + */ + Assert(nremaining == BTreeTupleGetNPosting(itup)); + } + else if (nremaining > 0) + { + IndexTuple updatedtuple; + + /* + * Form new tuple that contains only remaining TIDs. + * Remember this tuple and the offset of the old tuple + * for when we update it in place + */ + Assert(nremaining < BTreeTupleGetNPosting(itup)); + updatedtuple = _bt_form_posting(itup, newhtids, + nremaining); + updated[nupdatable] = updatedtuple; + updatable[nupdatable++] = offnum; + nhtidsdead += BTreeTupleGetNPosting(itup) - nremaining; + pfree(newhtids); + } + else + { + /* + * All TIDs/logical tuples from the posting list must + * be deleted. We'll delete the physical tuple + * completely. + */ + deletable[ndeletable++] = offnum; + nhtidsdead += BTreeTupleGetNPosting(itup); + + /* Free empty array of live items */ + pfree(newhtids); + } + + nhtidslive += nremaining; + } } } @@ -1230,11 +1306,12 @@ restart: * Apply any needed deletes. We issue just one _bt_delitems_vacuum() * call per page, so as to minimize WAL traffic. */ - if (ndeletable > 0) + if (ndeletable > 0 || nupdatable > 0) { - _bt_delitems_vacuum(rel, buf, deletable, ndeletable); + _bt_delitems_vacuum(rel, buf, deletable, ndeletable, updatable, + updated, nupdatable); - stats->tuples_removed += ndeletable; + stats->tuples_removed += nhtidsdead; /* must recompute maxoff */ maxoff = PageGetMaxOffsetNumber(page); } @@ -1249,6 +1326,7 @@ restart: * We treat this like a hint-bit update because there's no need to * WAL-log it. */ + Assert(nhtidsdead == 0); if (vstate->cycleid != 0 && opaque->btpo_cycleid == vstate->cycleid) { @@ -1258,15 +1336,16 @@ restart: } /* - * If it's now empty, try to delete; else count the live tuples. We - * don't delete when recursing, though, to avoid putting entries into + * If it's now empty, try to delete; else count the live tuples (live + * heap TIDs in posting lists are counted as live tuples). We don't + * delete when recursing, though, to avoid putting entries into * freePages out-of-order (doesn't seem worth any extra code to handle * the case). */ if (minoff > maxoff) delete_now = (blkno == orig_blkno); else - stats->num_index_tuples += maxoff - minoff + 1; + stats->num_index_tuples += nhtidslive; } if (delete_now) @@ -1309,6 +1388,68 @@ restart: } } +/* + * btreevacuumposting() -- determines which logical tuples must remain when + * VACUUMing a posting list tuple. + * + * Returns new palloc'd array of item pointers needed to build replacement + * posting list without the index row versions that are to be deleted. + * + * Note that returned array is NULL in the common case where there is nothing + * to delete in caller's posting list tuple. The number of TIDs that should + * remain in the posting list tuple is set for caller in *nremaining. 
This is + * also the size of the returned array (though only when array isn't just + * NULL). + */ +static ItemPointer +btreevacuumposting(BTVacState *vstate, IndexTuple itup, int *nremaining) +{ + int live = 0; + int nitem = BTreeTupleGetNPosting(itup); + ItemPointer tmpitems = NULL, + items = BTreeTupleGetPosting(itup); + + Assert(BTreeTupleIsPosting(itup)); + + /* + * Check each tuple in the posting list. Save live tuples into tmpitems, + * though try to avoid memory allocation as an optimization. + */ + for (int i = 0; i < nitem; i++) + { + if (!vstate->callback(items + i, vstate->callback_state)) + { + /* + * Live heap TID. + * + * Only save live TID when we know that we're going to have to + * kill at least one TID, and have already allocated memory. + */ + if (tmpitems) + tmpitems[live] = items[i]; + live++; + } + + /* Dead heap TID */ + else if (tmpitems == NULL) + { + /* + * Turns out we need to delete one or more dead heap TIDs, so + * start maintaining an array of live TIDs for caller to + * reconstruct smaller replacement posting list tuple + */ + tmpitems = palloc(sizeof(ItemPointerData) * nitem); + + /* Copy live heap TIDs from previous loop iterations */ + if (live > 0) + memcpy(tmpitems, items, sizeof(ItemPointerData) * live); + } + } + + *nremaining = live; + return tmpitems; +} + /* * btcanreturn() -- Check whether btree indexes support index-only scans. * diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index 8e512461a0..c954926f2d 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -26,10 +26,18 @@ static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp); static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf); +static int _bt_binsrch_posting(BTScanInsert key, Page page, + OffsetNumber offnum); static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum); static void _bt_saveitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, IndexTuple itup); +static void _bt_setuppostingitems(BTScanOpaque so, int itemIndex, + OffsetNumber offnum, ItemPointer heapTid, + IndexTuple itup); +static inline void _bt_savepostingitem(BTScanOpaque so, int itemIndex, + OffsetNumber offnum, + ItemPointer heapTid); static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir); static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir); static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, @@ -434,7 +442,10 @@ _bt_binsrch(Relation rel, * low) makes bounds invalid. * * Caller is responsible for invalidating bounds when it modifies the page - * before calling here a second time. + * before calling here a second time, and for dealing with posting list + * tuple matches (callers can use insertstate's postingoff field to + * determine which existing heap TID will need to be replaced by their + * scantid/new heap TID). */ OffsetNumber _bt_binsrch_insert(Relation rel, BTInsertState insertstate) @@ -453,6 +464,7 @@ _bt_binsrch_insert(Relation rel, BTInsertState insertstate) Assert(P_ISLEAF(opaque)); Assert(!key->nextkey); + Assert(insertstate->postingoff == 0); if (!insertstate->bounds_valid) { @@ -509,6 +521,16 @@ _bt_binsrch_insert(Relation rel, BTInsertState insertstate) if (result != 0) stricthigh = high; } + + /* + * If tuple at offset located by binary search is a posting list whose + * TID range overlaps with caller's scantid, perform posting list + * binary search to set postingoff for caller. 
Caller must split the + * posting list when postingoff is set. This should happen + * infrequently. + */ + if (unlikely(result == 0 && key->scantid != NULL)) + insertstate->postingoff = _bt_binsrch_posting(key, page, mid); } /* @@ -528,6 +550,68 @@ _bt_binsrch_insert(Relation rel, BTInsertState insertstate) return low; } +/*---------- + * _bt_binsrch_posting() -- posting list binary search. + * + * Returns offset into posting list where caller's scantid belongs. + *---------- + */ +static int +_bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum) +{ + IndexTuple itup; + ItemId itemid; + int low, + high, + mid, + res; + + /* + * If this isn't a posting tuple, then the index must be corrupt (if it is + * an ordinary non-pivot tuple then there must be an existing tuple with a + * heap TID that equals inserter's new heap TID/scantid). Defensively + * check that tuple is a posting list tuple whose posting list range + * includes caller's scantid. + * + * (This is also needed because contrib/amcheck's rootdescend option needs + * to be able to relocate a non-pivot tuple using _bt_binsrch_insert().) + */ + itemid = PageGetItemId(page, offnum); + itup = (IndexTuple) PageGetItem(page, itemid); + if (!BTreeTupleIsPosting(itup)) + return 0; + + /* + * In the unlikely event that posting list tuple has LP_DEAD bit set, + * signal to caller that it should kill the item and restart its binary + * search. + */ + if (ItemIdIsDead(itemid)) + return -1; + + /* "high" is past end of posting list for loop invariant */ + low = 0; + high = BTreeTupleGetNPosting(itup); + Assert(high >= 2); + + while (high > low) + { + mid = low + ((high - low) / 2); + res = ItemPointerCompare(key->scantid, + BTreeTupleGetPostingN(itup, mid)); + + if (res > 0) + low = mid + 1; + else if (res < 0) + high = mid; + else + return mid; + } + + /* Exact match not found */ + return low; +} + /*---------- * _bt_compare() -- Compare insertion-type scankey to tuple on a page. * @@ -537,9 +621,14 @@ _bt_binsrch_insert(Relation rel, BTInsertState insertstate) * <0 if scankey < tuple at offnum; * 0 if scankey == tuple at offnum; * >0 if scankey > tuple at offnum. - * NULLs in the keys are treated as sortable values. Therefore - * "equality" does not necessarily mean that the item should be - * returned to the caller as a matching key! + * + * NULLs in the keys are treated as sortable values. Therefore + * "equality" does not necessarily mean that the item should be returned + * to the caller as a matching key. Similarly, an insertion scankey + * with its scantid set is treated as equal to a posting tuple whose TID + * range overlaps with their scantid. There generally won't be a + * matching TID in the posting tuple, which caller must handle + * themselves (e.g., by splitting the posting list tuple). 
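+ *
+ * For example, an insertion scankey with scantid (10,3) compares as
+ * "equal" to a posting tuple with equal key values whose heap TIDs run
+ * from (8,1) to (12,4), even though no TID in the posting list may be
+ * exactly (10,3).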
* * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be * "minus infinity": this routine will always claim it is less than the @@ -563,6 +652,7 @@ _bt_compare(Relation rel, ScanKey scankey; int ncmpkey; int ntupatts; + int32 result; Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum)); Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel)); @@ -597,7 +687,6 @@ _bt_compare(Relation rel, { Datum datum; bool isNull; - int32 result; datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull); @@ -713,8 +802,25 @@ _bt_compare(Relation rel, if (heapTid == NULL) return 1; + /* + * scankey must be treated as equal to a posting list tuple if its scantid + * value falls within the range of the posting list. In all other cases + * there can only be a single heap TID value, which is compared directly + * as a simple scalar value. + */ Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel)); - return ItemPointerCompare(key->scantid, heapTid); + result = ItemPointerCompare(key->scantid, heapTid); + if (result <= 0 || !BTreeTupleIsPosting(itup)) + return result; + else + { + result = ItemPointerCompare(key->scantid, + BTreeTupleGetMaxHeapTID(itup)); + if (result > 0) + return 1; + } + + return 0; } /* @@ -1230,6 +1336,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) /* Initialize remaining insertion scan key fields */ inskey.heapkeyspace = _bt_heapkeyspace(rel); + inskey.safededup = false; /* unused */ inskey.anynullkeys = false; /* unused */ inskey.nextkey = nextkey; inskey.pivotsearch = false; @@ -1451,6 +1558,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) /* initialize tuple workspace to empty */ so->currPos.nextTupleOffset = 0; + so->currPos.postingTupleOffset = 0; /* * Now that the current page has been made consistent, the macro should be @@ -1484,9 +1592,31 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) if (_bt_checkkeys(scan, itup, indnatts, dir, &continuescan)) { - /* tuple passes all scan key conditions, so remember it */ - _bt_saveitem(so, itemIndex, offnum, itup); - itemIndex++; + /* tuple passes all scan key conditions */ + if (!BTreeTupleIsPosting(itup)) + { + /* Remember it */ + _bt_saveitem(so, itemIndex, offnum, itup); + itemIndex++; + } + else + { + /* + * Set up state to return posting list, and remember first + * "logical" tuple + */ + _bt_setuppostingitems(so, itemIndex, offnum, + BTreeTupleGetPostingN(itup, 0), + itup); + itemIndex++; + /* Remember additional logical tuples */ + for (int i = 1; i < BTreeTupleGetNPosting(itup); i++) + { + _bt_savepostingitem(so, itemIndex, offnum, + BTreeTupleGetPostingN(itup, i)); + itemIndex++; + } + } } /* When !continuescan, there can't be any more matches, so stop */ if (!continuescan) @@ -1519,7 +1649,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) if (!continuescan) so->currPos.moreRight = false; - Assert(itemIndex <= MaxIndexTuplesPerPage); + Assert(itemIndex <= MaxBTreeIndexTuplesPerPage); so->currPos.firstItem = 0; so->currPos.lastItem = itemIndex - 1; so->currPos.itemIndex = 0; @@ -1527,7 +1657,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) else { /* load items[] in descending order */ - itemIndex = MaxIndexTuplesPerPage; + itemIndex = MaxBTreeIndexTuplesPerPage; offnum = Min(offnum, maxoff); @@ -1568,9 +1698,37 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) &continuescan); if (passes_quals && tuple_alive) { - /* tuple passes all scan key 
conditions, so remember it */ - itemIndex--; - _bt_saveitem(so, itemIndex, offnum, itup); + /* tuple passes all scan key conditions */ + if (!BTreeTupleIsPosting(itup)) + { + /* Remember it */ + itemIndex--; + _bt_saveitem(so, itemIndex, offnum, itup); + } + else + { + int i = BTreeTupleGetNPosting(itup) - 1; + + /* + * Set up state to return posting list, and remember last + * "logical" tuple (since we'll return it first) + */ + itemIndex--; + _bt_setuppostingitems(so, itemIndex, offnum, + BTreeTupleGetPostingN(itup, i--), + itup); + + /* + * Remember additional logical tuples (use desc order to + * be consistent with order of entire scan) + */ + for (; i >= 0; i--) + { + itemIndex--; + _bt_savepostingitem(so, itemIndex, offnum, + BTreeTupleGetPostingN(itup, i)); + } + } } if (!continuescan) { @@ -1584,8 +1742,8 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) Assert(itemIndex >= 0); so->currPos.firstItem = itemIndex; - so->currPos.lastItem = MaxIndexTuplesPerPage - 1; - so->currPos.itemIndex = MaxIndexTuplesPerPage - 1; + so->currPos.lastItem = MaxBTreeIndexTuplesPerPage - 1; + so->currPos.itemIndex = MaxBTreeIndexTuplesPerPage - 1; } return (so->currPos.firstItem <= so->currPos.lastItem); @@ -1598,6 +1756,8 @@ _bt_saveitem(BTScanOpaque so, int itemIndex, { BTScanPosItem *currItem = &so->currPos.items[itemIndex]; + Assert(!BTreeTupleIsPosting(itup)); + currItem->heapTid = itup->t_tid; currItem->indexOffset = offnum; if (so->currTuples) @@ -1610,6 +1770,64 @@ _bt_saveitem(BTScanOpaque so, int itemIndex, } } +/* + * Setup state to save posting items from a single posting list tuple. Saves + * the logical tuple that will be returned to scan first in passing. + * + * Saves an index item into so->currPos.items[itemIndex] for logical tuple + * that is returned to scan first. Second or subsequent heap TID for posting + * list should be saved by calling _bt_savepostingitem(). + */ +static void +_bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum, + ItemPointer heapTid, IndexTuple itup) +{ + BTScanPosItem *currItem = &so->currPos.items[itemIndex]; + + currItem->heapTid = *heapTid; + currItem->indexOffset = offnum; + + if (so->currTuples) + { + /* Save base IndexTuple (truncate posting list) */ + IndexTuple base; + Size itupsz = BTreeTupleGetPostingOffset(itup); + + itupsz = MAXALIGN(itupsz); + currItem->tupleOffset = so->currPos.nextTupleOffset; + base = (IndexTuple) (so->currTuples + so->currPos.nextTupleOffset); + memcpy(base, itup, itupsz); + /* Defensively reduce work area index tuple header size */ + base->t_info &= ~INDEX_SIZE_MASK; + base->t_info |= itupsz; + so->currPos.nextTupleOffset += itupsz; + so->currPos.postingTupleOffset = currItem->tupleOffset; + } +} + +/* + * Save an index item into so->currPos.items[itemIndex] for posting tuple. + * + * Assumes that _bt_setuppostingitems() has already been called for current + * posting list tuple. 
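+ *
+ * Note that only the heap TID and offset number are saved here: every
+ * logical tuple from the same posting list shares the single copy of the
+ * base tuple that _bt_setuppostingitems() stashed in so->currTuples.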
+ */ +static inline void +_bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, + ItemPointer heapTid) +{ + BTScanPosItem *currItem = &so->currPos.items[itemIndex]; + + currItem->heapTid = *heapTid; + currItem->indexOffset = offnum; + + /* + * Have index-only scans return the same base IndexTuple for every logical + * tuple that originates from the same posting list + */ + if (so->currTuples) + currItem->tupleOffset = so->currPos.postingTupleOffset; +} + /* * _bt_steppage() -- Step to next page containing valid data for scan * diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 1dd39a9535..b40559d45f 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -243,6 +243,7 @@ typedef struct BTPageState BlockNumber btps_blkno; /* block # to write this page at */ IndexTuple btps_lowkey; /* page's strict lower bound pivot tuple */ OffsetNumber btps_lastoff; /* last item offset loaded */ + Size btps_lastextra; /* last item's extra posting list space */ uint32 btps_level; /* tree level (0 = leaf) */ Size btps_full; /* "full" if less than this much free space */ struct BTPageState *btps_next; /* link to parent level, if any */ @@ -277,7 +278,10 @@ static void _bt_slideleft(Page page); static void _bt_sortaddtup(Page page, Size itemsize, IndexTuple itup, OffsetNumber itup_off); static void _bt_buildadd(BTWriteState *wstate, BTPageState *state, - IndexTuple itup); + IndexTuple itup, Size truncextra); +static void _bt_sort_dedup_finish_pending(BTWriteState *wstate, + BTPageState *state, + BTDedupState dstate); static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state); static void _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2); @@ -711,6 +715,7 @@ _bt_pagestate(BTWriteState *wstate, uint32 level) state->btps_lowkey = NULL; /* initialize lastoff so first item goes into P_FIRSTKEY */ state->btps_lastoff = P_HIKEY; + state->btps_lastextra = 0; state->btps_level = level; /* set "full" threshold based on level. See notes at head of file. */ if (level > 0) @@ -789,7 +794,8 @@ _bt_sortaddtup(Page page, } /*---------- - * Add an item to a disk page from the sort output. + * Add an item to a disk page from the sort output (or add a posting list + * item formed from the sort output). * * We must be careful to observe the page layout conventions of nbtsearch.c: * - rightmost pages start data items at P_HIKEY instead of at P_FIRSTKEY. @@ -821,14 +827,27 @@ _bt_sortaddtup(Page page, * the truncated high key at offset 1. * * 'last' pointer indicates the last offset added to the page. + * + * 'truncextra' is the size of the posting list in itup, if any. This + * information is stashed for the next call here, when we may benefit + * from considering the impact of truncating away the posting list on + * the page before deciding to finish the page off. Posting lists are + * often relatively large, so it is worth going to the trouble of + * accounting for the saving from truncating away the posting list of + * the tuple that becomes the high key (that may be the only way to + * get close to target free space on the page). Note that this is + * only used for the soft fillfactor-wise limit, not the critical hard + * limit. 
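+ *
+ * (For example, a tuple whose posting list holds 100 heap TIDs carries
+ * 600 bytes of TID array. If it is the last tuple added, the high key
+ * built from it is truncated down to its key values, so the soft limit
+ * is applied as though those 600 bytes were already free space.)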
*---------- */ static void -_bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup) +_bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup, + Size truncextra) { Page npage; BlockNumber nblkno; OffsetNumber last_off; + Size last_truncextra; Size pgspc; Size itupsz; bool isleaf; @@ -842,6 +861,8 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup) npage = state->btps_page; nblkno = state->btps_blkno; last_off = state->btps_lastoff; + last_truncextra = state->btps_lastextra; + state->btps_lastextra = truncextra; pgspc = PageGetFreeSpace(npage); itupsz = IndexTupleSize(itup); @@ -883,10 +904,10 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup) * page. Disregard fillfactor and insert on "full" current page if we * don't have the minimum number of items yet. (Note that we deliberately * assume that suffix truncation neither enlarges nor shrinks new high key - * when applying soft limit.) + * when applying soft limit, except when last tuple had a posting list.) */ if (pgspc < itupsz + (isleaf ? MAXALIGN(sizeof(ItemPointerData)) : 0) || - (pgspc < state->btps_full && last_off > P_FIRSTKEY)) + (pgspc + last_truncextra < state->btps_full && last_off > P_FIRSTKEY)) { /* * Finish off the page and write it out. @@ -944,11 +965,11 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup) * We don't try to bias our choice of split point to make it more * likely that _bt_truncate() can truncate away more attributes, * whereas the split point used within _bt_split() is chosen much - * more delicately. Suffix truncation is mostly useful because it - * improves space utilization for workloads with random - * insertions. It doesn't seem worthwhile to add logic for - * choosing a split point here for a benefit that is bound to be - * much smaller. + * more delicately. On the other hand, non-unique index builds + * usually deduplicate, which often results in every "physical" + * tuple on the page having distinct key values. When that + * happens, _bt_truncate() will never need to include a heap TID + * in the new high key. * * Overwrite the old item with new truncated high key directly. * oitup is already located at the physical beginning of tuple @@ -983,7 +1004,7 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup) Assert(BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) == 0 || !P_LEFTMOST((BTPageOpaque) PageGetSpecialPointer(opage))); BTreeInnerTupleSetDownLink(state->btps_lowkey, oblkno); - _bt_buildadd(wstate, state->btps_next, state->btps_lowkey); + _bt_buildadd(wstate, state->btps_next, state->btps_lowkey, 0); pfree(state->btps_lowkey); /* @@ -1045,6 +1066,47 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup) state->btps_lastoff = last_off; } +/* + * Finalize pending posting list tuple, and add it to the index. Final tuple + * is based on saved base tuple, and saved list of heap TIDs. + * + * This is almost like _bt_dedup_finish_pending(), but it adds a new tuple + * using _bt_buildadd() and does not maintain the intervals array. 
+ */
+static void
+_bt_sort_dedup_finish_pending(BTWriteState *wstate, BTPageState *state,
+							  BTDedupState dstate)
+{
+	IndexTuple	final;
+	Size		truncextra;
+
+	Assert(dstate->nitems > 0);
+	truncextra = 0;
+	if (dstate->nitems == 1)
+		final = dstate->base;
+	else
+	{
+		IndexTuple	postingtuple;
+
+		/* form a tuple with a posting list */
+		postingtuple = _bt_form_posting(dstate->base,
+										dstate->htids,
+										dstate->nhtids);
+		final = postingtuple;
+		/* Determine size of posting list */
+		truncextra = IndexTupleSize(final) -
+			BTreeTupleGetPostingOffset(final);
+	}
+
+	_bt_buildadd(wstate, state, final, truncextra);
+
+	if (dstate->nitems > 1)
+		pfree(final);
+	/* Don't maintain dedup_intervals array, or alltupsize */
+	dstate->nhtids = 0;
+	dstate->nitems = 0;
+}
+
 /*
  * Finish writing out the completed btree.
  */
@@ -1090,7 +1152,7 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
 			Assert(BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) == 0 ||
 				   !P_LEFTMOST(opaque));
 			BTreeInnerTupleSetDownLink(s->btps_lowkey, blkno);
-			_bt_buildadd(wstate, s->btps_next, s->btps_lowkey);
+			_bt_buildadd(wstate, s->btps_next, s->btps_lowkey, 0);
 			pfree(s->btps_lowkey);
 			s->btps_lowkey = NULL;
 		}
@@ -1111,7 +1173,8 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
 	 * by filling in a valid magic number in the metapage.
 	 */
 	metapage = (Page) palloc(BLCKSZ);
-	_bt_initmetapage(metapage, rootblkno, rootlevel);
+	_bt_initmetapage(metapage, rootblkno, rootlevel,
+					 wstate->inskey->safededup);
 	_bt_blwritepage(wstate, metapage, BTREE_METAPAGE);
 }

@@ -1132,6 +1195,9 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
 				keysz = IndexRelationGetNumberOfKeyAttributes(wstate->index);
 	SortSupport sortKeys;
 	int64		tuples_done = 0;
+	bool		deduplicate;
+
+	deduplicate = wstate->inskey->safededup && BTGetUseDedup(wstate->index);

 	if (merge)
 	{
@@ -1228,12 +1294,12 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)

 			if (load1)
 			{
-				_bt_buildadd(wstate, state, itup);
+				_bt_buildadd(wstate, state, itup, 0);
 				itup = tuplesort_getindextuple(btspool->sortstate, true);
 			}
 			else
 			{
-				_bt_buildadd(wstate, state, itup2);
+				_bt_buildadd(wstate, state, itup2, 0);
 				itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
 			}

@@ -1243,9 +1309,113 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
 		}
 		pfree(sortKeys);
 	}
+	else if (deduplicate)
+	{
+		/* merge is unnecessary, deduplicate into posting lists */
+		BTDedupState dstate;
+		IndexTuple	newbase;
+
+		dstate = (BTDedupState) palloc(sizeof(BTDedupStateData));
+		dstate->maxitemsize = 0;	/* set later */
+		dstate->checkingunique = false; /* unused */
+		dstate->skippedbase = InvalidOffsetNumber;
+		dstate->newitem = NULL;
+		/* Metadata about current pending posting list */
+		dstate->htids = NULL;
+		dstate->nhtids = 0;
+		dstate->nitems = 0;
+		dstate->overlap = false;
+		dstate->alltupsize = 0; /* unused */
+		/* Metadata about base tuple of current pending posting list */
+		dstate->base = NULL;
+		dstate->baseoff = InvalidOffsetNumber;	/* unused */
+		dstate->basetupsize = 0;
+
+		while ((itup = tuplesort_getindextuple(btspool->sortstate,
+											   true)) != NULL)
+		{
+			/* When we see first tuple, create first index page */
+			if (state == NULL)
+			{
+				state = _bt_pagestate(wstate, 0);
+
+				/*
+				 * Limit size of posting list tuples to the size of the free
+				 * space we want to leave behind on the page, plus space for
+				 * final item's line pointer (but make sure that posting list
+				 * tuple size won't exceed the generic 1/3 of a page limit).
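+				 *
+				 * (Illustrative arithmetic, assuming BLCKSZ of 8192, the
+				 * default leaf fillfactor of 90, and a BTMaxItemSize() of
+				 * 2704: btps_full starts out as 819 bytes, so maxitemsize
+				 * works out to Min(2704, MAXALIGN_DOWN(819) - 4) = 812
+				 * bytes, or roughly 130 heap TIDs per posting list tuple.)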
+ * + * This is more conservative than the approach taken in the + * retail insert path, but it allows us to get most of the + * space savings deduplication provides without noticeably + * impacting how much free space is left behind on each leaf + * page. + */ + dstate->maxitemsize = + Min(BTMaxItemSize(state->btps_page), + MAXALIGN_DOWN(state->btps_full) - sizeof(ItemIdData)); + /* Minimum posting tuple size used here is arbitrary: */ + dstate->maxitemsize = Max(dstate->maxitemsize, 100); + dstate->htids = palloc(dstate->maxitemsize); + + /* + * No previous/base tuple, since itup is the first item + * returned by the tuplesort -- use itup as base tuple of + * first pending posting list for entire index build + */ + newbase = CopyIndexTuple(itup); + _bt_dedup_start_pending(dstate, newbase, InvalidOffsetNumber); + } + else if (_bt_keep_natts_fast(wstate->index, dstate->base, + itup) > keysz && + _bt_dedup_save_htid(dstate, itup)) + { + /* + * Tuple is equal to base tuple of pending posting list, and + * merging itup into pending posting list won't exceed the + * maxitemsize limit. Heap TID(s) for itup have been saved in + * state. The next iteration will also end up here if it's + * possible to merge the next tuple into the same pending + * posting list. + */ + } + else + { + /* + * Tuple is not equal to pending posting list tuple, or + * maxitemsize limit was reached + */ + _bt_sort_dedup_finish_pending(wstate, state, dstate); + /* Base tuple is always a copy */ + pfree(dstate->base); + + /* itup starts new pending posting list */ + newbase = CopyIndexTuple(itup); + _bt_dedup_start_pending(dstate, newbase, InvalidOffsetNumber); + } + + /* Report progress */ + pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE, + ++tuples_done); + } + + /* + * Handle the last item (there must be a last item when the tuplesort + * returned one or more tuples) + */ + if (state) + { + _bt_sort_dedup_finish_pending(wstate, state, dstate); + /* Base tuple is always a copy */ + pfree(dstate->base); + pfree(dstate->htids); + } + + pfree(dstate); + } else { - /* merge is unnecessary */ + /* merging and deduplication are both unnecessary */ while ((itup = tuplesort_getindextuple(btspool->sortstate, true)) != NULL) { @@ -1253,7 +1423,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) if (state == NULL) state = _bt_pagestate(wstate, 0); - _bt_buildadd(wstate, state, itup); + _bt_buildadd(wstate, state, itup, 0); /* Report progress */ pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE, diff --git a/src/backend/access/nbtree/nbtsplitloc.c b/src/backend/access/nbtree/nbtsplitloc.c index 29167f1ef5..ffec42e78a 100644 --- a/src/backend/access/nbtree/nbtsplitloc.c +++ b/src/backend/access/nbtree/nbtsplitloc.c @@ -51,6 +51,7 @@ typedef struct Size newitemsz; /* size of newitem (includes line pointer) */ bool is_leaf; /* T if splitting a leaf page */ bool is_rightmost; /* T if splitting rightmost page on level */ + bool is_deduped; /* T if posting list truncation expected */ OffsetNumber newitemoff; /* where the new item is to be inserted */ int leftspace; /* space available for items on left page */ int rightspace; /* space available for items on right page */ @@ -177,12 +178,16 @@ _bt_findsplitloc(Relation rel, state.newitemsz = newitemsz; state.is_leaf = P_ISLEAF(opaque); state.is_rightmost = P_RIGHTMOST(opaque); + state.is_deduped = state.is_leaf && BTGetUseDedup(rel); state.leftspace = leftspace; state.rightspace = rightspace; state.olddataitemstotal = olddataitemstotal; 
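	/*
	 * A sketch, paraphrasing the accounting that _bt_recsplitloc() performs
	 * below when is_deduped is set and the would-be first tuple on the
	 * right page turns out to be a posting list tuple: the posting list
	 * bytes are credited back against the space conservatively reserved
	 * for the new high key on the left page.
	 *
	 *		postingsz = IndexTupleSize(newhighkey) -
	 *			BTreeTupleGetPostingOffset(newhighkey);
	 *		leftfree -= (int16) ((firstrightitemsz - postingsz) +
	 *							 MAXALIGN(sizeof(ItemPointerData)));
	 */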
state.minfirstrightsz = SIZE_MAX; state.newitemoff = newitemoff; + /* newitem cannot be a posting list item */ + Assert(!BTreeTupleIsPosting(newitem)); + /* * maxsplits should never exceed maxoff because there will be at most as * many candidate split points as there are points _between_ tuples, once @@ -459,6 +464,7 @@ _bt_recsplitloc(FindSplitData *state, int16 leftfree, rightfree; Size firstrightitemsz; + Size postingsz = 0; bool newitemisfirstonright; /* Is the new item going to be the first item on the right page? */ @@ -468,8 +474,31 @@ _bt_recsplitloc(FindSplitData *state, if (newitemisfirstonright) firstrightitemsz = state->newitemsz; else + { firstrightitemsz = firstoldonrightsz; + /* + * Calculate suffix truncation space saving when firstright is a + * posting list tuple. + * + * Individual posting lists often take up a significant fraction of + * all space on a page. Failing to consider that the new high key + * won't need to store the posting list a second time really matters. + */ + if (state->is_leaf && state->is_deduped) + { + ItemId itemid; + IndexTuple newhighkey; + + itemid = PageGetItemId(state->page, firstoldonright); + newhighkey = (IndexTuple) PageGetItem(state->page, itemid); + + if (BTreeTupleIsPosting(newhighkey)) + postingsz = IndexTupleSize(newhighkey) - + BTreeTupleGetPostingOffset(newhighkey); + } + } + /* Account for all the old tuples */ leftfree = state->leftspace - olddataitemstoleft; rightfree = state->rightspace - @@ -492,9 +521,11 @@ _bt_recsplitloc(FindSplitData *state, * adding a heap TID to the left half's new high key when splitting at the * leaf level. In practice the new high key will often be smaller and * will rarely be larger, but conservatively assume the worst case. + * Truncation always truncates away any posting list that appears in the + * first right tuple, though, so it's safe to subtract that overhead. */ if (state->is_leaf) - leftfree -= (int16) (firstrightitemsz + + leftfree -= (int16) ((firstrightitemsz - postingsz) + MAXALIGN(sizeof(ItemPointerData))); else leftfree -= (int16) firstrightitemsz; @@ -691,7 +722,8 @@ _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff, itemid = PageGetItemId(state->page, OffsetNumberPrev(state->newitemoff)); tup = (IndexTuple) PageGetItem(state->page, itemid); /* Do cheaper test first */ - if (!_bt_adjacenthtid(&tup->t_tid, &state->newitem->t_tid)) + if (BTreeTupleIsPosting(tup) || + !_bt_adjacenthtid(&tup->t_tid, &state->newitem->t_tid)) return false; /* Check same conditions as rightmost item case, too */ keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem); diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index ee972a1465..cb6a5b9335 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -20,6 +20,7 @@ #include "access/nbtree.h" #include "access/reloptions.h" #include "access/relscan.h" +#include "catalog/catalog.h" #include "commands/progress.h" #include "lib/qunique.h" #include "miscadmin.h" @@ -98,8 +99,6 @@ _bt_mkscankey(Relation rel, IndexTuple itup) indoption = rel->rd_indoption; tupnatts = itup ? BTreeTupleGetNAtts(itup, rel) : 0; - Assert(tupnatts <= IndexRelationGetNumberOfAttributes(rel)); - /* * We'll execute search using scan key constructed on key columns. 
* Truncated attributes and non-key attributes are omitted from the final @@ -108,12 +107,25 @@ _bt_mkscankey(Relation rel, IndexTuple itup) key = palloc(offsetof(BTScanInsertData, scankeys) + sizeof(ScanKeyData) * indnkeyatts); key->heapkeyspace = itup == NULL || _bt_heapkeyspace(rel); + key->safededup = itup == NULL ? _bt_opclasses_support_dedup(rel) : + _bt_safededup(rel); key->anynullkeys = false; /* initial assumption */ key->nextkey = false; key->pivotsearch = false; + key->scantid = NULL; key->keysz = Min(indnkeyatts, tupnatts); - key->scantid = key->heapkeyspace && itup ? - BTreeTupleGetHeapTID(itup) : NULL; + + Assert(tupnatts <= IndexRelationGetNumberOfAttributes(rel)); + Assert(!itup || !BTreeTupleIsPosting(itup) || key->heapkeyspace); + + /* + * When caller passes a tuple with a heap TID, use it to set scantid. Note + * that this handles posting list tuples by setting scantid to the lowest + * heap TID in the posting list. + */ + if (itup && key->heapkeyspace) + key->scantid = BTreeTupleGetHeapTID(itup); + skey = key->scankeys; for (i = 0; i < indnkeyatts; i++) { @@ -1373,6 +1385,7 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts, * attribute passes the qual. */ Assert(ScanDirectionIsForward(dir)); + Assert(BTreeTupleIsPivot(tuple)); continue; } @@ -1534,6 +1547,7 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, * attribute passes the qual. */ Assert(ScanDirectionIsForward(dir)); + Assert(BTreeTupleIsPivot(tuple)); cmpresult = 0; if (subkey->sk_flags & SK_ROW_END) break; @@ -1773,10 +1787,35 @@ _bt_killitems(IndexScanDesc scan) { ItemId iid = PageGetItemId(page, offnum); IndexTuple ituple = (IndexTuple) PageGetItem(page, iid); + bool killtuple = false; - if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid)) + if (BTreeTupleIsPosting(ituple)) { - /* found the item */ + int pi = i + 1; + int nposting = BTreeTupleGetNPosting(ituple); + int j; + + for (j = 0; j < nposting; j++) + { + ItemPointer item = BTreeTupleGetPostingN(ituple, j); + + if (!ItemPointerEquals(item, &kitem->heapTid)) + break; /* out of posting list loop */ + + /* Read-ahead to later kitems */ + if (pi < numKilled) + kitem = &so->currPos.items[so->killedItems[pi++]]; + } + + if (j == nposting) + killtuple = true; + } + else if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid)) + killtuple = true; + + if (killtuple) + { + /* found the item/all posting list items */ ItemIdMarkDead(iid); killedsomething = true; break; /* out of inner search loop */ @@ -2017,7 +2056,9 @@ btoptions(Datum reloptions, bool validate) static const relopt_parse_elt tab[] = { {"fillfactor", RELOPT_TYPE_INT, offsetof(BTOptions, fillfactor)}, {"vacuum_cleanup_index_scale_factor", RELOPT_TYPE_REAL, - offsetof(BTOptions, vacuum_cleanup_index_scale_factor)} + offsetof(BTOptions, vacuum_cleanup_index_scale_factor)}, + {"deduplication", RELOPT_TYPE_BOOL, + offsetof(BTOptions, deduplication)} }; @@ -2138,6 +2179,24 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, pivot = index_truncate_tuple(itupdesc, firstright, keepnatts); + if (BTreeTupleIsPosting(firstright)) + { + BTreeTupleClearBtIsPosting(pivot); + BTreeTupleSetNAtts(pivot, keepnatts); + if (keepnatts == natts) + { + /* + * index_truncate_tuple() just returned a copy of the + * original, so make sure that the size of the new pivot tuple + * doesn't have posting list overhead + */ + pivot->t_info &= ~INDEX_SIZE_MASK; + pivot->t_info |= MAXALIGN(BTreeTupleGetPostingOffset(firstright)); + } + } + + 
Assert(!BTreeTupleIsPosting(pivot)); + /* * If there is a distinguishing key attribute within new pivot tuple, * there is no need to add an explicit heap TID attribute @@ -2154,6 +2213,8 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, * attribute to the new pivot tuple. */ Assert(natts != nkeyatts); + Assert(!BTreeTupleIsPosting(lastleft) && + !BTreeTupleIsPosting(firstright)); newsize = IndexTupleSize(pivot) + MAXALIGN(sizeof(ItemPointerData)); tidpivot = palloc0(newsize); memcpy(tidpivot, pivot, IndexTupleSize(pivot)); @@ -2161,6 +2222,24 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, pfree(pivot); pivot = tidpivot; } + else if (BTreeTupleIsPosting(firstright)) + { + /* + * No truncation was possible, since key attributes are all equal. We + * can always truncate away a posting list, though. + * + * It's necessary to add a heap TID attribute to the new pivot tuple. + */ + newsize = MAXALIGN(BTreeTupleGetPostingOffset(firstright)) + + MAXALIGN(sizeof(ItemPointerData)); + pivot = palloc0(newsize); + memcpy(pivot, firstright, BTreeTupleGetPostingOffset(firstright)); + + pivot->t_info &= ~INDEX_SIZE_MASK; + pivot->t_info |= newsize; + BTreeTupleClearBtIsPosting(pivot); + BTreeTupleSetAltHeapTID(pivot); + } else { /* @@ -2186,6 +2265,7 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, * nbtree (e.g., there is no pg_attribute entry). */ Assert(itup_key->heapkeyspace); + Assert(!BTreeTupleIsPosting(pivot)); pivot->t_info &= ~INDEX_SIZE_MASK; pivot->t_info |= newsize; @@ -2198,7 +2278,7 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, */ pivotheaptid = (ItemPointer) ((char *) pivot + newsize - sizeof(ItemPointerData)); - ItemPointerCopy(&lastleft->t_tid, pivotheaptid); + ItemPointerCopy(BTreeTupleGetMaxHeapTID(lastleft), pivotheaptid); /* * Lehman and Yao require that the downlink to the right page, which is to @@ -2209,9 +2289,12 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, * tiebreaker. */ #ifndef DEBUG_NO_TRUNCATE - Assert(ItemPointerCompare(&lastleft->t_tid, &firstright->t_tid) < 0); - Assert(ItemPointerCompare(pivotheaptid, &lastleft->t_tid) >= 0); - Assert(ItemPointerCompare(pivotheaptid, &firstright->t_tid) < 0); + Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(lastleft), + BTreeTupleGetHeapTID(firstright)) < 0); + Assert(ItemPointerCompare(pivotheaptid, + BTreeTupleGetHeapTID(lastleft)) >= 0); + Assert(ItemPointerCompare(pivotheaptid, + BTreeTupleGetHeapTID(firstright)) < 0); #else /* @@ -2224,7 +2307,7 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, * attribute values along with lastleft's heap TID value when lastleft's * TID happens to be greater than firstright's TID. */ - ItemPointerCopy(&firstright->t_tid, pivotheaptid); + ItemPointerCopy(BTreeTupleGetHeapTID(firstright), pivotheaptid); /* * Pivot heap TID should never be fully equal to firstright. 
Note that @@ -2233,7 +2316,8 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, */ ItemPointerSetOffsetNumber(pivotheaptid, OffsetNumberPrev(ItemPointerGetOffsetNumber(pivotheaptid))); - Assert(ItemPointerCompare(pivotheaptid, &firstright->t_tid) < 0); + Assert(ItemPointerCompare(pivotheaptid, + BTreeTupleGetHeapTID(firstright)) < 0); #endif BTreeTupleSetNAtts(pivot, nkeyatts); @@ -2314,13 +2398,16 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, * The approach taken here usually provides the same answer as _bt_keep_natts * will (for the same pair of tuples from a heapkeyspace index), since the * majority of btree opclasses can never indicate that two datums are equal - * unless they're bitwise equal after detoasting. + * unless they're bitwise equal after detoasting. When an index is considered + * deduplication-safe by _bt_opclasses_support_dedup, routine is guaranteed to + * give the same result as _bt_keep_natts would. * - * These issues must be acceptable to callers, typically because they're only - * concerned about making suffix truncation as effective as possible without - * leaving excessive amounts of free space on either side of page split. - * Callers can rely on the fact that attributes considered equal here are - * definitely also equal according to _bt_keep_natts. + * Suffix truncation callers can rely on the fact that attributes considered + * equal here are definitely also equal according to _bt_keep_natts, even when + * the index uses an opclass or collation that is not deduplication-safe. + * This weaker guarantee is good enough for these callers, since false + * negatives generally only have the effect of making leaf page splits use a + * more balanced split point. */ int _bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright) @@ -2398,22 +2485,30 @@ _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum) itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); tupnatts = BTreeTupleGetNAtts(itup, rel); + /* !heapkeyspace indexes do not support deduplication */ + if (!heapkeyspace && BTreeTupleIsPosting(itup)) + return false; + + /* INCLUDE indexes do not support deduplication */ + if (natts != nkeyatts && BTreeTupleIsPosting(itup)) + return false; + if (P_ISLEAF(opaque)) { if (offnum >= P_FIRSTDATAKEY(opaque)) { /* - * Non-pivot tuples currently never use alternative heap TID - * representation -- even those within heapkeyspace indexes + * Non-pivot tuple should never be explicitly marked as a pivot + * tuple */ - if ((itup->t_info & INDEX_ALT_TID_MASK) != 0) + if (BTreeTupleIsPivot(itup)) return false; /* * Leaf tuples that are not the page high key (non-pivot tuples) * should never be truncated. (Note that tupnatts must have been - * inferred, rather than coming from an explicit on-disk - * representation.) + * inferred, even with a posting list tuple, because only pivot + * tuples store tupnatts directly.) */ return tupnatts == natts; } @@ -2457,12 +2552,12 @@ _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum) * non-zero, or when there is no explicit representation and the * tuple is evidently not a pre-pg_upgrade tuple. * - * Prior to v11, downlinks always had P_HIKEY as their offset. Use - * that to decide if the tuple is a pre-v11 tuple. + * Prior to v11, downlinks always had P_HIKEY as their offset. + * Accept that as an alternative indication of a valid + * !heapkeyspace negative infinity tuple. 
 		 */
 		return tupnatts == 0 ||
-			((itup->t_info & INDEX_ALT_TID_MASK) == 0 &&
-			 ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY);
+			ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY;
 	}
 	else
 	{
@@ -2488,7 +2583,11 @@ _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
 	 * heapkeyspace index pivot tuples, regardless of whether or not there are
 	 * non-key attributes.
 	 */
-	if ((itup->t_info & INDEX_ALT_TID_MASK) == 0)
+	if (!BTreeTupleIsPivot(itup))
+		return false;
+
+	/* Pivot tuple should not use posting list representation (redundant) */
+	if (BTreeTupleIsPosting(itup))
 		return false;

 	/*
@@ -2558,11 +2657,54 @@ _bt_check_third_page(Relation rel, Relation heap, bool needheaptidspace,
 					BTMaxItemSizeNoHeapTid(page),
 					RelationGetRelationName(rel)),
 			 errdetail("Index row references tuple (%u,%u) in relation \"%s\".",
-					   ItemPointerGetBlockNumber(&newtup->t_tid),
-					   ItemPointerGetOffsetNumber(&newtup->t_tid),
+					   ItemPointerGetBlockNumber(BTreeTupleGetHeapTID(newtup)),
+					   ItemPointerGetOffsetNumber(BTreeTupleGetHeapTID(newtup)),
 					   RelationGetRelationName(heap)),
 			 errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
 					 "Consider a function index of an MD5 hash of the value, "
 					 "or use full text indexing."),
 			 errtableconstraint(heap,
								RelationGetRelationName(rel))));
}

+/*
+ * Is it safe to perform deduplication for an index, given the opclasses and
+ * collations used?
+ *
+ * Returned value is stored in index metapage during index builds.  Function
+ * does not account for incompatibilities caused by the index being on an
+ * earlier nbtree version.
+ */
+bool
+_bt_opclasses_support_dedup(Relation index)
+{
+	/* INCLUDE indexes don't support deduplication */
+	if (IndexRelationGetNumberOfAttributes(index) !=
+		IndexRelationGetNumberOfKeyAttributes(index))
+		return false;
+
+	/*
+	 * There is no reason why deduplication cannot be used with system catalog
+	 * indexes.  However, we deem it generally unsafe because it's not clear
+	 * how it could be disabled.  (ALTER INDEX is not supported with system
+	 * catalog indexes, so users have no way to set the "deduplication"
+	 * storage parameter.)
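+	 *
+	 * (A sketch of how the result computed here is expected to be combined
+	 * with the user-visible settings: nbtsort.c decides whether a CREATE
+	 * INDEX will deduplicate using
+	 *
+	 *		deduplicate = wstate->inskey->safededup &&
+	 *			BTGetUseDedup(wstate->index);
+	 *
+	 * where safededup carries this function's result, and BTGetUseDedup()
+	 * tests the "deduplication" reloption/GUC.)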
+ */ + if (IsCatalogRelation(index)) + return false; + + for (int i = 0; i < IndexRelationGetNumberOfKeyAttributes(index); i++) + { + Oid opfamily = index->rd_opfamily[i]; + Oid collation = index->rd_indcollation[i]; + + /* TODO add adequate check of opclasses and collations */ + elog(DEBUG4, "index %s column i %d opfamilyOid %u collationOid %u", + RelationGetRelationName(index), i, opfamily, collation); + + /* NUMERIC btree opfamily OID is 1988 */ + if (opfamily == 1988) + return false; + } + + return true; +} diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index 72a601bb22..191ab63a9b 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -22,6 +22,9 @@ #include "access/xlogutils.h" #include "miscadmin.h" #include "storage/procarray.h" +#include "utils/memutils.h" + +static MemoryContext opCtx; /* working memory for operations */ /* * _bt_restore_page -- re-enter all the index tuples on a page @@ -111,6 +114,7 @@ _bt_restore_meta(XLogReaderState *record, uint8 block_id) Assert(md->btm_version >= BTREE_NOVAC_VERSION); md->btm_oldest_btpo_xact = xlrec->oldest_btpo_xact; md->btm_last_cleanup_num_heap_tuples = xlrec->last_cleanup_num_heap_tuples; + md->btm_safededup = xlrec->btm_safededup; pageop = (BTPageOpaque) PageGetSpecialPointer(metapg); pageop->btpo_flags = BTP_META; @@ -156,7 +160,8 @@ _bt_clear_incomplete_split(XLogReaderState *record, uint8 block_id) } static void -btree_xlog_insert(bool isleaf, bool ismeta, XLogReaderState *record) +btree_xlog_insert(bool isleaf, bool ismeta, bool posting, + XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; xl_btree_insert *xlrec = (xl_btree_insert *) XLogRecGetData(record); @@ -181,9 +186,52 @@ btree_xlog_insert(bool isleaf, bool ismeta, XLogReaderState *record) page = BufferGetPage(buffer); - if (PageAddItem(page, (Item) datapos, datalen, xlrec->offnum, - false, false) == InvalidOffsetNumber) - elog(PANIC, "btree_xlog_insert: failed to add item"); + if (likely(!posting)) + { + /* Simple retail insertion */ + if (PageAddItem(page, (Item) datapos, datalen, xlrec->offnum, + false, false) == InvalidOffsetNumber) + elog(PANIC, "btree_xlog_insert: failed to add item"); + } + else + { + ItemId itemid; + IndexTuple oposting, + newitem, + nposting; + uint16 postingoff; + + /* + * A posting list split occurred during leaf page insertion. WAL + * record data will start with an offset number representing the + * point in an existing posting list that a split occurs at. + * + * Use _bt_swap_posting() to repeat posting list split steps from + * primary. Note that newitem from WAL record is 'orignewitem', + * not the final version of newitem that is actually inserted on + * page. 
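+		 *
+		 * (To illustrate with hypothetical TIDs and postingoff = 2: if the
+		 * existing posting list holds TIDs (A, B, D) and orignewitem's TID
+		 * is C, the replacement posting list becomes (A, B, C), while
+		 * newitem is left carrying TID D -- the posting list's former
+		 * maximum TID.  The item added at xlrec->offnum is then the same
+		 * "final" newitem that was inserted on the primary.)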
+		 */
+		postingoff = *((uint16 *) datapos);
+		datapos += sizeof(uint16);
+		datalen -= sizeof(uint16);
+
+		itemid = PageGetItemId(page, OffsetNumberPrev(xlrec->offnum));
+		oposting = (IndexTuple) PageGetItem(page, itemid);
+
+		/* newitem must be mutable copy for _bt_swap_posting() */
+		Assert(isleaf && postingoff > 0);
+		newitem = CopyIndexTuple((IndexTuple) datapos);
+		nposting = _bt_swap_posting(newitem, oposting, postingoff);
+
+		/* Replace existing posting list with post-split version */
+		memcpy(oposting, nposting, MAXALIGN(IndexTupleSize(nposting)));
+
+		/* insert "final" new item (not orignewitem from WAL stream) */
+		Assert(IndexTupleSize(newitem) == datalen);
+		if (PageAddItem(page, (Item) newitem, datalen, xlrec->offnum,
+						false, false) == InvalidOffsetNumber)
+			elog(PANIC, "btree_xlog_insert: failed to add posting split new item");
+	}

 		PageSetLSN(page, lsn);
 		MarkBufferDirty(buffer);
@@ -265,20 +313,38 @@ btree_xlog_split(bool onleft, XLogReaderState *record)
 	BTPageOpaque lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage);
 	OffsetNumber off;
 	IndexTuple	newitem = NULL,
-				left_hikey = NULL;
+				left_hikey = NULL,
+				nposting = NULL;
 	Size		newitemsz = 0,
 				left_hikeysz = 0;
 	Page		newlpage;
-	OffsetNumber leftoff;
+	OffsetNumber leftoff,
+				replacepostingoff = InvalidOffsetNumber;

 	datapos = XLogRecGetBlockData(record, 0, &datalen);

-	if (onleft)
+	if (onleft || xlrec->postingoff != 0)
 	{
 		newitem = (IndexTuple) datapos;
 		newitemsz = MAXALIGN(IndexTupleSize(newitem));
 		datapos += newitemsz;
 		datalen -= newitemsz;
+
+		if (xlrec->postingoff != 0)
+		{
+			ItemId		itemid;
+			IndexTuple	oposting;
+
+			/* Posting list must be at offset number before new item's */
+			replacepostingoff = OffsetNumberPrev(xlrec->newitemoff);
+
+			/* newitem must be mutable copy for _bt_swap_posting() */
+			newitem = CopyIndexTuple(newitem);
+			itemid = PageGetItemId(lpage, replacepostingoff);
+			oposting = (IndexTuple) PageGetItem(lpage, itemid);
+			nposting = _bt_swap_posting(newitem, oposting,
+										xlrec->postingoff);
+		}
 	}

 	/* Extract left hikey and its size (assuming 16-bit alignment) */
@@ -304,8 +370,20 @@ btree_xlog_split(bool onleft, XLogReaderState *record)
 		Size		itemsz;
 		IndexTuple	item;

+		/* Add replacement posting list when required */
+		if (off == replacepostingoff)
+		{
+			Assert(onleft || xlrec->firstright == xlrec->newitemoff);
+			if (PageAddItem(newlpage, (Item) nposting,
+							MAXALIGN(IndexTupleSize(nposting)), leftoff,
+							false, false) == InvalidOffsetNumber)
+				elog(ERROR, "failed to add new posting list item to left page after split");
+			leftoff = OffsetNumberNext(leftoff);
+			continue;
+		}
+
 		/* add the new item if it was inserted on left page */
-		if (onleft && off == xlrec->newitemoff)
+		else if (onleft && off == xlrec->newitemoff)
 		{
 			if (PageAddItem(newlpage, (Item) newitem, newitemsz, leftoff,
 							false, false) == InvalidOffsetNumber)
@@ -379,6 +457,84 @@ btree_xlog_split(bool onleft, XLogReaderState *record)
 	}
 }

+static void
+btree_xlog_dedup(XLogReaderState *record)
+{
+	XLogRecPtr	lsn = record->EndRecPtr;
+	Buffer		buf;
+	xl_btree_dedup *xlrec = (xl_btree_dedup *) XLogRecGetData(record);
+
+	if (XLogReadBufferForRedo(record, 0, &buf) == BLK_NEEDS_REDO)
+	{
+		/*
+		 * Replay the deduplication of a single interval of the page: merge
+		 * the nitems existing tuples that start at baseoff into one posting
+		 * list tuple, reusing the same pending-posting-list state machine
+		 * as the primary.
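+		 *
+		 * (For example, with hypothetical values baseoff = 5 and nitems =
+		 * 3, the loop below folds the heap TIDs of the tuples at offsets
+		 * 5, 6 and 7 into one pending posting list, and
+		 * _bt_dedup_finish_pending() then replaces those items on the page
+		 * with a single posting list tuple.)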
+		 */
+		Page		page = (Page) BufferGetPage(buf);
+		OffsetNumber offnum;
+		BTDedupState state;
+
+		state = (BTDedupState) palloc(sizeof(BTDedupStateData));
+
+		state->maxitemsize = BTMaxItemSize(page);
+		state->checkingunique = false;	/* unused */
+		state->skippedbase = InvalidOffsetNumber;
+		state->newitem = NULL;
+		/* Metadata about current pending posting list */
+		state->htids = NULL;
+		state->nhtids = 0;
+		state->nitems = 0;
+		state->alltupsize = 0;
+		state->overlap = false;
+		/* Metadata about base tuple of current pending posting list */
+		state->base = NULL;
+		state->baseoff = InvalidOffsetNumber;
+		state->basetupsize = 0;
+
+		/* Conservatively size array */
+		state->htids = palloc(state->maxitemsize);
+
+		/*
+		 * Iterate over tuples on the page belonging to the interval to
+		 * deduplicate them into a posting list.
+		 */
+		for (offnum = xlrec->baseoff;
+			 offnum < xlrec->baseoff + xlrec->nitems;
+			 offnum = OffsetNumberNext(offnum))
+		{
+			ItemId		itemid = PageGetItemId(page, offnum);
+			IndexTuple	itup = (IndexTuple) PageGetItem(page, itemid);
+
+			Assert(!ItemIdIsDead(itemid));
+
+			if (offnum == xlrec->baseoff)
+			{
+				/*
+				 * No previous/base tuple for first data item -- use first
+				 * data item as base tuple of first pending posting list
+				 */
+				_bt_dedup_start_pending(state, itup, offnum);
+			}
+			else
+			{
+				/* Heap TID(s) for itup will be saved in state */
+				if (!_bt_dedup_save_htid(state, itup))
+					elog(ERROR, "could not add heap tid to pending posting list");
+			}
+		}
+
+		Assert(state->nitems == xlrec->nitems);
+		/* Handle the last item */
+		_bt_dedup_finish_pending(buf, state, false);
+
+		PageSetLSN(page, lsn);
+		MarkBufferDirty(buf);
+	}
+
+	if (BufferIsValid(buf))
+		UnlockReleaseBuffer(buf);
+}
+
 static void
 btree_xlog_vacuum(XLogReaderState *record)
 {
@@ -395,7 +551,38 @@ btree_xlog_vacuum(XLogReaderState *record)

 		page = (Page) BufferGetPage(buffer);

-		PageIndexMultiDelete(page, (OffsetNumber *) ptr, xlrec->ndeleted);
+		/*
+		 * Must update posting list tuples before deleting whole items, since
+		 * offset numbers are based on original page contents
+		 */
+		if (xlrec->nupdated > 0)
+		{
+			OffsetNumber *updatedoffsets;
+			IndexTuple	updated;
+			Size		itemsz;
+
+			updatedoffsets = (OffsetNumber *)
+				(ptr + xlrec->ndeleted * sizeof(OffsetNumber));
+			updated = (IndexTuple) ((char *) updatedoffsets +
+									xlrec->nupdated * sizeof(OffsetNumber));
+
+			/* Handle posting tuples */
+			for (int i = 0; i < xlrec->nupdated; i++)
+			{
+				PageIndexTupleDelete(page, updatedoffsets[i]);
+
+				itemsz = MAXALIGN(IndexTupleSize(updated));
+
+				if (PageAddItem(page, (Item) updated, itemsz, updatedoffsets[i],
+								false, false) == InvalidOffsetNumber)
+					elog(PANIC, "failed to add updated posting list item");
+
+				updated = (IndexTuple) ((char *) updated + itemsz);
+			}
+		}
+
+		if (xlrec->ndeleted)
+			PageIndexMultiDelete(page, (OffsetNumber *) ptr, xlrec->ndeleted);

 		/*
 		 * Mark the page as not containing any LP_DEAD items --- see comments
@@ -729,17 +916,22 @@ void
 btree_redo(XLogReaderState *record)
 {
 	uint8		info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
+	MemoryContext oldCtx;

+	oldCtx = MemoryContextSwitchTo(opCtx);
 	switch (info)
 	{
 		case XLOG_BTREE_INSERT_LEAF:
-			btree_xlog_insert(true, false, record);
+			btree_xlog_insert(true, false, false, record);
 			break;
 		case XLOG_BTREE_INSERT_UPPER:
-			btree_xlog_insert(false, false, record);
+			btree_xlog_insert(false, false, false, record);
 			break;
 		case XLOG_BTREE_INSERT_META:
-			btree_xlog_insert(false, true, record);
+			btree_xlog_insert(false, true, false, record);
+			break;
+		case
XLOG_BTREE_INSERT_POST: + btree_xlog_insert(true, false, true, record); break; case XLOG_BTREE_SPLIT_L: btree_xlog_split(true, record); @@ -747,6 +939,9 @@ btree_redo(XLogReaderState *record) case XLOG_BTREE_SPLIT_R: btree_xlog_split(false, record); break; + case XLOG_BTREE_DEDUP_PAGE: + btree_xlog_dedup(record); + break; case XLOG_BTREE_VACUUM: btree_xlog_vacuum(record); break; @@ -772,6 +967,23 @@ btree_redo(XLogReaderState *record) default: elog(PANIC, "btree_redo: unknown op code %u", info); } + MemoryContextSwitchTo(oldCtx); + MemoryContextReset(opCtx); +} + +void +btree_xlog_startup(void) +{ + opCtx = AllocSetContextCreate(CurrentMemoryContext, + "Btree recovery temporary context", + ALLOCSET_DEFAULT_SIZES); +} + +void +btree_xlog_cleanup(void) +{ + MemoryContextDelete(opCtx); + opCtx = NULL; } /* diff --git a/src/backend/access/rmgrdesc/nbtdesc.c b/src/backend/access/rmgrdesc/nbtdesc.c index 497f8dc77e..23e951aa9e 100644 --- a/src/backend/access/rmgrdesc/nbtdesc.c +++ b/src/backend/access/rmgrdesc/nbtdesc.c @@ -27,6 +27,7 @@ btree_desc(StringInfo buf, XLogReaderState *record) case XLOG_BTREE_INSERT_LEAF: case XLOG_BTREE_INSERT_UPPER: case XLOG_BTREE_INSERT_META: + case XLOG_BTREE_INSERT_POST: { xl_btree_insert *xlrec = (xl_btree_insert *) rec; @@ -38,15 +39,27 @@ btree_desc(StringInfo buf, XLogReaderState *record) { xl_btree_split *xlrec = (xl_btree_split *) rec; - appendStringInfo(buf, "level %u, firstright %d, newitemoff %d", - xlrec->level, xlrec->firstright, xlrec->newitemoff); + appendStringInfo(buf, "level %u, firstright %d, newitemoff %d, postingoff %d", + xlrec->level, + xlrec->firstright, + xlrec->newitemoff, + xlrec->postingoff); + break; + } + case XLOG_BTREE_DEDUP_PAGE: + { + xl_btree_dedup *xlrec = (xl_btree_dedup *) rec; + + appendStringInfo(buf, "baseoff %u; nitems %u", + xlrec->baseoff, xlrec->nitems); break; } case XLOG_BTREE_VACUUM: { xl_btree_vacuum *xlrec = (xl_btree_vacuum *) rec; - appendStringInfo(buf, "ndeleted %u", xlrec->ndeleted); + appendStringInfo(buf, "ndeleted %u; nupdated %u", + xlrec->ndeleted, xlrec->nupdated); break; } case XLOG_BTREE_DELETE: @@ -130,6 +143,12 @@ btree_identify(uint8 info) case XLOG_BTREE_SPLIT_R: id = "SPLIT_R"; break; + case XLOG_BTREE_DEDUP_PAGE: + id = "DEDUPLICATE"; + break; + case XLOG_BTREE_INSERT_POST: + id = "INSERT_POST"; + break; case XLOG_BTREE_VACUUM: id = "VACUUM"; break; diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index ba4edde71a..6b5d36de57 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -28,6 +28,7 @@ #include "access/commit_ts.h" #include "access/gin.h" +#include "access/nbtree.h" #include "access/rmgr.h" #include "access/tableam.h" #include "access/transam.h" @@ -363,6 +364,23 @@ static const struct config_enum_entry backslash_quote_options[] = { {NULL, 0, false} }; +/* + * Although only "on", "off", and "nonunique" are documented, we accept all + * the likely variants of "on" and "off". + */ +static const struct config_enum_entry btree_deduplication_options[] = { + {"off", DEDUP_OFF, false}, + {"on", DEDUP_ON, false}, + {"nonunique", DEDUP_NONUNIQUE, false}, + {"false", DEDUP_OFF, true}, + {"true", DEDUP_ON, true}, + {"no", DEDUP_OFF, true}, + {"yes", DEDUP_ON, true}, + {"0", DEDUP_OFF, true}, + {"1", DEDUP_ON, true}, + {NULL, 0, false} +}; + /* * Although only "on", "off", and "partition" are documented, we * accept all the likely variants of "on" and "off". 
@@ -4271,6 +4289,16 @@ static struct config_enum ConfigureNamesEnum[] = NULL, NULL, NULL }, + { + {"btree_deduplication", PGC_USERSET, CLIENT_CONN_STATEMENT, + gettext_noop("Enables B-tree index deduplication optimization."), + NULL + }, + &btree_deduplication, + DEDUP_NONUNIQUE, btree_deduplication_options, + NULL, NULL, NULL + }, + { {"bytea_output", PGC_USERSET, CLIENT_CONN_STATEMENT, gettext_noop("Sets the output format for bytea."), diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 46a06ffacd..0b8aa56b3a 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -650,6 +650,7 @@ #vacuum_cleanup_index_scale_factor = 0.1 # fraction of total number of tuples # before index cleanup, 0 always performs # index cleanup +#btree_deduplication = 'nonunique' # off, on, or nonunique #bytea_output = 'hex' # hex, escape #xmlbinary = 'base64' #xmloption = 'content' diff --git a/src/bin/psql/tab-complete.c b/src/bin/psql/tab-complete.c index df26826993..7e55c0ff90 100644 --- a/src/bin/psql/tab-complete.c +++ b/src/bin/psql/tab-complete.c @@ -1677,14 +1677,14 @@ psql_completion(const char *text, int start, int end) /* ALTER INDEX SET|RESET ( */ else if (Matches("ALTER", "INDEX", MatchAny, "RESET", "(")) COMPLETE_WITH("fillfactor", - "vacuum_cleanup_index_scale_factor", /* BTREE */ + "vacuum_cleanup_index_scale_factor", "deduplication", /* BTREE */ "fastupdate", "gin_pending_list_limit", /* GIN */ "buffering", /* GiST */ "pages_per_range", "autosummarize" /* BRIN */ ); else if (Matches("ALTER", "INDEX", MatchAny, "SET", "(")) COMPLETE_WITH("fillfactor =", - "vacuum_cleanup_index_scale_factor =", /* BTREE */ + "vacuum_cleanup_index_scale_factor =", "deduplication =", /* BTREE */ "fastupdate =", "gin_pending_list_limit =", /* GIN */ "buffering =", /* GiST */ "pages_per_range =", "autosummarize =" /* BRIN */ diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c index 3542545de5..8b1223a817 100644 --- a/contrib/amcheck/verify_nbtree.c +++ b/contrib/amcheck/verify_nbtree.c @@ -145,6 +145,7 @@ static void bt_tuple_present_callback(Relation index, ItemPointer tid, bool tupleIsAlive, void *checkstate); static IndexTuple bt_normalize_tuple(BtreeCheckState *state, IndexTuple itup); +static inline IndexTuple bt_posting_logical_tuple(IndexTuple itup, int n); static bool bt_rootdescend(BtreeCheckState *state, IndexTuple itup); static inline bool offset_is_negative_infinity(BTPageOpaque opaque, OffsetNumber offset); @@ -419,12 +420,13 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace, /* * Size Bloom filter based on estimated number of tuples in index, * while conservatively assuming that each block must contain at least - * MaxIndexTuplesPerPage / 5 non-pivot tuples. (Non-leaf pages cannot - * contain non-pivot tuples. That's okay because they generally make - * up no more than about 1% of all pages in the index.) + * MaxBTreeIndexTuplesPerPage / 3 "logical" tuples. heapallindexed + * verification fingerprints posting list heap TIDs as plain non-pivot + * tuples, complete with index keys. This allows its heap scan to + * behave as if posting lists do not exist. 
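+	 *
+	 * (Illustrative arithmetic for 8kB pages, assuming the usual 24 byte
+	 * page header and 16 byte B-Tree special area:
+	 * MaxBTreeIndexTuplesPerPage is (8192 - 24 - 16) / 6 = 1358 logical
+	 * tuples, so each block is assumed to contribute at least 1358 / 3 =
+	 * 452 elements below.)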
*/ total_pages = RelationGetNumberOfBlocks(rel); - total_elems = Max(total_pages * (MaxIndexTuplesPerPage / 5), + total_elems = Max(total_pages * (MaxBTreeIndexTuplesPerPage / 3), (int64) state->rel->rd_rel->reltuples); /* Random seed relies on backend srandom() call to avoid repetition */ seed = random(); @@ -924,6 +926,7 @@ bt_target_page_check(BtreeCheckState *state) size_t tupsize; BTScanInsert skey; bool lowersizelimit; + ItemPointer scantid; CHECK_FOR_INTERRUPTS(); @@ -994,29 +997,72 @@ bt_target_page_check(BtreeCheckState *state) /* * Readonly callers may optionally verify that non-pivot tuples can - * each be found by an independent search that starts from the root + * each be found by an independent search that starts from the root. + * Note that we deliberately don't do individual searches for each + * "logical" posting list tuple, since the posting list itself is + * validated by other checks. */ if (state->rootdescend && P_ISLEAF(topaque) && !bt_rootdescend(state, itup)) { + ItemPointer tid = BTreeTupleGetHeapTID(itup); char *itid, *htid; itid = psprintf("(%u,%u)", state->targetblock, offset); - htid = psprintf("(%u,%u)", - ItemPointerGetBlockNumber(&(itup->t_tid)), - ItemPointerGetOffsetNumber(&(itup->t_tid))); + htid = psprintf("(%u,%u)", ItemPointerGetBlockNumber(tid), + ItemPointerGetOffsetNumber(tid)); ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("could not find tuple using search from root page in index \"%s\"", RelationGetRelationName(state->rel)), - errdetail_internal("Index tid=%s points to heap tid=%s page lsn=%X/%X.", + errdetail_internal("Index tid=%s min heap tid=%s page lsn=%X/%X.", itid, htid, (uint32) (state->targetlsn >> 32), (uint32) state->targetlsn))); } + /* + * If tuple is actually a posting list, make sure posting list TIDs + * are in order. 
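+		 *
+		 * (Strictly ascending order also rules out duplicate TIDs within a
+		 * posting list: the check below rejects equal neighbors, not just
+		 * out-of-order ones.)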
+ */ + if (BTreeTupleIsPosting(itup)) + { + ItemPointerData last; + ItemPointer current; + + ItemPointerCopy(BTreeTupleGetHeapTID(itup), &last); + + for (int i = 1; i < BTreeTupleGetNPosting(itup); i++) + { + + current = BTreeTupleGetPostingN(itup, i); + + if (ItemPointerCompare(current, &last) <= 0) + { + char *itid, + *htid; + + itid = psprintf("(%u,%u)", state->targetblock, offset); + htid = psprintf("(%u,%u)", + ItemPointerGetBlockNumberNoCheck(current), + ItemPointerGetOffsetNumberNoCheck(current)); + + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("posting list heap TIDs out of order in index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Index tid=%s min heap tid=%s page lsn=%X/%X.", + itid, htid, + (uint32) (state->targetlsn >> 32), + (uint32) state->targetlsn))); + } + + ItemPointerCopy(current, &last); + } + } + /* Build insertion scankey for current page offset */ skey = bt_mkscankey_pivotsearch(state->rel, itup); @@ -1074,12 +1120,32 @@ bt_target_page_check(BtreeCheckState *state) { IndexTuple norm; - norm = bt_normalize_tuple(state, itup); - bloom_add_element(state->filter, (unsigned char *) norm, - IndexTupleSize(norm)); - /* Be tidy */ - if (norm != itup) - pfree(norm); + if (BTreeTupleIsPosting(itup)) + { + /* Fingerprint all elements as distinct "logical" tuples */ + for (int i = 0; i < BTreeTupleGetNPosting(itup); i++) + { + IndexTuple logtuple; + + logtuple = bt_posting_logical_tuple(itup, i); + norm = bt_normalize_tuple(state, logtuple); + bloom_add_element(state->filter, (unsigned char *) norm, + IndexTupleSize(norm)); + /* Be tidy */ + if (norm != logtuple) + pfree(norm); + pfree(logtuple); + } + } + else + { + norm = bt_normalize_tuple(state, itup); + bloom_add_element(state->filter, (unsigned char *) norm, + IndexTupleSize(norm)); + /* Be tidy */ + if (norm != itup) + pfree(norm); + } } /* @@ -1087,7 +1153,8 @@ bt_target_page_check(BtreeCheckState *state) * * If there is a high key (if this is not the rightmost page on its * entire level), check that high key actually is upper bound on all - * page items. + * page items. If this is a posting list tuple, we'll need to set + * scantid to be highest TID in posting list. * * We prefer to check all items against high key rather than checking * just the last and trusting that the operator class obeys the @@ -1127,6 +1194,9 @@ bt_target_page_check(BtreeCheckState *state) * tuple. (See also: "Notes About Data Representation" in the nbtree * README.) */ + scantid = skey->scantid; + if (state->heapkeyspace && !BTreeTupleIsPivot(itup)) + skey->scantid = BTreeTupleGetMaxHeapTID(itup); if (!P_RIGHTMOST(topaque) && !(P_ISLEAF(topaque) ? 
invariant_leq_offset(state, skey, P_HIKEY) : invariant_l_offset(state, skey, P_HIKEY))) @@ -1150,6 +1220,7 @@ bt_target_page_check(BtreeCheckState *state) (uint32) (state->targetlsn >> 32), (uint32) state->targetlsn))); } + skey->scantid = scantid; /* * * Item order check * @@ -1160,15 +1231,17 @@ bt_target_page_check(BtreeCheckState *state) if (OffsetNumberNext(offset) <= max && !invariant_l_offset(state, skey, OffsetNumberNext(offset))) { + ItemPointer tid; char *itid, *htid, *nitid, *nhtid; itid = psprintf("(%u,%u)", state->targetblock, offset); + tid = BTreeTupleGetHeapTID(itup); htid = psprintf("(%u,%u)", - ItemPointerGetBlockNumberNoCheck(&(itup->t_tid)), - ItemPointerGetOffsetNumberNoCheck(&(itup->t_tid))); + ItemPointerGetBlockNumberNoCheck(tid), + ItemPointerGetOffsetNumberNoCheck(tid)); nitid = psprintf("(%u,%u)", state->targetblock, OffsetNumberNext(offset)); @@ -1177,9 +1250,11 @@ bt_target_page_check(BtreeCheckState *state) state->target, OffsetNumberNext(offset)); itup = (IndexTuple) PageGetItem(state->target, itemid); + + tid = BTreeTupleGetHeapTID(itup); nhtid = psprintf("(%u,%u)", - ItemPointerGetBlockNumberNoCheck(&(itup->t_tid)), - ItemPointerGetOffsetNumberNoCheck(&(itup->t_tid))); + ItemPointerGetBlockNumberNoCheck(tid), + ItemPointerGetOffsetNumberNoCheck(tid)); ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), @@ -1189,10 +1264,10 @@ bt_target_page_check(BtreeCheckState *state) "higher index tid=%s (points to %s tid=%s) " "page lsn=%X/%X.", itid, - P_ISLEAF(topaque) ? "heap" : "index", + P_ISLEAF(topaque) ? "min heap" : "index", htid, nitid, - P_ISLEAF(topaque) ? "heap" : "index", + P_ISLEAF(topaque) ? "min heap" : "index", nhtid, (uint32) (state->targetlsn >> 32), (uint32) state->targetlsn))); @@ -1953,10 +2028,10 @@ bt_tuple_present_callback(Relation index, ItemPointer tid, Datum *values, * verification. In particular, it won't try to normalize opclass-equal * datums with potentially distinct representations (e.g., btree/numeric_ops * index datums will not get their display scale normalized-away here). - * Normalization may need to be expanded to handle more cases in the future, - * though. For example, it's possible that non-pivot tuples could in the - * future have alternative logically equivalent representations due to using - * the INDEX_ALT_TID_MASK bit to implement intelligent deduplication. + * Caller does normalization for non-pivot tuples that have a posting list, + * since dummy CREATE INDEX callback code generates new tuples with the same + * normalized representation. Deduplication is performed opportunistically, + * and in general there is no guarantee about how or when it will be applied. */ static IndexTuple bt_normalize_tuple(BtreeCheckState *state, IndexTuple itup) @@ -1969,6 +2044,9 @@ bt_normalize_tuple(BtreeCheckState *state, IndexTuple itup) IndexTuple reformed; int i; + /* Caller should only pass "logical" non-pivot tuples here */ + Assert(!BTreeTupleIsPosting(itup) && !BTreeTupleIsPivot(itup)); + /* Easy case: It's immediately clear that tuple has no varlena datums */ if (!IndexTupleHasVarwidths(itup)) return itup; @@ -2031,6 +2109,30 @@ bt_normalize_tuple(BtreeCheckState *state, IndexTuple itup) return reformed; } +/* + * Produce palloc()'d "logical" tuple for nth posting list entry. + * + * In general, deduplication is not supposed to change the logical contents of + * an index. Multiple logical index tuples are folded together into one + * physical posting list index tuple when convenient. 
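+ *
+ * (For example, a posting list tuple with key value 'foo' and heap TIDs
+ * t1, t2 and t3 is logically equivalent to the three plain tuples
+ * ('foo', t1), ('foo', t2) and ('foo', t3); this helper reconstructs the
+ * nth such plain tuple.)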
+ * + * heapallindexed verification must normalize-away this variation in + * representation by converting posting list tuples into two or more "logical" + * tuples. Each logical tuple must be fingerprinted separately -- there must + * be one logical tuple for each corresponding Bloom filter probe during the + * heap scan. + * + * Note: Caller needs to call bt_normalize_tuple() with returned tuple. + */ +static inline IndexTuple +bt_posting_logical_tuple(IndexTuple itup, int n) +{ + Assert(BTreeTupleIsPosting(itup)); + + /* Returns non-posting-list tuple */ + return _bt_form_posting(itup, BTreeTupleGetPostingN(itup, n), 1); +} + /* * Search for itup in index, starting from fast root page. itup must be a * non-pivot tuple. This is only supported with heapkeyspace indexes, since @@ -2087,6 +2189,7 @@ bt_rootdescend(BtreeCheckState *state, IndexTuple itup) insertstate.itup = itup; insertstate.itemsz = MAXALIGN(IndexTupleSize(itup)); insertstate.itup_key = key; + insertstate.postingoff = 0; insertstate.bounds_valid = false; insertstate.buf = lbuf; @@ -2094,7 +2197,9 @@ bt_rootdescend(BtreeCheckState *state, IndexTuple itup) offnum = _bt_binsrch_insert(state->rel, &insertstate); /* Compare first >= matching item on leaf page, if any */ page = BufferGetPage(lbuf); + /* Should match on first heap TID when tuple has a posting list */ if (offnum <= PageGetMaxOffsetNumber(page) && + insertstate.postingoff <= 0 && _bt_compare(state->rel, key, page, offnum) == 0) exists = true; _bt_relbuf(state->rel, lbuf); @@ -2548,26 +2653,25 @@ PageGetItemIdCareful(BtreeCheckState *state, BlockNumber block, Page page, } /* - * BTreeTupleGetHeapTID() wrapper that lets caller enforce that a heap TID must - * be present in cases where that is mandatory. - * - * This doesn't add much as of BTREE_VERSION 4, since the INDEX_ALT_TID_MASK - * bit is effectively a proxy for whether or not the tuple is a pivot tuple. - * It may become more useful in the future, when non-pivot tuples support their - * own alternative INDEX_ALT_TID_MASK representation. + * BTreeTupleGetHeapTID() wrapper that enforces that a heap TID is present in + * cases where that is mandatory (i.e. for non-pivot tuples). */ static inline ItemPointer BTreeTupleGetHeapTIDCareful(BtreeCheckState *state, IndexTuple itup, bool nonpivot) { - ItemPointer result = BTreeTupleGetHeapTID(itup); - BlockNumber targetblock = state->targetblock; + Assert(state->heapkeyspace); - if (result == NULL && nonpivot) + /* + * Make sure that tuple type (pivot vs non-pivot) matches caller's + * expectation + */ + if (BTreeTupleIsPivot(itup) == nonpivot) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("block %u or its right sibling block or child block in index \"%s\" contains non-pivot tuple that lacks a heap TID", - targetblock, RelationGetRelationName(state->rel)))); + state->targetblock, + RelationGetRelationName(state->rel)))); - return result; + return BTreeTupleGetHeapTID(itup); } diff --git a/doc/src/sgml/btree.sgml b/doc/src/sgml/btree.sgml index 5881ea5dd6..13d9b2ff96 100644 --- a/doc/src/sgml/btree.sgml +++ b/doc/src/sgml/btree.sgml @@ -433,11 +433,130 @@ returns bool Implementation + + Internally, a B-tree index consists of a tree structure with leaf + pages. Each leaf page contains tuples that point to table entries + using a heap item pointer. Each tuple's key is considered unique + internally, since the item pointer is treated as part of the key. + + + An introduction to the btree index implementation can be found in + src/backend/access/nbtree/README. 
+
+
+
+  Deduplication
-   An introduction to the btree index implementation can be found in
-   src/backend/access/nbtree/README.
+   B-Tree supports deduplication.  Existing
+   leaf page tuples with fully equal keys (equal prior to the heap
+   item pointer) are merged together into a single posting
+   list tuple.  The keys appear only once in this
+   representation.  A simple array of heap item pointers follows.
+   Posting lists are formed lazily, when a new item is
+   inserted that cannot fit on an existing leaf page.  The immediate
+   goal of the deduplication process is to at least free enough space
+   to fit the new item; otherwise a leaf page split occurs, which
+   allocates a new leaf page.  The key space
+   covered by the original leaf page is shared between the original
+   page and its new right sibling page.
+
+   Deduplication can greatly increase index space efficiency with data
+   sets where each distinct key appears at least a few times on
+   average.  It can also reduce the cost of subsequent index scans,
+   especially when many leaf pages must be accessed.  For example, an
+   index on a simple integer column that uses
+   deduplication will have a storage size that is only about 65% of an
+   equivalent unoptimized index when each distinct
+   integer value appears three times.  If each distinct
+   integer value appears six times, the storage overhead
+   can be as low as 50% of baseline.  With hundreds of duplicates per
+   distinct value (or with larger base key values) a
+   storage size of about one third of the
+   unoptimized case is expected.  There is often a direct benefit for
+   queries, as well as an indirect benefit due to reduced I/O during
+   routine vacuuming.
+
+   Cases that don't benefit due to having no duplicate values will
+   incur a small performance penalty with mixed read-write workloads.
+   There is no performance penalty with read-only workloads, since
+   reading from posting lists is at least as efficient as reading the
+   standard index tuple representation.
+
+
+
+  Configuring Deduplication
+
+   The btree_deduplication configuration
+   parameter controls deduplication.  By default, deduplication is
+   only used with non-unique indexes.  The
+   deduplication storage parameter can be used to
+   override the configuration parameter for individual indexes.  See
+   the description of the deduplication storage
+   parameter in the CREATE INDEX documentation for details.
+
+
+
+  Unique Indexes and Deduplication
+
+   Unique indexes can also use deduplication, despite the fact that
+   unique indexes do not logically contain
+   duplicates; implementation-level physical
+   duplicates may still be present.  Unique indexes that are prone to
+   becoming bloated due to a short-term burst in updates are good
+   candidates.  VACUUM will eventually remove dead
+   versions of tuples from unique indexes, but it may not be possible
+   for it to do so before some number of unnecessary
+   page splits have taken place.  Deduplication can prevent these page
+   splits from happening.  Note that page splits can only be reversed
+   by VACUUM when the page is
+   completely empty, which isn't expected in this
+   scenario.
+
+   In other cases, deduplication can be effective with unique indexes
+   just because of the presence of many NULL values
+   in the unique index.  The influence of must also be
+   considered.
+
+   For more information about automatic and manual vacuuming, see
+   the documentation on routine vacuuming.  Note that the heap-only tuple
+   (HOT) optimization can also prevent page splits
+   caused only by versioned tuples rather than by insertions of new
+   values.
+
+
+
+  Restrictions
+
+
+   Deduplication can only be used with B-Tree operator classes that
+   were declared BITWISE.  In practice almost all
+   data types support deduplication, though numeric
+   is a notable exception (its display scale feature makes it
+   impossible to enable deduplication without losing useful information
+   about equal numeric datums).  Deduplication is
+   not supported with nondeterministic collations, nor is it supported
+   with INCLUDE indexes.
+
+
+   Note that a multicolumn index is only considered to have duplicates
+   when there are index entries that repeat entire
+   combinations of values (the values stored in
+   each and every column must be equal).
+
+
+
diff --git a/doc/src/sgml/charset.sgml b/doc/src/sgml/charset.sgml
index 55669b5cad..9f371d3e3a 100644
--- a/doc/src/sgml/charset.sgml
+++ b/doc/src/sgml/charset.sgml
@@ -928,10 +928,11 @@ CREATE COLLATION ignore_accents (provider = icu, locale = 'und-u-ks-level1-kc-tr
     nondeterministic collations give a more correct behavior,
     especially when considering the full power of Unicode and its many
     special cases, they also have some drawbacks.  Foremost, their use leads
-    to a performance penalty.  Also, certain operations are not possible with
-    nondeterministic collations, such as pattern matching operations.
-    Therefore, they should be used only in cases where they are specifically
-    wanted.
+    to a performance penalty.  Note, in particular, that B-tree cannot use
+    deduplication with indexes that use a nondeterministic collation.  Also,
+    certain operations are not possible with nondeterministic collations,
+    such as pattern matching operations.  Therefore, they should be used
+    only in cases where they are specifically wanted.
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index d4d1fe45cc..6f89e4a51f 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -8000,6 +8000,39 @@ COPY postgres_log FROM '/full/path/to/logfile.csv' WITH csv;
+
+     btree_deduplication (enum)
+
+      btree_deduplication
+      configuration parameter
+
+
+      Controls the use of deduplication within B-Tree indexes.
+      Deduplication is an optimization that reduces the storage size
+      of indexes by storing equal index keys only once.  See the
+      Deduplication section of the B-Tree documentation for more
+      information.
+
+
+      Besides off, which disables deduplication
+      entirely, there are two modes: on and
+      nonunique.  When
+      btree_deduplication is set to
+      nonunique, the default, deduplication is
+      only used for non-unique B-Tree indexes.
+
+
+      This setting can be overridden for individual B-Tree indexes
+      by changing index storage parameters.  See the storage
+      parameters section of the CREATE INDEX
+      documentation for details.
+
+
 bytea_output (enum)
diff --git a/doc/src/sgml/maintenance.sgml b/doc/src/sgml/maintenance.sgml
index ec8bdcd7a4..695aa9123d 100644
--- a/doc/src/sgml/maintenance.sgml
+++ b/doc/src/sgml/maintenance.sgml
@@ -887,6 +887,14 @@ analyze threshold = analyze base threshold + analyze scale factor * number of tu
    might be worthwhile to reindex periodically just to improve access speed.

+
+
+   Enabling B-tree deduplication in unique indexes can be an effective
+   way to control index bloat in extreme cases.  See the B-Tree
+   documentation for details.
+
+
+
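To make the three modes of the new GUC concrete, here is how they might be toggled in a session. The parameter name and its accepted values come directly from this patch; that the setting is user-settable per session is an assumption (its presence in postgresql.conf.sample suggests so), and it governs future insertions only, leaving existing posting lists untouched:

    SHOW btree_deduplication;               -- 'nonunique' is the shipped default
    SET btree_deduplication = 'off';        -- disable deduplication entirely
    SET btree_deduplication = 'on';         -- also allow it for unique indexes
    SET btree_deduplication = 'nonunique';  -- restore the default behavior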
    REINDEX can be used safely and easily in all cases.
    This command requires an ACCESS EXCLUSIVE lock by
diff --git a/doc/src/sgml/ref/create_index.sgml b/doc/src/sgml/ref/create_index.sgml
index 629a31ef79..abc7db4820 100644
--- a/doc/src/sgml/ref/create_index.sgml
+++ b/doc/src/sgml/ref/create_index.sgml
@@ -166,6 +166,8 @@ CREATE [ UNIQUE ] INDEX [ CONCURRENTLY ] [ [ IF NOT EXISTS ]
@@ -388,10 +390,39 @@ CREATE [ UNIQUE ] INDEX [ CONCURRENTLY ] [ [ IF NOT EXISTS ]

-    B-tree indexes additionally accept this parameter:
+    B-tree indexes also accept these parameters:

+
+   deduplication
+
+    deduplication
+    storage parameter
+
+
+    Per-index value for the btree_deduplication
+    configuration parameter.  Controls usage of the B-Tree
+    deduplication technique described in the Deduplication section of
+    the B-Tree documentation.  Set to ON or
+    OFF to override the GUC.  (Alternative
+    spellings of ON and OFF
+    are allowed, as for any Boolean storage parameter.)
+
+
+    Turning deduplication off via ALTER
+    INDEX prevents future insertions from triggering
+    deduplication, but does not in itself make existing posting list
+    tuples use the standard tuple representation.
+
+
 vacuum_cleanup_index_scale_factor
@@ -446,9 +477,7 @@ CREATE [ UNIQUE ] INDEX [ CONCURRENTLY ] [ [ IF NOT EXISTS ]
     This setting controls usage of the fast update technique described
     in the GIN documentation.  It is a Boolean parameter:
     ON enables fast update, OFF disables it.
-    (Alternative spellings of ON and OFF are
-    allowed as described in the configuration documentation.)  The
-    default is ON.
+    The default is ON.
@@ -831,6 +860,13 @@ CREATE UNIQUE INDEX title_idx ON films (title) WITH (fillfactor = 70);

+
+   To create a unique index with deduplication enabled:
+
+CREATE UNIQUE INDEX title_idx ON films (title) WITH (deduplication = on);
+
+
+
    To create a GIN index with fast updates disabled:
diff --git a/doc/src/sgml/ref/reindex.sgml b/doc/src/sgml/ref/reindex.sgml
index 10881ab03a..c9a5349019 100644
--- a/doc/src/sgml/ref/reindex.sgml
+++ b/doc/src/sgml/ref/reindex.sgml
@@ -58,8 +58,9 @@ REINDEX [ ( VERBOSE ) ] { INDEX | TABLE | SCHEMA | DATABASE | SYSTEM } [ CONCURR

-      You have altered a storage parameter (such as fillfactor)
-      for an index, and wish to ensure that the change has taken full effect.
+      You have altered a storage parameter (such as fillfactor or
+      deduplication) for an index, and wish to ensure that the change has
+      taken full effect.

diff --git a/src/test/regress/expected/btree_index.out b/src/test/regress/expected/btree_index.out
index f567117a46..53bcd1f30a 100644
--- a/src/test/regress/expected/btree_index.out
+++ b/src/test/regress/expected/btree_index.out
@@ -266,6 +266,22 @@ select * from btree_bpchar where f1::bpchar like 'foo%';
  fool
 (2 rows)

+--
+-- Test deduplication within a unique index
+--
+CREATE TABLE dedup_unique_test_table (a int) WITH (autovacuum_enabled=false);
+CREATE UNIQUE INDEX dedup_unique ON dedup_unique_test_table (a) WITH (deduplication=on);
+CREATE UNIQUE INDEX plain_unique ON dedup_unique_test_table (a) WITH (deduplication=off);
+-- Generate enough garbage tuples in the index to ensure that even the unique
+-- index with deduplication enabled has to check multiple leaf pages during
+-- unique checking (at least with a BLCKSZ of 8192 or less)
+DO $$
+BEGIN
+	FOR r IN 1..1350 LOOP
+		DELETE FROM dedup_unique_test_table;
+		INSERT INTO dedup_unique_test_table SELECT 1;
+	END LOOP;
+END$$;
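Tying the CREATE INDEX and REINDEX hunks above together: altering the storage parameter only governs future insertions, so a rebuild is needed for the change to take full effect. A sketch reusing the documentation's films example (title_idx as above):

    CREATE UNIQUE INDEX title_idx ON films (title) WITH (deduplication = on);

    -- Stop deduplicating new insertions; existing posting lists are untouched
    ALTER INDEX title_idx SET (deduplication = off);

    -- Rebuild so that every tuple uses the standard representation again
    REINDEX INDEX title_idx;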
 --
 -- Test B-tree fast path (cache rightmost leaf page) optimization.
 --
diff --git a/src/test/regress/sql/btree_index.sql b/src/test/regress/sql/btree_index.sql
index 558dcae0ec..f008a5a55f 100644
--- a/src/test/regress/sql/btree_index.sql
+++ b/src/test/regress/sql/btree_index.sql
@@ -103,6 +103,23 @@ explain (costs off) select * from btree_bpchar where f1::bpchar like 'foo%';
 select * from btree_bpchar where f1::bpchar like 'foo%';

+--
+-- Test deduplication within a unique index
+--
+CREATE TABLE dedup_unique_test_table (a int) WITH (autovacuum_enabled=false);
+CREATE UNIQUE INDEX dedup_unique ON dedup_unique_test_table (a) WITH (deduplication=on);
+CREATE UNIQUE INDEX plain_unique ON dedup_unique_test_table (a) WITH (deduplication=off);
+-- Generate enough garbage tuples in the index to ensure that even the unique
+-- index with deduplication enabled has to check multiple leaf pages during
+-- unique checking (at least with a BLCKSZ of 8192 or less)
+DO $$
+BEGIN
+	FOR r IN 1..1350 LOOP
+		DELETE FROM dedup_unique_test_table;
+		INSERT INTO dedup_unique_test_table SELECT 1;
+	END LOOP;
+END$$;
+
 --
 -- Test B-tree fast path (cache rightmost leaf page) optimization.
 --
-- 
2.17.1
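A natural follow-up to the regression test above is to compare the two unique indexes after the churn loop, to see how much bloat deduplication prevented. A sketch; the index names come from the test itself, and exact sizes vary with BLCKSZ and vacuum timing:

    SELECT c.relname, pg_size_pretty(pg_relation_size(c.oid)) AS size
    FROM pg_class c
    WHERE c.relname IN ('dedup_unique', 'plain_unique')
    ORDER BY c.relname;

With deduplication enabled, dedup_unique should stay markedly smaller than plain_unique, since the repeated dead versions of the single key are absorbed into posting lists instead of forcing page splits.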