From ba9e0be13c2573b63639452ababcd0116d2674be Mon Sep 17 00:00:00 2001
From: Peter Geoghegan
Date: Mon, 9 Nov 2020 12:59:29 -0800
Subject: [PATCH v10 1/3] Teach heapam to support bottom-up index deletion.

Teach heapam about bottom-up index deletion.  This mechanism allows an
index AM to cooperate with a tableam in deleting index tuples at
regular intervals.  The general idea is to avoid accumulating too many
versions in the index for any given logical row, without doing any
extra work before the situation gets out of hand at a localized page
in the index.

This commit isn't useful on its own.  An upcoming commit will add
support to nbtree.

Author: Peter Geoghegan
Reviewed-By: Victor Yegorov
Discussion: https://postgr.es/m/CAH2-Wzm+maE3apHB8NOtmM=p-DO65j2V5GzAWCOEEuy3JZgb2g@mail.gmail.com
---
 src/include/access/heapam.h               |   2 +
 src/include/access/tableam.h              | 113 ++++
 src/backend/access/heap/heapam.c          | 660 +++++++++++++++++++++++
 src/backend/access/heap/heapam_handler.c  |   1 +
 src/backend/access/table/tableam.c        |   6 +-
 5 files changed, 779 insertions(+), 3 deletions(-)

diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h
index 54b2eb7378..289d0ca5c6 100644
--- a/src/include/access/heapam.h
+++ b/src/include/access/heapam.h
@@ -169,6 +169,8 @@ extern void simple_heap_update(Relation relation, ItemPointer otid,
 extern TransactionId heap_compute_xid_horizon_for_tuples(Relation rel,
 														 ItemPointerData *items,
 														 int nitems);
+extern TransactionId heapam_index_delete_check(Relation rel,
+											   TM_IndexDeleteOp *delstate);
 
 /* in heap/pruneheap.c */
 struct GlobalVisState;
diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h
index 387eb34a61..998f8b2996 100644
--- a/src/include/access/tableam.h
+++ b/src/include/access/tableam.h
@@ -128,6 +128,99 @@ typedef struct TM_FailureData
 	bool		traversed;
 } TM_FailureData;
 
+/*
+ * State used when calling table_index_delete_check() to perform "bottom up"
+ * deletion of duplicate index tuples.  State is initialized by the index AM
+ * caller, and finalized by the tableam, which modifies state.
+ */
+typedef struct TM_IndexDelete
+{
+	ItemPointerData tid;		/* table TID from index tuple */
+	int16		id;				/* Offset into TM_IndexStatus array */
+} TM_IndexDelete;
+
+typedef struct TM_IndexStatus
+{
+	OffsetNumber idxoffnum;		/* Index AM page offset number */
+	int16		tupsize;		/* Space freed in index if tuple deleted */
+	bool		ispromising;	/* Duplicate in index? */
+	bool		deleteitup;		/* Known dead-to-all? */
+} TM_IndexStatus;
+
+/*
+ * State representing one single bottom-up index deletion operation.
+ *
+ * Index AM caller provides a TM_IndexDeleteOp, which points to two palloc()'d
+ * arrays.  Each array has one entry per TID that the tableam is asked to
+ * consider (typically these are all of the TIDs from a single index page, so
+ * there could be hundreds or even thousands of entries in the arrays).
+ * ndeltids tracks the current number of entries.  It is set by the index AM
+ * initially, and generally modified by the tableam (which conceptually
+ * shrinks the array for most calls).
+ *
+ * The two arrays are conceptually one single array.  Two arrays/structs are
+ * used for performance reasons.  (We really need to keep the TM_IndexDelete
+ * struct small so that the tableam can do an initial sort by TID as quickly
+ * as possible.)
+ *
+ * The index AM should keep track of which index tuple relates to which entry
+ * by setting idxoffnum (and/or by relying on each entry being uniquely
+ * identifiable using tid). 
The index AM asks for a target amount of
+ * free space via "targetfreespace".  The index AM also indicates the space
+ * saving from deleting each index tuple by filling in tupsize for each array
+ * element.  The tableam must balance the requirements of the index AM
+ * against the costs paid in the tableam.
+ *
+ * Callers that simply want the tableam to visit every table block that the
+ * TIDs point to should set alltids = true.  Such callers can initialize any
+ * field concerned with managing the cost of visiting table blocks to 0,
+ * since those fields won't be used at all.
+ *
+ * The index AM gives the tableam strong hints about where to look by marking
+ * some entries as "promising".  The index AM does this with duplicate index
+ * tuples that are strongly suspected to be old versions left behind by
+ * UPDATEs that did not logically change any indexed values.  The index AM
+ * may find it helpful to only mark TIDs/entries as promising when they're
+ * thought to have been affected by such an UPDATE in the recent past.  None
+ * of this matters to alltids callers, though, since the tableam is obligated
+ * to visit every table block anyway.
+ *
+ * The tableam marks individual entries as deletable for the index AM.  It's
+ * common for the final array to be shrunk in size.  The index AM caller
+ * should do nothing if delstate.ndeltids is set to zero on return.  The
+ * index AM caller only needs to consider the first ndeltids entries of the
+ * final array, which is typically much smaller than its original size (the
+ * tableam updates ndeltids in state).  One reason for this is that the
+ * tableam can naturally only afford to visit a few table blocks on each call
+ * -- it typically won't even try to check most of the entries it is passed.
+ *
+ * Note that the index AM caller can mark entries that are known dead-to-all
+ * as deletable up-front, saving the tableam a little work.  This is only
+ * allowed for callers that specify alltids, though.  The tableam will check
+ * entries that are initially marked !deleteitup on a best-effort basis (it
+ * should be able to process every entry for an alltids caller in practice,
+ * but that isn't actually a requirement).
+ *
+ * In general, tableams have significant discretion about how much
+ * non-essential work they perform during each !alltids call.  However, the
+ * tableam is obligated to do whatever it must to make it safe for the index
+ * AM to perform deletion of the entries that the caller marked deleteitup
+ * up-front.  The tableam cannot opt out of facilitating deletion of index
+ * tuples that are already marked LP_DEAD in the index (again, this is only
+ * relevant to alltids calls).
+ *
+ * The tableam typically sorts the delstate.deltids array by TID.  The index
+ * AM should be prepared to restore the array to its useful/original order.
+ * The array is typically far smaller than its original size by then, so that
+ * step should be relatively fast.
+ */
+typedef struct TM_IndexDeleteOp
+{
+	bool		alltids;		/* tableam must visit all blocks? */
+	int			ndeltids;		/* Number of deltids/status for op */
+	TM_IndexDelete *deltids;
+	TM_IndexStatus *status;
+	int			targetfreespace;	/* Guides tableam on requirements */
+} TM_IndexDeleteOp;
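Aside, for illustration only (not part of the patch): a minimal sketch of how
a bottom-up deletion caller in some index AM might fill in this state and
call table_index_delete_check().  Everything here is hypothetical -- the
page-scanning details, the "looks_like_old_duplicate" helper, and the
targetfreespace value are made-up stand-ins (nbtree's real caller arrives in
a later patch in this series).  Assumes the usual PostgreSQL headers
(access/itup.h, access/tableam.h, storage/bufpage.h).

    /* Hypothetical sketch only -- not part of the patch */
    static void
    example_bottomup_delete(Relation heapRel, Page ipage)
    {
        OffsetNumber maxoff = PageGetMaxOffsetNumber(ipage);
        TM_IndexDeleteOp delstate;
        TransactionId latestRemovedXid;

        delstate.alltids = false;   /* bottom-up caller: tableam picks blocks */
        delstate.ndeltids = 0;
        delstate.deltids = palloc(MaxIndexTuplesPerPage * sizeof(TM_IndexDelete));
        delstate.status = palloc(MaxIndexTuplesPerPage * sizeof(TM_IndexStatus));
        delstate.targetfreespace = BLCKSZ / 16;     /* made-up space request */

        for (OffsetNumber offnum = FirstOffsetNumber; offnum <= maxoff;
             offnum = OffsetNumberNext(offnum))
        {
            ItemId      itemid = PageGetItemId(ipage, offnum);
            IndexTuple  itup = (IndexTuple) PageGetItem(ipage, itemid);
            TM_IndexDelete *odeltid = &delstate.deltids[delstate.ndeltids];
            TM_IndexStatus *ostatus = &delstate.status[delstate.ndeltids];

            odeltid->tid = itup->t_tid;
            odeltid->id = delstate.ndeltids;
            ostatus->idxoffnum = offnum;
            ostatus->tupsize = (int16) ItemIdGetLength(itemid);
            /* hypothetical heuristic helper, not in the patch: */
            ostatus->ispromising = looks_like_old_duplicate(itup);
            ostatus->deleteitup = false;    /* only alltids callers may set */

            delstate.ndeltids++;
        }

        latestRemovedXid = table_index_delete_check(heapRel, &delstate);
        (void) latestRemovedXid;    /* would go into the deletion WAL record */

        /*
         * On return, the first delstate.ndeltids entries that have
         * deleteitup set may be deleted from the index page.
         */
    }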
 
 /* "options" flag bits for table_tuple_insert */
 /* TABLE_INSERT_SKIP_WAL was 0x0001; RelationNeedsWAL() now governs */
 #define TABLE_INSERT_SKIP_FSM 0x0002
@@ -409,6 +502,10 @@ typedef struct TableAmRoutine
 							 uint8 flags,
 							 TM_FailureData *tmfd);
 
+	/* see table_index_delete_check() for a description of the parameters */
+	TransactionId (*index_delete_check) (Relation rel,
+										 TM_IndexDeleteOp *delstate);
+
 	/*
 	 * Perform operations necessary to complete insertions made via
 	 * tuple_insert and multi_insert with a BulkInsertState specified. In-tree
@@ -1363,6 +1460,22 @@ table_tuple_lock(Relation rel, ItemPointer tid, Snapshot snapshot,
 									   flags, tmfd);
 }
 
+/*
+ * Index tuple deletion check, for use by index AMs.
+ *
+ * Marks entries from caller's TM_IndexDeleteOp state deletable when they are
+ * found to point to dead-to-all tuples in the table rel.  See the
+ * TM_IndexDeleteOp struct for full details.
+ *
+ * Returns a latestRemovedXid transaction ID that the index AM must use to
+ * generate a recovery conflict when required.
+ */
+static inline TransactionId
+table_index_delete_check(Relation rel, TM_IndexDeleteOp *delstate)
+{
+	return rel->rd_tableam->index_delete_check(rel, delstate);
+}
+
 /*
  * Perform operations necessary to complete insertions made via
  * tuple_insert and multi_insert with a BulkInsertState specified.
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 1b2f70499e..78a59fd0b4 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -55,6 +55,7 @@
 #include "miscadmin.h"
 #include "pgstat.h"
 #include "port/atomics.h"
+#include "port/pg_bitutils.h"
 #include "storage/bufmgr.h"
 #include "storage/freespace.h"
 #include "storage/lmgr.h"
@@ -102,6 +103,7 @@ static void MultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 in
 							int *remaining);
 static bool ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status,
 									   uint16 infomask, Relation rel, int *remaining);
+static int	heapam_index_delete_check_sort(Relation rel, TM_IndexDeleteOp *delstate);
 static XLogRecPtr log_heap_new_cid(Relation relation, HeapTuple tup);
 static HeapTuple ExtractReplicaIdentity(Relation rel, HeapTuple tup, bool key_changed,
 										bool *copy);
@@ -178,6 +180,17 @@ typedef struct
 } XidHorizonPrefetchState;
 #endif
 
+/*
+ * heapam_index_delete_check uses this structure to determine which heap pages
+ * to visit, and in what order
+ */
+typedef struct IndexDeleteCounts
+{
+	int16		npromisingtids; /* Number of "promising" TIDs in block group */
+	int16		ntids;			/* Number of TIDs in block group */
+	int16		ideltids;		/* Offset of group's first TID in deltids */
+} IndexDeleteCounts;
+
 /*
  * This table maps tuple lock strength values for each particular
  * MultiXactStatus value.
@@ -192,6 +205,11 @@ static const int MultiXactStatusLock[MaxMultiXactStatus + 1] =
 	LockTupleExclusive			/* Update */
 };
 
+/*
+ * Shellsort gap sequence (taken from Sedgewick-Incerpi paper)
+ */
+static const int ShellsortGaps[8] = {861, 336, 112, 48, 21, 7, 3, 1};
+
 /* Get the LockTupleMode for a given MultiXactStatus */
 #define TUPLOCK_from_mxstatus(status) \
 			(MultiXactStatusLock[(status)])
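Aside, for illustration only (not part of the patch): the gap sequence
defined above drives the two shellsort routines added further down in this
patch.  As a standalone sketch of the same technique on a plain int array
(the patch's versions inline a comparator over TIDs/counts instead):

    #include <stdio.h>

    static const int gaps[8] = {861, 336, 112, 48, 21, 7, 3, 1};

    static void
    shellsort(int *a, int n)
    {
        /* gapped insertion sort, one pass per (descending) gap */
        for (int g = 0; g < 8; g++)
        {
            int hi = gaps[g];

            for (int i = hi; i < n; i++)
            {
                int d = a[i];
                int j = i;

                while (j >= hi && a[j - hi] > d)
                {
                    a[j] = a[j - hi];
                    j -= hi;
                }
                a[j] = d;
            }
        }
    }

    int
    main(void)
    {
        int a[] = {5, 1, 4, 1, 5, 9, 2, 6};
        int n = (int) (sizeof(a) / sizeof(a[0]));

        shellsort(a, n);
        for (int i = 0; i < n; i++)
            printf("%d ", a[i]);        /* prints: 1 1 2 4 5 5 6 9 */
        printf("\n");
        return 0;
    }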
@@ -6987,6 +7005,9 @@ xid_horizon_prefetch_buffer(Relation rel,
  * deleting hundreds of tuples from a single index block. To amortize that
  * cost to some degree, this uses prefetching and combines repeat accesses to
  * the same block.
+ *
+ * Note: The logic for maintaining latestRemovedXid here is duplicated by
+ * code within heapam_index_delete_check().  Make sure that they stay in
+ * sync.
  */
 TransactionId
 heap_compute_xid_horizon_for_tuples(Relation rel,
@@ -7133,6 +7154,645 @@ heap_compute_xid_horizon_for_tuples(Relation rel,
 	return latestRemovedXid;
 }
 
+#define MAX_DELETE_HEAP_BLOCKS 4
+#define FAVORABLE_BLOCK_STRIDE 3
+
+/*
+ * Determine which heap tuples from a list of TIDs provided by caller are
+ * dead.  It is safe to delete index tuples that point to these dead heap
+ * tuples.
+ *
+ * This is used by index AMs that support "bottom up" deletion of duplicate
+ * index tuples in batches of just a few heap pages at a time.  Index AMs
+ * call here through the table_index_delete_check() interface.  See the
+ * tableam interface details (for the TM_IndexDeleteOp struct) for more
+ * information.
+ *
+ * Alternatively, this can be used by callers that want a batch version of
+ * heap_compute_xid_horizon_for_tuples().  These callers (alltids callers)
+ * can expect us to access every heap page exhaustively.  They can still
+ * expect us to mark some extra entries deletable.  We check whether
+ * not-marked-deletable entries are deletable in passing, and mark them
+ * deletable when the check works out, as in the bottom-up deletion case.
+ *
+ * Though the main thing that influences which heap pages are accessed here
+ * (for alltids/bottom-up callers) is the presence of tuples that the index
+ * AM caller has marked "promising" (which relate to duplicate index tuples
+ * believed to have been inserted into the index recently), there are other
+ * considerations.  The approach taken here considers both spatial and
+ * temporal locality inside the heap structure.  This is especially helpful
+ * when there are several heap blocks with approximately the same number of
+ * promising tuples.  Multiple calls here for the same index will tend to
+ * consistently delete the oldest index tuples, which keeps the number of
+ * buffer misses here to a minimum.
+ *
+ * Sometimes larger batch sizes are preferred here, even when that means
+ * that we might actually exceed caller's immediate requirement for free
+ * space in the index.  Contiguous heap blocks are considered "favorable".
+ * The presence of favorable blocks makes the call as a whole access more
+ * blocks to better amortize costs.  We expect to be called multiple times
+ * for related records in at least some cases, and have to consider costs
+ * over time.  The cost of any individual call is less important.
+ *
+ * Returns the latestRemovedXid from the heap tuples pointed to by the index
+ * tuples whose deltids entries are marked safe to delete.
+ *
+ * Note: The logic for maintaining latestRemovedXid here is duplicated by
+ * code within heap_compute_xid_horizon_for_tuples().  Make sure that they
+ * stay in sync. 
+ */ +TransactionId +heapam_index_delete_check(Relation rel, TM_IndexDeleteOp *delstate) +{ + TransactionId latestRemovedXid = InvalidTransactionId; + BlockNumber hblkno = InvalidBlockNumber; + Buffer buf = InvalidBuffer; + Page hpage; + bool finalhpage = false; + int finalndeltids = 0; + int nblocksaccessed = 0; + int nblocksfavorable = 0; + int spacefreed = 0; + int spacefreedbeforecurhpage = 0; + SnapshotData SnapshotNonVacuumable; + TM_IndexDelete *deltids = delstate->deltids; + TM_IndexStatus *status = delstate->status; + int targetfreespace = delstate->targetfreespace; + + InitNonVacuumableSnapshot(SnapshotNonVacuumable, GlobalVisTestFor(rel)); + + /* + * Sort and shrink deltids array so that it consists only of TIDs from + * just a few of the most promising blocks for !alltids callers. (Just + * sort by TID for an alltids caller.) + */ + nblocksfavorable = heapam_index_delete_check_sort(rel, delstate); + for (int i = 0; i < delstate->ndeltids; i++) + { + TM_IndexStatus *dstatus = status + deltids[i].id; + ItemPointer htid = &deltids[i].tid; + ItemId hitemid; + OffsetNumber hoffnum; + HeapTupleData heapTuple; + + /* Only alltids callers may mark entries as deleteitup themselves */ + Assert(delstate->alltids || !dstatus->deleteitup); + + if (hblkno == InvalidBlockNumber || + ItemPointerGetBlockNumber(htid) != hblkno) + { + /* + * We usually do a little extra work on the final heap page after + * caller's target space to free has been reached. The cost of + * accessing the final heap page we'll need to visit has already + * been paid by that point. We finish off the entire final heap + * page because it's cheap to do so. + * + * We don't want to unnecessarily visit the next page in line. + * Handle that here (when we just finished final page). + */ + if (finalhpage) + break; + + /* + * Each time we're about to access a new page we consider if it's + * really worth it. We apply two tests before we visit the next + * page, and give up if either fails (when caller is an !alltids + * caller, also known as a bottom-up deletion caller): + * + * 1. Give up when we didn't enable our caller to free any + * additional space as a result of processing the most recent heap + * page visited. We expect to make steady progress or no + * progress. + * + * 2. Give up when MAX_DELETE_HEAP_BLOCKS have been accessed + * already, no matter what. (This is defensive, since the deltids + * array was shrunk before we started. It should now contain TIDs + * from pages not exceeding MAX_DELETE_HEAP_BLOCKS in number.) + */ + if (!delstate->alltids) + { + if (nblocksaccessed >= 1 && spacefreed == spacefreedbeforecurhpage) + break; + if (nblocksaccessed == MAX_DELETE_HEAP_BLOCKS) + break; + } + + /* + * After visiting and processing the first heap page, aggressively + * decay target space freed (the request from index AM caller) + * before accessing each new heap page (starting with the second + * in line). But only start decaying when we encounter our first + * non-favorable block. + * + * Favorable blocks are contiguous groups of heap blocks that are + * likely to have related heap tuples that are cheaper to process + * in larger batches. It doesn't make sense to be stingy here. + * The index AM may end up calling us about the same heap TIDs + * before much time has passed if we do that. + * + * Note that even favorable blocks are required to enable caller + * to free at least some space -- otherwise we give up before + * accessing the next block in line. 
If even a favorable block
+			 * cannot yield any freed space, then an old snapshot is
+			 * probably holding back progress here in general.
+			 */
+			if (nblocksfavorable == 0)
+			{
+				targetfreespace /= 2;
+
+				/* Must always start out with at least 1 favorable block */
+				Assert(nblocksaccessed >= 1);
+			}
+
+			/* Now access next page */
+			if (BufferIsValid(buf))
+			{
+				LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+				ReleaseBuffer(buf);
+			}
+
+			/*
+			 * We could prune the heap page in passing here, but that doesn't
+			 * seem like a good idea.  (Index AM caller is expected to hold
+			 * locks of its own.)
+			 */
+			hblkno = ItemPointerGetBlockNumber(htid);
+			buf = ReadBuffer(rel, hblkno);
+			hpage = BufferGetPage(buf);
+			nblocksaccessed++;
+			if (nblocksfavorable > 0)
+				nblocksfavorable--;
+			spacefreedbeforecurhpage = spacefreed;
+
+			/* Need to lock buffer for visibility checks */
+			LockBuffer(buf, BUFFER_LOCK_SHARE);
+		}
+
+		if (!dstatus->deleteitup)
+		{
+			ItemPointerData tmp;
+			bool		all_dead,
+						found;
+
+			tmp = *htid;		/* Don't modify htid */
+			all_dead = false;	/* Check that whole HOT chain is vacuumable */
+			found = heap_hot_search_buffer(&tmp, rel, buf,
+										   &SnapshotNonVacuumable, &heapTuple,
+										   &all_dead, true);
+
+			if (found || !all_dead)
+				continue;
+		}
+
+		/* Caller can delete this TID from index */
+		finalndeltids = i + 1;
+		dstatus->deleteitup = true;
+		spacefreed += dstatus->tupsize;
+
+		if (spacefreed >= targetfreespace && !delstate->alltids)
+		{
+			/*
+			 * Caller's free space target has now been met (maybe -- the
+			 * target may have decayed one or more times from its original
+			 * value if we weren't accessing favorable/contiguous blocks).
+			 *
+			 * Finish off the current/final heap page before returning.
+			 */
+			finalhpage = true;
+		}
+
+		/*
+		 * One more step is required for each TID that caller will delete.
+		 * Must maintain latestRemovedXid for caller's delete operation.
+		 */
+		hoffnum = ItemPointerGetOffsetNumber(htid);
+		hitemid = PageGetItemId(hpage, hoffnum);
+
+		while (ItemIdIsRedirected(hitemid))
+		{
+			hoffnum = ItemIdGetRedirect(hitemid);
+			hitemid = PageGetItemId(hpage, hoffnum);
+		}
+
+		/*
+		 * If the heap item has storage, then read the header and use that to
+		 * set latestRemovedXid.
+		 *
+		 * Some LP_DEAD items may not be accessible, so we ignore them.
+		 */
+		if (ItemIdHasStorage(hitemid))
+		{
+			HeapTupleHeader htuphdr;
+
+			htuphdr = (HeapTupleHeader) PageGetItem(hpage, hitemid);
+
+			HeapTupleHeaderAdvanceLatestRemovedXid(htuphdr, &latestRemovedXid);
+		}
+		else if (ItemIdIsDead(hitemid))
+		{
+			/*
+			 * Conjecture: if hitemid is dead then it had xids before the
+			 * xids marked on LP_NORMAL items.  So we just ignore this item
+			 * and move on to the next, for the purposes of calculating
+			 * latestRemovedXid.
+			 */
+		}
+		else
+			Assert(!ItemIdIsUsed(hitemid));
+	}
+
+	if (BufferIsValid(buf))
+	{
+		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+		ReleaseBuffer(buf);
+	}
+
+	/*
+	 * If all heap tuples were LP_DEAD then we will be returning
+	 * InvalidTransactionId here, which avoids conflicts.  This matches
+	 * existing logic which assumes that LP_DEAD tuples must already be older
+	 * than the latestRemovedXid on the cleanup record that set them as
+	 * LP_DEAD, hence must already have generated a conflict.
+	 */
+	delstate->ndeltids = finalndeltids;
+
+	return latestRemovedXid;
+}
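Aside, for illustration only (not part of the patch): the interplay of the
targetfreespace decay and favorable blocks above, simulated with entirely
made-up per-block numbers (the real code only learns how much space a block
frees as it processes the block's TIDs):

    #include <stdio.h>

    #define MAX_DELETE_HEAP_BLOCKS 4

    int
    main(void)
    {
        int     targetfreespace = 1000;
        int     nblocksfavorable = 2;   /* >= 1 by construction; say the
                                         * first two blocks are contiguous */
        int     freed[] = {400, 300, 250, 200};     /* hypothetical */
        int     spacefreed = 0;

        for (int b = 0; b < MAX_DELETE_HEAP_BLOCKS; b++)
        {
            /* decay target before each block beyond the favorable run */
            if (nblocksfavorable == 0)
                targetfreespace /= 2;
            else
                nblocksfavorable--;

            spacefreed += freed[b];
            printf("block %d: freed %d, (decayed) target %d\n",
                   b + 1, spacefreed, targetfreespace);

            if (spacefreed >= targetfreespace)
            {
                printf("target met -- finish this page, then stop\n");
                break;
            }
        }
        return 0;
    }

With these numbers the third block meets the halved target (950 >= 500),
even though the original target of 1000 was never reached.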
+
+/*
+ * Determine how many favorable blocks are among the blocks we'll access
+ * (which have been sorted by heapam_index_delete_check_sort() by the time
+ * we get called).  The exact approach taken by heapam_index_delete_check()
+ * is influenced by the number of favorable blocks.
+ *
+ * Returns the number of favorable blocks, starting from (and including) the
+ * first block in line for processing.
+ *
+ * Favorable blocks are contiguous heap blocks, which are likely to have
+ * relatively many dead items.  These blocks are cheaper to access together
+ * all at once.  Having many favorable blocks is common with low cardinality
+ * index tuples, where heap locality has a relatively large influence on
+ * which heap blocks we visit (and the order they're processed in).  Being
+ * more aggressive with favorable blocks is slightly more expensive in the
+ * short term, but less expensive across related
+ * heapam_index_delete_check() calls.
+ *
+ * Note: We always indicate that there is at least 1 favorable block (the
+ * first in line to process).  The first block is favorable by definition,
+ * since favorability is judged relative to the previous block, and there is
+ * no previous block to compare against.  This degenerate case isn't a
+ * problem for heapam_index_delete_check(), which is supposed to always
+ * visit the first heap page in line, regardless of any other factor.
+ */
+static int
+top_block_groups_favorable(IndexDeleteCounts *blockcounts, int nblockgroups,
+						   TM_IndexDelete *deltids)
+{
+	int			nblocksfavorable = 0;
+	BlockNumber lastblock = InvalidBlockNumber;
+
+	for (int b = 0; b < nblockgroups; b++)
+	{
+		IndexDeleteCounts *blockgroup = blockcounts + b;
+		TM_IndexDelete *firstgroup = deltids + blockgroup->ideltids;
+		BlockNumber thisblock = ItemPointerGetBlockNumber(&firstgroup->tid);
+
+		if (BlockNumberIsValid(lastblock) &&
+			(thisblock < lastblock ||
+			 thisblock > lastblock + FAVORABLE_BLOCK_STRIDE))
+			break;
+
+		nblocksfavorable++;
+		lastblock = Min(thisblock, MaxBlockNumber - FAVORABLE_BLOCK_STRIDE);
+	}
+
+	Assert(nblocksfavorable >= 1);
+
+	return nblocksfavorable;
+}
+
+static inline int
+indexdelete_tids_cmp(ItemPointer tid1, ItemPointer tid2)
+{
+	{
+		BlockNumber blk1 = ItemPointerGetBlockNumber(tid1);
+		BlockNumber blk2 = ItemPointerGetBlockNumber(tid2);
+
+		if (blk1 != blk2)
+			return (blk1 < blk2) ? -1 : 1;
+	}
+	{
+		OffsetNumber pos1 = ItemPointerGetOffsetNumber(tid1);
+		OffsetNumber pos2 = ItemPointerGetOffsetNumber(tid2);
+
+		if (pos1 != pos2)
+			return (pos1 < pos2) ? -1 : 1;
+	}
+
+	/* Every TID in the deltids array is expected to be unique */
+	pg_unreachable();
+
+	return 0;
+}
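Aside, for illustration only (not part of the patch): the comparator that
follows buckets its ntids tiebreaker by rounding counts up to the nearest
power of two.  A standalone sketch of that bucketing, where
next_power_of_2() is a portable stand-in for PostgreSQL's
pg_nextpower2_32():

    #include <stdio.h>

    static unsigned int
    next_power_of_2(unsigned int n)
    {
        unsigned int p = 1;

        while (p < n)
            p *= 2;
        return p;
    }

    int
    main(void)
    {
        /* counts of 5, 6, 7, and 8 all land in the same "8" bucket */
        for (unsigned int n = 1; n <= 9; n++)
            printf("%u -> bucket %u\n", n, next_power_of_2(n));
        return 0;
    }

The effect is that small differences between otherwise similar counts are
ignored, so ties fall through to the next comparison rule.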
+
+static inline int
+indexdeletecount_cmp(IndexDeleteCounts *count1, IndexDeleteCounts *count2)
+{
+	uint32		ntids1,
+				ntids2;
+
+	/* We expect power-of-two values for npromisingtids fields */
+	Assert(count1->npromisingtids == 0 ||
+		   ((count1->npromisingtids - 1) & count1->npromisingtids) == 0);
+	Assert(count2->npromisingtids == 0 ||
+		   ((count2->npromisingtids - 1) & count2->npromisingtids) == 0);
+
+	/*
+	 * Most significant field is npromisingtids, which we sort on in desc
+	 * order.  The usual asc comparison order is deliberately inverted here.
+	 */
+	if (count1->npromisingtids > count2->npromisingtids)
+		return -1;
+	if (count1->npromisingtids < count2->npromisingtids)
+		return 1;
+
+	/*
+	 * Tiebreak: desc ntids sort order.
+	 *
+	 * We cannot expect power-of-two values for the ntids fields.  We behave
+	 * as if they had already been rounded up for us instead.
+	 */
+	ntids1 = count1->ntids;
+	ntids2 = count2->ntids;
+	if (ntids1 != ntids2)
+	{
+		ntids1 = pg_nextpower2_32(ntids1);
+		ntids2 = pg_nextpower2_32(ntids2);
+
+		if (ntids1 > ntids2)
+			return -1;
+		if (ntids1 < ntids2)
+			return 1;
+	}
+
+	/*
+	 * Tiebreak: asc offset-into-deltids-for-block (offset to first TID for
+	 * block in deltids array) order.
+	 *
+	 * This is equivalent to sorting in ascending heap block number order
+	 * (among otherwise equal subsets of the array).  This approach allows
+	 * us to avoid accessing the out-of-line TID.  (We rely on the
+	 * assumption that the deltids array was sorted in ascending heap TID
+	 * order when these offsets to the first TID from each heap block group
+	 * were formed.)
+	 */
+	if (count1->ideltids > count2->ideltids)
+		return 1;
+	if (count1->ideltids < count2->ideltids)
+		return -1;
+
+	pg_unreachable();
+
+	return 0;
+}
+
+/*
+ * Two hand-written shellsort implementations.
+ *
+ * The two sort operations needed by heapam_index_delete_check_sort() become
+ * quite noticeable in profiles of workloads with lots of index contention
+ * caused by non-HOT updates.  Keeping costs down is important enough to
+ * justify several micro-optimizations.  We could just use qsort() instead,
+ * but the indirection that it imposes is expensive enough to matter here.
+ * (The size of the array elements also matters, which is why we keep it
+ * under 8 bytes -- swaps should be as fast as reasonably possible.)
+ *
+ * We use shellsort here because it has many of the same strengths as an
+ * industrial-strength quicksort implementation, but is also lightweight in
+ * the sense that the entire implementation compiles to relatively few
+ * machine instructions.  It is adaptive to inputs with some presorted
+ * subsets (which are typical here).
+ *
+ * This implementation is fast with array sizes up to about 1900.  This
+ * covers all supported BLCKSZ values.
+ */
+static void
+heap_tid_shellsort(TM_IndexDelete *deltids, int ndeltids)
+{
+	int			low = 0;
+
+	/* Think carefully before changing anything here */
+	StaticAssertStmt(sizeof(TM_IndexDelete) <= 8,
+					 "element size exceeds 8 bytes");
+
+	for (int g = 0; g < lengthof(ShellsortGaps); g++)
+	{
+		for (int hi = ShellsortGaps[g], i = low + hi; i < ndeltids; i++)
+		{
+			TM_IndexDelete d = deltids[i];
+			int			j = i;
+
+			while (j >= hi &&
+				   indexdelete_tids_cmp(&deltids[j - hi].tid, &d.tid) >= 0)
+			{
+				deltids[j] = deltids[j - hi];
+				j -= hi;
+			}
+			deltids[j] = d;
+		}
+	}
+}
+
+static void
+index_delete_shellsort(IndexDeleteCounts *blockcounts, int nblockgroups)
+{
+	int			low = 0;
+
+	/* Think carefully before changing anything here */
+	StaticAssertStmt(sizeof(IndexDeleteCounts) <= 8,
+					 "element size exceeds 8 bytes");
+
+	for (int g = 0; g < lengthof(ShellsortGaps); g++)
+	{
+		for (int hi = ShellsortGaps[g], i = low + hi; i < nblockgroups; i++)
+		{
+			IndexDeleteCounts c = blockcounts[i];
+			int			j = i;
+
+			while (j >= hi &&
+				   indexdeletecount_cmp(&blockcounts[j - hi], &c) >= 0)
+			{
+				blockcounts[j] = blockcounts[j - hi];
+				j -= hi;
+			}
+			blockcounts[j] = c;
+		}
+	}
+}
+
+/*
+ * heapam_index_delete_check() helper function.  Sorts the deltids array in
+ * the order needed for useful processing.
+ *
+ * Groups heap TIDs from deltids into heap block number groupings.  From
+ * there, sorts each heap block grouping by the total number of "promising"
+ * TIDs it contains (in desc order).  For blocks with the same number of
+ * promising TIDs, tiebreak on the total heap TID count (also desc order).
+ *
+ * heapam_index_delete_check() only visits up to MAX_DELETE_HEAP_BLOCKS heap
+ * blocks, due to the speculative nature of the batch index deletion
+ * optimization.  These heap blocks had better be the most promising
+ * available, based on a variety of criteria.  We make sure of that here.
+ *
+ * Sets the new size of the deltids array (ndeltids) in state. 
deltids will only have + * TIDs from the MAX_DELETE_HEAP_BLOCKS most promising heap blocks when we + * return (which is usually far fewer). + * + * Returns number of "favorable" blocks. + */ +static int +heapam_index_delete_check_sort(Relation rel, TM_IndexDeleteOp *delstate) +{ + IndexDeleteCounts *blockcounts; + TM_IndexDelete *reordereddeltids; + BlockNumber curblock = InvalidBlockNumber; + int nblockgroups = 0; + int ncopied = 0; + int nblocksfavorable = 0; +#ifdef USE_PREFETCH + int prefetch_distance; +#endif + + Assert(delstate->ndeltids > 0); + + /* First sort caller's array by TID */ + heap_tid_shellsort(delstate->deltids, delstate->ndeltids); + + /* alltids caller visits all blocks, so make sure that happens */ + if (delstate->alltids) + return delstate->ndeltids; + + /* Calculate per-heap-block count of TIDs */ + blockcounts = palloc(sizeof(IndexDeleteCounts) * delstate->ndeltids); + for (int i = 0; i < delstate->ndeltids; i++) + { + ItemPointer deltid = &delstate->deltids[i].tid; + TM_IndexStatus *dstatus = delstate->status + delstate->deltids[i].id; + bool ispromising = dstatus->ispromising; + + if (curblock != ItemPointerGetBlockNumber(deltid)) + { + /* New block group */ + nblockgroups++; + + Assert(curblock < ItemPointerGetBlockNumber(deltid) || + !BlockNumberIsValid(curblock)); + + curblock = ItemPointerGetBlockNumber(deltid); + blockcounts[nblockgroups - 1].ideltids = i; + blockcounts[nblockgroups - 1].ntids = 1; + blockcounts[nblockgroups - 1].npromisingtids = 0; + } + else + { + blockcounts[nblockgroups - 1].ntids++; + } + + if (ispromising) + blockcounts[nblockgroups - 1].npromisingtids++; + } + + /* + * We're about ready to use index_delete_shellsort() to determine the + * optimal order for visiting heap pages. But before we do, round the + * number of promising tuples for each block group up to the nearest + * power-of-two (unless there are zero promising tuples). This scheme + * usefully divides heap pages into buckets. Each bucket contains heap + * pages that are approximately equally promising, that we want to treat + * as exactly equivalent (at least initially). + * + * While in general the presence of promising tuples (the hint that index + * AMs provide) is the best information that we have to go on, it is based + * on simple heuristics about duplicates in indexes that are understood to + * have specific flaws. We should not let the most promising heap pages + * win or lose on the basis of _relatively_ small differences in the total + * number of promising tuples. Small differences between the most + * promising few heap pages are effectively ignored by applying this + * power-of-two bucketing scheme. + * + * When we have lots of ties on the final bucket-ized npromisingtids among + * the most promising heap pages, we let heap locality determine the order + * in which we visit heap pages. This is helpful because it exploits the + * natural tendency for earlier heap blocks to accumulate more LP_DEAD + * items sooner in workloads with many non-HOT updates. It's also helpful + * because the effect over time is that we process related heap blocks + * sequentially, possibly with multiple rounds of processing over the same + * related heap blocks that are subject to continuous non-HOT updates over + * time. + * + * Note that we effectively have the same power-of-two bucketing scheme + * with the ntids field (which is compared after npromisingtids). 
The
+	 * only reason that we don't round ntids up here too is that the
+	 * original values will be needed when copying the final TIDs from the
+	 * winning block groups back into caller's deltids array.
+	 */
+	for (int b = 0; b < nblockgroups; b++)
+	{
+		IndexDeleteCounts *blockgroup = blockcounts + b;
+
+		if (blockgroup->npromisingtids != 0)
+			blockgroup->npromisingtids =
+				pg_nextpower2_32((uint32) blockgroup->npromisingtids);
+	}
+
+	/* Sort groups and rearrange caller's deltids array */
+	index_delete_shellsort(blockcounts, nblockgroups);
+	reordereddeltids = palloc(delstate->ndeltids * sizeof(TM_IndexDelete));
+
+	nblockgroups = Min(MAX_DELETE_HEAP_BLOCKS, nblockgroups);
+	/* Determine number of favorable blocks at the start of array */
+	nblocksfavorable = top_block_groups_favorable(blockcounts, nblockgroups,
+												  delstate->deltids);
+
+#ifdef USE_PREFETCH
+	/* Compute the prefetch distance that we will attempt to maintain */
+	if (IsCatalogRelation(rel))
+		prefetch_distance = maintenance_io_concurrency;
+	else
+		prefetch_distance =
+			get_tablespace_maintenance_io_concurrency(rel->rd_rel->reltablespace);
+
+	prefetch_distance = Min(prefetch_distance, nblockgroups);
+#endif
+
+	for (int b = 0; b < nblockgroups; b++)
+	{
+		IndexDeleteCounts *blockgroup = blockcounts + b;
+		TM_IndexDelete *firstgroup = delstate->deltids + blockgroup->ideltids;
+
+		memcpy(reordereddeltids + ncopied, firstgroup,
+			   sizeof(TM_IndexDelete) * blockgroup->ntids);
+		ncopied += blockgroup->ntids;
+
+#ifdef USE_PREFETCH
+		if (prefetch_distance-- > 0)
+		{
+			BlockNumber hblock = ItemPointerGetBlockNumber(&firstgroup->tid);
+
+			PrefetchBuffer(rel, MAIN_FORKNUM, hblock);
+		}
+#endif
+	}
+
+	/* Copy final grouped and sorted TIDs back into start of caller's array */
+	memcpy(delstate->deltids, reordereddeltids,
+		   sizeof(TM_IndexDelete) * ncopied);
+	delstate->ndeltids = ncopied;
+
+	/* be tidy */
+	pfree(reordereddeltids);
+	pfree(blockcounts);
+
+	return nblocksfavorable;
+}
+
 /*
  * Perform XLogInsert to register a heap cleanup info message. These
  * messages are sent once per VACUUM and are required because
diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index 3eea215b85..a4069d2ce0 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -2557,6 +2557,7 @@ static const TableAmRoutine heapam_methods = {
 	.tuple_delete = heapam_tuple_delete,
 	.tuple_update = heapam_tuple_update,
 	.tuple_lock = heapam_tuple_lock,
+	.index_delete_check = heapam_index_delete_check,
 
 	.tuple_fetch_row_version = heapam_fetch_row_version,
 	.tuple_get_latest_tid = heap_get_latest_tid,
diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c
index 6438c45716..e19bdd246a 100644
--- a/src/backend/access/table/tableam.c
+++ b/src/backend/access/table/tableam.c
@@ -207,9 +207,9 @@ table_beginscan_parallel(Relation relation, ParallelTableScanDesc parallel_scan)
 /*
  * To perform that check simply start an index scan, create the necessary
  * slot, do the heap lookup, and shut everything down again. This could be
- * optimized, but is unlikely to matter from a performance POV. If there
- * frequently are live index pointers also matching a unique index key, the
- * CPU overhead of this routine is unlikely to matter.
+ * optimized, but is unlikely to matter from a performance POV. Note that
+ * table_index_delete_check() is optimized in this way, since it is designed
+ * as a batch operation.
* * Note that *tid may be modified when we return true if the AM supports * storing multiple row versions reachable via a single index entry (like -- 2.25.1
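Aside, for illustration only (not part of the patch): the overall block
group ordering produced by the indexdeletecount_cmp() rules, shown with
qsort() for brevity (the patch uses its own shellsort instead).  The field
values are made-up; next_power_of_2() again stands in for
pg_nextpower2_32():

    #include <stdio.h>
    #include <stdlib.h>

    typedef struct
    {
        int     npromisingtids; /* pre-rounded to a power of 2, as in the
                                 * patch's sort helper */
        int     ntids;
        int     ideltids;       /* tracks ascending heap block order */
    } Group;

    static unsigned int
    next_power_of_2(unsigned int n)
    {
        unsigned int p = 1;

        while (p < n)
            p *= 2;
        return p;
    }

    static int
    group_cmp(const void *a, const void *b)
    {
        const Group *g1 = (const Group *) a;
        const Group *g2 = (const Group *) b;
        unsigned int b1,
                    b2;

        /* desc npromisingtids order */
        if (g1->npromisingtids != g2->npromisingtids)
            return (g1->npromisingtids > g2->npromisingtids) ? -1 : 1;

        /* tiebreak: desc ntids order, bucketed by power of 2 */
        b1 = next_power_of_2((unsigned int) g1->ntids);
        b2 = next_power_of_2((unsigned int) g2->ntids);
        if (b1 != b2)
            return (b1 > b2) ? -1 : 1;

        /* final tiebreak: asc ideltids, i.e. ascending heap block order */
        return (g1->ideltids > g2->ideltids) - (g1->ideltids < g2->ideltids);
    }

    int
    main(void)
    {
        Group   groups[] = {
            {4, 10, 30}, {8, 3, 20}, {4, 12, 10}, {8, 4, 0},
        };

        qsort(groups, 4, sizeof(Group), group_cmp);
        for (int i = 0; i < 4; i++)
            printf("npromising=%d ntids=%d ideltids=%d\n",
                   groups[i].npromisingtids, groups[i].ntids,
                   groups[i].ideltids);
        return 0;
    }

Note how the two groups with 8 promising TIDs sort first, and how the 10
vs. 12 ntids difference between the remaining groups falls into the same
"16" bucket, so earlier heap blocks (smaller ideltids) win the tie.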