From 09c4ccf9f02e1a3b1e00e696ad7da1a96a4da0aa Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Mon, 9 Nov 2020 12:59:29 -0800 Subject: [PATCH v9 1/4] Make tableam interface support bottom-up deletion. Teach tableam about bottom-up index deletion. This mechanism allows an index AM to cooperate with a tableam in deleting index tuples at regular intervals. The general idea is to avoid accumulating too many versions in the index for any given logical row, without doing any extra work before the situation gets out of hand at a localized page in an index. This commit isn't useful on its own. An upcoming commit will add support to nbtree. A further commit will provide complete functionality by adding support to heapam. Author: Peter Geoghegan Reviewed-By: Victor Yegorov Discussion: https://postgr.es/m/CAH2-Wzm+maE3apHB8NOtmM=p-DO65j2V5GzAWCOEEuy3JZgb2g@mail.gmail.com --- src/include/access/tableam.h | 99 ++++++++++++++++++++++++ src/backend/access/heap/heapam_handler.c | 1 + src/backend/access/table/tableam.c | 6 +- 3 files changed, 103 insertions(+), 3 deletions(-) diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 387eb34a61..5cd698c885 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -128,6 +128,75 @@ typedef struct TM_FailureData bool traversed; } TM_FailureData; +/* + * State used when calling table_index_delete_check() to perform "bottom up" + * deletion of duplicate index tuples. State is initialized by index AM + * caller, while state is finalized by tableam, which modifies state. + */ +typedef struct TM_IndexDelete +{ + ItemPointerData tid; /* table TID from index tuple */ + int16 id; /* Offset into TM_IndexStatus array */ +} TM_IndexDelete; + +typedef struct TM_IndexStatus +{ + OffsetNumber idxoffnum; /* Index am page offset number */ + int16 tupsize; /* Space freed in index if tuple deleted */ + bool ispromising; /* Is a duplicate within index? */ + bool deleteitup; /* Was tableam tuple found dead? 
*/ +} TM_IndexStatus; + +/* + * State representing one single bottom-up index deletion operation. + * + * Index am caller provides a TM_IndexDeleteOp, which points to two palloc()'d + * arrays. Each array has one entry per TID that the tableam is asked to + * consider (typically these are all of the TIDs from a single index page, so + * there could be hundreds or even thousands of entries in arrays). ndeltids + * tracks the current number of entries. + * + * The two arrays are conceptually one single array. Two arrays/structs are + * used for performance reasons. (We really need to keep the TM_IndexDelete + * struct small so that the tableam can do an initial sort by TID as quickly + * as possible.) + * + * The index AM should keep track of which index tuple relates to which entry + * by setting idxoffnum (and/or relying on each entry being uniquely + * identifiable using tid). Index AM requests target free space indicated by + * "targetfreespace". Index AM also represents the space saving for each TID + * by filling in the tupsize for each array element. The tableam must balance + * the requirements of the index AM against the costs paid in the tableam. + * + * The index AM provides strong hints about where to look to the tableam by + * marking some entries as "promising". Index AM does this with duplicate + * index tuples that are strongly suspected to be old versions left behind by + * UPDATEs that did not logically change any indexed values. Index AM may + * find it helpful to only mark TIDs/entries as promising when they're thought + * to have been affected by such an UPDATE in the recent past. + * + * The tableam marks individual entries as deletable for the index AM. It's + * common for the final array to be shrunk in size. The index AM caller + * should do nothing if on return delstate.ndeltids is found set to zero. 
The + * index AM caller only needs to consider the first ndeltids from the final + * array, which is typically much smaller than its original size (tableam + * updates ndeltids in state). One reason for this is that the tableam can + * naturally only afford to access a few tableam blocks on each call -- it typically + * won't even try to check most of the entries from the tableam. + * + * The tableam typically sorts the delstate.deltids array by TID. The index + * AM should be prepared to restore the array to its useful/original order. + * The array is typically far smaller than its original size by then, so that + * step should be relatively fast. + */ +typedef struct TM_IndexDeleteOp +{ + int ndeltids; /* Number of deltids/status for op */ + TM_IndexDelete *deltids; + TM_IndexStatus *status; + int targetfreespace; /* Guides tableam on requirements */ +} TM_IndexDeleteOp; + /* "options" flag bits for table_tuple_insert */ /* TABLE_INSERT_SKIP_WAL was 0x0001; RelationNeedsWAL() now governs */ #define TABLE_INSERT_SKIP_FSM 0x0002 @@ -409,6 +478,17 @@ typedef struct TableAmRoutine uint8 flags, TM_FailureData *tmfd); + /* + * Help index AMs to perform bottom-up index deletion. + * + * Optional callback. See TM_IndexDeleteOp struct for full details. + * + * Returns a latestRemovedXid transaction ID that index AM must use to + * generate a recovery conflict when required. + */ + TransactionId (*index_delete_check) (Relation rel, + TM_IndexDeleteOp *delstate); + /* * Perform operations necessary to complete insertions made via * tuple_insert and multi_insert with a BulkInsertState specified. In-tree @@ -1363,6 +1443,25 @@ table_tuple_lock(Relation rel, ItemPointer tid, Snapshot snapshot, flags, tmfd); } +/* + * Bottom-up index deletion interface for index AMs. + * + * Sets deletable tuples in entries from caller's TM_IndexDeleteOp state that + * are found to point to already-dead tuples in the tableam structure. + * + * See TM_IndexDeleteOp struct for full details. 
+ */ +static inline TransactionId +table_index_delete_check(Relation rel, TM_IndexDeleteOp *delstate) +{ + /* optional callback */ + if (rel->rd_tableam && rel->rd_tableam->index_delete_check) + return rel->rd_tableam->index_delete_check(rel, delstate); + + delstate->ndeltids = 0; + return InvalidTransactionId; +} + /* * Perform operations necessary to complete insertions made via * tuple_insert and multi_insert with a BulkInsertState specified. diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index dcaea7135f..a08c494034 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -2532,6 +2532,7 @@ static const TableAmRoutine heapam_methods = { .tuple_delete = heapam_tuple_delete, .tuple_update = heapam_tuple_update, .tuple_lock = heapam_tuple_lock, + .index_delete_check = NULL, .tuple_fetch_row_version = heapam_fetch_row_version, .tuple_get_latest_tid = heap_get_latest_tid, diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c index 6438c45716..e19bdd246a 100644 --- a/src/backend/access/table/tableam.c +++ b/src/backend/access/table/tableam.c @@ -207,9 +207,9 @@ table_beginscan_parallel(Relation relation, ParallelTableScanDesc parallel_scan) /* * To perform that check simply start an index scan, create the necessary * slot, do the heap lookup, and shut everything down again. This could be - * optimized, but is unlikely to matter from a performance POV. If there - * frequently are live index pointers also matching a unique index key, the - * CPU overhead of this routine is unlikely to matter. + * optimized, but is unlikely to matter from a performance POV. Note that + * table_index_delete_check() is optimized in this way, since it is designed + * as a batch operation. * * Note that *tid may be modified when we return true if the AM supports * storing multiple row versions reachable via a single index entry (like -- 2.25.1