From 09c4ccf9f02e1a3b1e00e696ad7da1a96a4da0aa Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Mon, 9 Nov 2020 12:59:29 -0800 Subject: [PATCH v9 1/4] Make tableam interface support bottom-up deletion. Teach tableam about bottom-up index deletion. This mechanism allows an index AM to cooperate with a tableam in deleting index tuples at regular intervals. The general idea is to avoid accumulating too many versions in the index for any given logical row, without doing any extra work before the situation gets out of hand at a localized page in an index. This commit isn't useful on its own. An upcoming commit will add support to nbtree. A further commit will provide complete functionality by adding support to heapam. Author: Peter Geoghegan Reviewed-By: Victor Yegorov Discussion: https://postgr.es/m/CAH2-Wzm+maE3apHB8NOtmM=p-DO65j2V5GzAWCOEEuy3JZgb2g@mail.gmail.com --- src/include/access/tableam.h | 99 ++++++++++++++++++++++++ src/backend/access/heap/heapam_handler.c | 1 + src/backend/access/table/tableam.c | 6 +- 3 files changed, 103 insertions(+), 3 deletions(-) diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 387eb34a61..5cd698c885 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -128,6 +128,75 @@ typedef struct TM_FailureData bool traversed; } TM_FailureData; +/* + * State used when calling table_index_delete_check() to perform "bottom up" + * deletion of duplicate index tuples. State is initialized by index AM + * caller, while state is finalized by tableam, which modifies state. + */ +typedef struct TM_IndexDelete +{ + ItemPointerData tid; /* table TID from index tuple */ + int16 id; /* Offset into TM_IndexStatus array */ +} TM_IndexDelete; + +typedef struct TM_IndexStatus +{ + OffsetNumber idxoffnum; /* Index am page offset number */ + int16 tupsize; /* Space freed in index if tuple deleted */ + bool ispromising; /* Is a duplicate within index? */ + bool deleteitup; /* Was tableam tuple found dead? 
*/ +} TM_IndexStatus; + +/* + * State representing one single bottom-up index deletion operation. + * + * Index am caller provides a TM_IndexDeleteOp, which points to two palloc()'d + * arrays. Each array has one entry per TID that the tableam is asked to + * consider (typically these are all of the TIDs from a single index page, so + * there could be hundreds or even thousands of entries in arrays). ndeltids + * tracks the current number of entries. + * + * The two arrays are conceptually one single array. Two arrays/structs are + * used for performance reasons. (We really need to keep the TM_IndexDelete + * struct small so that the tableam can do an initial sort by TID as quickly + * as possible.) + * + * The index AM should keep track of which index tuple relates to which entry + * by setting idxoffnum (and/or relying on each entry being uniquely + * identifiable using tid). Index AM requests target free space indicated by + * "targetfreespace". Index AM also represents the space saving for each TID + * by filling in the tupsize for each array element. The tableam must balance + * the requirements of the index AM against the costs paid in the tableam. + * + * The index AM provides strong hints about where to look to the tableam by + * marking some entries as "promising". Index AM does this with duplicate + * index tuples that are strongly suspected to be old versions left behind by + * UPDATEs that did not logically change any indexed values. Index AM may + * find it helpful to only mark TIDs/entries as promising when they're thought + * to have been affected by such an UPDATE in the recent past. + * + * The tableam marks individual entries as deletable for the index AM. It's + * common for the final array to be shrunk in size. The index AM caller + * should do nothing if on return delstate.ndeltids is found set to zero. 
The + * index AM caller only needs to consider the first ndeltids from the final + * array, which is typically much smaller than its original size (tableam + * updates ndeltids in state). One reason for this is that the tableam can + * naturally only afford to access a few tableam blocks on each call -- it typically + * won't even try to check most of the entries from the tableam. + * + * The tableam typically sorts the delstate.deltids array by TID. The index + * AM should be prepared to restore the array to its useful/original order. + * The array is typically far smaller than its original size by then, so that + * step should be relatively fast. + */ +typedef struct TM_IndexDeleteOp +{ + int ndeltids; /* Number of deltids/status for op */ + TM_IndexDelete *deltids; + TM_IndexStatus *status; + int targetfreespace; /* Guides tableam on requirements */ +} TM_IndexDeleteOp; + /* "options" flag bits for table_tuple_insert */ /* TABLE_INSERT_SKIP_WAL was 0x0001; RelationNeedsWAL() now governs */ #define TABLE_INSERT_SKIP_FSM 0x0002 @@ -409,6 +478,17 @@ typedef struct TableAmRoutine uint8 flags, TM_FailureData *tmfd); + /* + * Help index AMs to perform bottom-up index deletion. + * + * Optional callback. See TM_IndexDeleteOp struct for full details. + * + * Returns a latestRemovedXid transaction ID that index AM must use to + * generate a recovery conflict when required. + */ + TransactionId (*index_delete_check) (Relation rel, + TM_IndexDeleteOp *delstate); + /* * Perform operations necessary to complete insertions made via * tuple_insert and multi_insert with a BulkInsertState specified. In-tree @@ -1363,6 +1443,25 @@ table_tuple_lock(Relation rel, ItemPointer tid, Snapshot snapshot, flags, tmfd); } +/* + * Bottom-up index deletion interface for index AMs. + * + * Sets deletable tuples in entries from caller's TM_IndexDeleteOp state that + * are found to point to already-dead tuples in the tableam structure. + * + * See TM_IndexDeleteOp struct for full details. 
+ */ +static inline TransactionId +table_index_delete_check(Relation rel, TM_IndexDeleteOp *delstate) +{ + /* optional callback */ + if (rel->rd_tableam && rel->rd_tableam->index_delete_check) + return rel->rd_tableam->index_delete_check(rel, delstate); + + delstate->ndeltids = 0; + return InvalidTransactionId; +} + /* * Perform operations necessary to complete insertions made via * tuple_insert and multi_insert with a BulkInsertState specified. diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index dcaea7135f..a08c494034 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -2532,6 +2532,7 @@ static const TableAmRoutine heapam_methods = { .tuple_delete = heapam_tuple_delete, .tuple_update = heapam_tuple_update, .tuple_lock = heapam_tuple_lock, + .index_delete_check = NULL, .tuple_fetch_row_version = heapam_fetch_row_version, .tuple_get_latest_tid = heap_get_latest_tid, diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c index 6438c45716..e19bdd246a 100644 --- a/src/backend/access/table/tableam.c +++ b/src/backend/access/table/tableam.c @@ -207,9 +207,9 @@ table_beginscan_parallel(Relation relation, ParallelTableScanDesc parallel_scan) /* * To perform that check simply start an index scan, create the necessary * slot, do the heap lookup, and shut everything down again. This could be - * optimized, but is unlikely to matter from a performance POV. If there - * frequently are live index pointers also matching a unique index key, the - * CPU overhead of this routine is unlikely to matter. + * optimized, but is unlikely to matter from a performance POV. Note that + * table_index_delete_check() is optimized in this way, since it is designed + * as a batch operation. * * Note that *tid may be modified when we return true if the AM supports * storing multiple row versions reachable via a single index entry (like -- 2.25.1