From 7709510dab61f471b0a182a325adc39fe0cb63d4 Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Tue, 30 Jun 2020 16:29:27 -0700 Subject: [PATCH v5 2/2] Add delete deduplication to nbtree. Repurpose deduplication infrastructure to delete items in indexes at the point where we'd usually have to split the page, even when they don't have their LP_DEAD bits set. Testing has shown that this is almost completely effective at preventing "version index bloat" from non-HOT updates, provided there are no long running transactions. This is primarily valuable with leaf pages that contain mostly-distinct index tuples, particularly with unique indexes. It is intended to complement deduplication. Heuristics are used to guess which index tuples are likely to point to no longer needed old table row versions. Note that INCLUDE indexes support the optimization. --- src/include/access/genam.h | 15 + src/include/access/heapam.h | 8 +- src/include/access/nbtree.h | 11 +- src/include/access/nbtxlog.h | 7 +- src/include/access/tableam.h | 53 +- src/include/executor/executor.h | 3 +- src/backend/access/heap/heapam.c | 406 ++++++++++++- src/backend/access/heap/heapam_handler.c | 5 +- src/backend/access/nbtree/README | 33 +- src/backend/access/nbtree/nbtdedup.c | 707 +++++++++++++++++++++-- src/backend/access/nbtree/nbtinsert.c | 62 +- src/backend/access/nbtree/nbtpage.c | 122 +++- src/backend/access/nbtree/nbtsort.c | 12 +- src/backend/access/nbtree/nbtxlog.c | 55 +- src/backend/access/table/tableam.c | 30 +- src/backend/commands/copy.c | 5 +- src/backend/executor/execIndexing.c | 41 +- src/backend/executor/execReplication.c | 4 +- src/backend/executor/nodeModifyTable.c | 14 +- 19 files changed, 1475 insertions(+), 118 deletions(-) diff --git a/src/include/access/genam.h b/src/include/access/genam.h index 68d90f5141..7002da0716 100644 --- a/src/include/access/genam.h +++ b/src/include/access/genam.h @@ -108,10 +108,25 @@ typedef struct ParallelIndexScanDescData *ParallelIndexScanDesc; * call is made with UNIQUE_CHECK_EXISTING. The tuple is already in the * index in this case, so it should not be inserted again. Rather, just * check for conflicting live tuples (possibly blocking). + * + * UNIQUE_CHECK_NO indicates the absence of any unique checking. + * UNIQUE_CHECK_NO_WITH_UNCHANGED is a variant of UNIQUE_CHECK_NO that + * indicates that the index tuple comes from an UPDATE that did not modify + * the row in respect of any columns that are indexed. The implementation + * requires a successor version, but there is no logical change. Some + * index access AMs can use this as hint that can trigger optimizations. + * + * XXX: Adding UNIQUE_CHECK_NO_WITH_UNCHANGED like this kind of makes + * sense, since it's pretty natural to leave it up to index AMs to figure + * it out with unique indexes. But what about when we insert NULLs into a + * unique index? Isn't that case UNIQUE_CHECK_YES, and yet also a thing + * that nbtree pretty much treats as UNIQUE_CHECK_NO once it sees that the + * index tuple has NULLs? 
*/ typedef enum IndexUniqueCheck { UNIQUE_CHECK_NO, /* Don't do any uniqueness checking */ + UNIQUE_CHECK_NO_WITH_UNCHANGED, /* "No logical change" duplicate */ UNIQUE_CHECK_YES, /* Enforce uniqueness at insertion time */ UNIQUE_CHECK_PARTIAL, /* Test uniqueness, but no error */ UNIQUE_CHECK_EXISTING /* Check if existing tuple is unique */ diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index 92b19dba32..7713110ebe 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -148,7 +148,8 @@ extern void heap_abort_speculative(Relation relation, ItemPointer tid); extern TM_Result heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, CommandId cid, Snapshot crosscheck, bool wait, - struct TM_FailureData *tmfd, LockTupleMode *lockmode); + struct TM_FailureData *tmfd, LockTupleMode *lockmode, + Bitmapset **modified_attrs_hint); extern TM_Result heap_lock_tuple(Relation relation, HeapTuple tuple, CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy, bool follow_update, @@ -170,6 +171,11 @@ extern void simple_heap_update(Relation relation, ItemPointer otid, extern TransactionId heap_compute_xid_horizon_for_tuples(Relation rel, ItemPointerData *items, int nitems); +extern TransactionId heap_index_batch_check(Relation rel, TM_IndexDelete *deltids, + TM_IndexDeleteStatus *statusdeltids, + int ndeltids, Snapshot snapshot, + int minscore, bool uniqueindex, + int *ntiddeletes, int *finalndeltids); /* in heap/pruneheap.c */ struct GlobalVisState; diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 65d9698b89..b2279cccb2 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -77,6 +77,7 @@ typedef BTPageOpaqueData *BTPageOpaque; #define BTP_SPLIT_END (1 << 5) /* rightmost page of split group */ #define BTP_HAS_GARBAGE (1 << 6) /* page has LP_DEAD tuples */ #define BTP_INCOMPLETE_SPLIT (1 << 7) /* right sibling's downlink is missing */ +#define BTP_HAS_DUPS (1 << 8) /* page has at least one version duplicate */ /* * The max allowed value of a cycle ID is a bit less than 64K. 
This is @@ -219,6 +220,7 @@ typedef struct BTMetaPageData #define P_IGNORE(opaque) (((opaque)->btpo_flags & (BTP_DELETED|BTP_HALF_DEAD)) != 0) #define P_HAS_GARBAGE(opaque) (((opaque)->btpo_flags & BTP_HAS_GARBAGE) != 0) #define P_INCOMPLETE_SPLIT(opaque) (((opaque)->btpo_flags & BTP_INCOMPLETE_SPLIT) != 0) +#define P_HAS_DUPS(opaque) (((opaque)->btpo_flags & BTP_HAS_DUPS) != 0) /* * Lehman and Yao's algorithm requires a ``high key'' on every non-rightmost @@ -1029,11 +1031,12 @@ extern void _bt_parallel_advance_array_keys(IndexScanDesc scan); */ extern void _bt_dedup_one_page(Relation rel, Buffer buf, Relation heapRel, IndexTuple newitem, Size newitemsz, - bool checkingunique); + bool checkingunique, bool dedupdelete, + bool allequalimage); extern void _bt_dedup_start_pending(BTDedupState state, IndexTuple base, OffsetNumber baseoff); extern bool _bt_dedup_save_htid(BTDedupState state, IndexTuple itup); -extern Size _bt_dedup_finish_pending(Page newpage, BTDedupState state); +extern Size _bt_dedup_merge_finish_pending(Page newpage, BTDedupState state); extern IndexTuple _bt_form_posting(IndexTuple base, ItemPointer htids, int nhtids); extern void _bt_update_posting(BTVacuumPosting vacposting); @@ -1084,7 +1087,9 @@ extern void _bt_delitems_vacuum(Relation rel, Buffer buf, BTVacuumPosting *updatable, int nupdatable); extern void _bt_delitems_delete(Relation rel, Buffer buf, OffsetNumber *deletable, int ndeletable, - Relation heapRel); + BTVacuumPosting *updatable, int nupdatable, + Relation heapRel, bool isdedup, + TransactionId dedupLatestRemovedXid); extern uint32 _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact); diff --git a/src/include/access/nbtxlog.h b/src/include/access/nbtxlog.h index 5c014bdc66..1fd047bc66 100644 --- a/src/include/access/nbtxlog.h +++ b/src/include/access/nbtxlog.h @@ -187,12 +187,15 @@ typedef struct xl_btree_dedup typedef struct xl_btree_delete { TransactionId latestRemovedXid; - uint32 ndeleted; + uint16 ndeleted; + uint16 nupdated; /* DELETED TARGET OFFSET NUMBERS FOLLOW */ + /* UPDATED TARGET OFFSET NUMBERS FOLLOW */ + /* UPDATED TUPLES METADATA ARRAY FOLLOWS */ } xl_btree_delete; -#define SizeOfBtreeDelete (offsetof(xl_btree_delete, ndeleted) + sizeof(uint32)) +#define SizeOfBtreeDelete (offsetof(xl_btree_delete, nupdated) + sizeof(uint16)) /* * This is what we need to know about page reuse within btree. This record diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 387eb34a61..e90c6a55ff 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -128,6 +128,28 @@ typedef struct TM_FailureData bool traversed; } TM_FailureData; +/* + * State used by table_index_batch_check() to perform "bottom up" deletion of + * duplicate index tuples. + * + * We store this in two structs, even though it would be more natural to just + * have one struct (and one array). We really need to keep the TM_IndexDelete + * struct small (8 bytes) so that we can do an initial sort by TID as quickly + * as possible. + */ +typedef struct TM_IndexDelete +{ + ItemPointerData tid; /* table TID from index tuple */ + int16 id; /* Offset into TM_IndexDeleteStatus array */ +} TM_IndexDelete; + +typedef struct TM_IndexDeleteStatus +{ + OffsetNumber ioffnum; /* Index am identifies entries with this */ + bool ispromising; /* Contribute to order we visit table blocks? */ + bool isdead; /* Was tuple found dead? 
*/ +} TM_IndexDeleteStatus; + /* "options" flag bits for table_tuple_insert */ /* TABLE_INSERT_SKIP_WAL was 0x0001; RelationNeedsWAL() now governs */ #define TABLE_INSERT_SKIP_FSM 0x0002 @@ -396,7 +418,8 @@ typedef struct TableAmRoutine bool wait, TM_FailureData *tmfd, LockTupleMode *lockmode, - bool *update_indexes); + bool *update_indexes, + Bitmapset **modified_attrs_hint); /* see table_tuple_lock() for reference about parameters */ TM_Result (*tuple_lock) (Relation rel, @@ -1041,16 +1064,31 @@ table_index_fetch_tuple(struct IndexFetchTableData *scan, } /* - * This is a convenience wrapper around table_index_fetch_tuple() which - * returns whether there are table tuple items corresponding to an index - * entry. This likely is only useful to verify if there's a conflict in a - * unique index. + * These are convenience wrappers around table_index_fetch_tuple() which + * indicate whether there are table tuple items corresponding to an index + * entry. Can be used to verify if there's a conflict in a unique index. + * + * table_index_batch_check() is a variant that is specialized to garbage + * collection of dead tuples in index access methods. Duplicates are + * commonly caused by MVCC version churn when an optimization like + * heapam's HOT cannot be applied. It can make sense to opportunistically + * guess that many index tuples are dead versions, particularly in unique + * indexes. + * + * Note that table_index_batch_check() sorts the deltids array so that the + * order of access is optimized. Callers need to be able to deal with + * that. */ extern bool table_index_fetch_tuple_check(Relation rel, ItemPointer tid, Snapshot snapshot, bool *all_dead); +extern int table_index_batch_check(Relation rel, + TM_IndexDelete *deltids, + int ndeltids, + Snapshot snapshot, + int npromisingkillsneeded); /* ------------------------------------------------------------------------ * Functions for non-modifying operations on individual tuples @@ -1311,12 +1349,13 @@ static inline TM_Result table_tuple_update(Relation rel, ItemPointer otid, TupleTableSlot *slot, CommandId cid, Snapshot snapshot, Snapshot crosscheck, bool wait, TM_FailureData *tmfd, LockTupleMode *lockmode, - bool *update_indexes) + bool *update_indexes, Bitmapset **modified_attrs_hint) { return rel->rd_tableam->tuple_update(rel, otid, slot, cid, snapshot, crosscheck, wait, tmfd, - lockmode, update_indexes); + lockmode, update_indexes, + modified_attrs_hint); } /* diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h index 0c48d2a519..6e6e56583b 100644 --- a/src/include/executor/executor.h +++ b/src/include/executor/executor.h @@ -582,7 +582,8 @@ extern void ExecCloseIndices(ResultRelInfo *resultRelInfo); extern List *ExecInsertIndexTuples(ResultRelInfo *resultRelInfo, TupleTableSlot *slot, EState *estate, bool noDupErr, - bool *specConflict, List *arbiterIndexes); + bool *specConflict, List *arbiterIndexes, + Bitmapset *modified_attrs_hint); extern bool ExecCheckIndexConstraints(ResultRelInfo *resultRelInfo, TupleTableSlot *slot, EState *estate, ItemPointer conflictTid, diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 1585861a02..11041d09ad 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -70,6 +70,12 @@ #include "utils/snapmgr.h" #include "utils/spccache.h" +typedef struct IndexDeleteCounts +{ + BlockNumber block; + int16 npromisingtids; + int16 ntids; +} IndexDeleteCounts; static HeapTuple heap_prepare_insert(Relation relation, 
HeapTuple tup, TransactionId xid, CommandId cid, int options); @@ -102,6 +108,13 @@ static void MultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 in int *remaining); static bool ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask, Relation rel, int *remaining); +static int heap_index_batch_check_block_sort(TM_IndexDelete *deltids, + TM_IndexDeleteStatus *statusdeltids, + int ndeltids); +static inline int indexdelete_tids_cmp(TM_IndexDelete *indexdelete1, + TM_IndexDelete *indexdelete2); +static inline int indexdeletecount_ntids_cmp(IndexDeleteCounts *count1, + IndexDeleteCounts *count2); static XLogRecPtr log_heap_new_cid(Relation relation, HeapTuple tup); static HeapTuple ExtractReplicaIdentity(Relation rel, HeapTuple tup, bool key_changed, bool *copy); @@ -2892,7 +2905,8 @@ simple_heap_delete(Relation relation, ItemPointer tid) TM_Result heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, CommandId cid, Snapshot crosscheck, bool wait, - TM_FailureData *tmfd, LockTupleMode *lockmode) + TM_FailureData *tmfd, LockTupleMode *lockmode, + Bitmapset **modified_attrs_hint) { TM_Result result; TransactionId xid = GetCurrentTransactionId(); @@ -3758,10 +3772,15 @@ l2: if (old_key_tuple != NULL && old_key_copied) heap_freetuple(old_key_tuple); + /* Save for no logical changes hint when non-HOT update performed */ + if (!use_hot_update && modified_attrs_hint) + *modified_attrs_hint = modified_attrs; + else + bms_free(modified_attrs); + bms_free(hot_attrs); bms_free(key_attrs); bms_free(id_attrs); - bms_free(modified_attrs); bms_free(interesting_attrs); return TM_Ok; @@ -3891,7 +3910,7 @@ simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup) result = heap_update(relation, otid, tup, GetCurrentCommandId(true), InvalidSnapshot, true /* wait for commit */ , - &tmfd, &lockmode); + &tmfd, &lockmode, NULL); switch (result) { case TM_SelfModified: @@ -6987,6 +7006,9 @@ xid_horizon_prefetch_buffer(Relation rel, * deleting hundreds of tuples from a single index block. To amortize that * cost to some degree, this uses prefetching and combines repeat accesses to * the same block. + * + * Note: The logic for maintaining latestRemovedXid here is duplicated by code + * within heap_index_batch_check(). Make sure that they stay in sync. */ TransactionId heap_compute_xid_horizon_for_tuples(Relation rel, @@ -7086,7 +7108,6 @@ heap_compute_xid_horizon_for_tuples(Relation rel, { hoffnum = ItemIdGetRedirect(hitemid); hitemid = PageGetItemId(hpage, hoffnum); - CHECK_FOR_INTERRUPTS(); } /* @@ -7134,6 +7155,383 @@ heap_compute_xid_horizon_for_tuples(Relation rel, return latestRemovedXid; } +/* + * Help with "bottom up" deletion of duplicate index tuples from index AMs. + * This is particularly likely to work well with unique indexes. + * + * This routine sorts the deltids array, but does not modify any individual + * entry except to mark it as dead for caller. *ntiddeletes is set to the + * number of items marked dead in caller's deltids array. *finaldeltids is + * set to the size of the subset of the final sorted deltids array that + * contains all entries marked dead for caller (the subset begins at the + * start of deltids and ends after *finaldeltids array elements). Caller + * will only need to consider this interesting subset of deltids, but note + * that only some of the subset's elements are actually marked dead/safe to + * delete in index. 
+ * + * Returns the latestRemovedXid from the heap pages pointed at by the deltids + * index tuples that caller will delete. Caller deletes all deltids related + * index tuples that get marked dead here, and uses latestRemovedXid to + * generate a recovery conflict (if and when a conflict is required). + * + * Note: The logic for maintaining latestRemovedXid here is duplicated by code + * within heap_compute_xid_horizon_for_tuples(). Make sure that they stay in + * sync. + */ +TransactionId +heap_index_batch_check(Relation rel, TM_IndexDelete *deltids, + TM_IndexDeleteStatus *statusdeltids, int ndeltids, + Snapshot snapshot, int minscore, bool uniqueindex, + int *ntiddeletes, int *finalndeltids) +{ + TransactionId latestRemovedXid = InvalidTransactionId; + BlockNumber hblkno = InvalidBlockNumber; + Buffer buf = InvalidBuffer; + Page hpage; + bool finalhpage = false; + int nblocksaccessed = 0; + int score = 0; + + *ntiddeletes = 0; + *finalndeltids = 0; + + /* + * Sort and shrink deltids array so that it consists only of TIDs from 3 + * most promising blocks. We expect the most promising block first (and + * third most promising block third/last). + */ + ndeltids = heap_index_batch_check_block_sort(deltids, statusdeltids, + ndeltids); + for (int i = 0; i < ndeltids; i++) + { + TM_IndexDeleteStatus *status = statusdeltids + deltids[i].id; + ItemPointer htid = &deltids[i].tid; + ItemPointerData tmp; + bool all_dead = false; + bool found; + ItemId hitemid; + OffsetNumber hoffnum; + HeapTupleData heapTuple; + + Assert(!status->isdead); + + if (hblkno == InvalidBlockNumber || + ItemPointerGetBlockNumber(htid) != hblkno) + { + /* + * We usually do a little extra work on the final heap page after + * caller's minimum required score has been reached. The cost of + * accessing the final heap page we'll need to visit has already + * been paid by that point. We finish off the entire final heap + * page because it's cheap to do so. + * + * We don't want to unnecessarily visit the next page in line. + * Handle that here (when we just finished final page). + */ + if (finalhpage) + break; + + /* + * Each time we're about to access a new page we consider if it's + * really worth it. We apply various tests before we visit the + * next page, and give up if any test fails. These tests are: + * + * 1. Give up when we'd otherwise access the second heap page in + * line and have nothing to show for accessing the first/"most + * promising" page. But only with a unique index, where not + * finding items that are safe to delete in the first heap page + * accessed is considered a particularly discouraging sign. + * + * 2. Give up when we'd otherwise access the third heap page in + * line and have nothing to show for it in the case of non-unique + * indexes. The policy is more lax here because there is a small + * chance that we'll get unlucky on the first heap page when + * caller cannot easily distinguish between version churn and + * multiple logical rows. But two heap pages accessed at the same + * time with nothing to delete from either is close to a sure sign + * that accessing a third heap page would be wasted effort. + * + * 3. Give up before accessing a fourth page, no matter what. + * (This last test is defensive, since the deltids array was + * shrunk, and now only contains TIDs from the three most + * promising blocks.) 
+ */ + if (uniqueindex && nblocksaccessed == 1 && score == 0) + break; + if (!uniqueindex && nblocksaccessed == 2 && score == 0) + break; + if (nblocksaccessed == 3) + break; + + /* Now access next page */ + if (BufferIsValid(buf)) + { + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + ReleaseBuffer(buf); + } + + /* + * We could prune the heap page in passing here, but that doesn't + * seem like a good idea. Caller holds locks of its own. + */ + hblkno = ItemPointerGetBlockNumber(htid); + buf = ReadBuffer(rel, hblkno); + hpage = BufferGetPage(buf); + nblocksaccessed++; + + /* Need to lock buffer for visibility checks */ + LockBuffer(buf, BUFFER_LOCK_SHARE); + } + + tmp = *htid; + found = heap_hot_search_buffer(&tmp, rel, buf, snapshot, &heapTuple, + &all_dead, true); + + if (found || !all_dead) + continue; + + /* Caller can delete this TID from index */ + *finalndeltids = i + 1; + (*ntiddeletes)++; + status->isdead = true; + if (status->ispromising) + score += 2; + else + score++; + + if (score >= minscore) + { + /* + * Caller's score requirement has now been met. Finish off the + * current/final heap page before finishing. + */ + finalhpage = true; + } + + /* + * One last step required for TID that caller will delete. Must + * maintain latestRemovedXid for caller's delete operation. + */ + hoffnum = ItemPointerGetOffsetNumber(htid); + hitemid = PageGetItemId(hpage, hoffnum); + + while (ItemIdIsRedirected(hitemid)) + { + hoffnum = ItemIdGetRedirect(hitemid); + hitemid = PageGetItemId(hpage, hoffnum); + } + + /* + * If the heap item has storage, then read the header and use that to + * set latestRemovedXid. + * + * Some LP_DEAD items may not be accessible, so we ignore them. + */ + if (ItemIdHasStorage(hitemid)) + { + HeapTupleHeader htuphdr; + + htuphdr = (HeapTupleHeader) PageGetItem(hpage, hitemid); + + HeapTupleHeaderAdvanceLatestRemovedXid(htuphdr, &latestRemovedXid); + } + else if (ItemIdIsDead(hitemid)) + { + /* + * Conjecture: if hitemid is dead then it had xids before the xids + * marked on LP_NORMAL items. So we just ignore this item and move + * onto the next, for the purposes of calculating + * latestRemovedXid. + */ + } + else + Assert(!ItemIdIsUsed(hitemid)); + } + + if (BufferIsValid(buf)) + { + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + ReleaseBuffer(buf); + } + + /* + * If all heap tuples were LP_DEAD then we will be returning + * InvalidTransactionId here, which avoids conflicts. This matches + * existing logic which assumes that LP_DEAD tuples must already be older + * than the latestRemovedXid on the cleanup record that set them as + * LP_DEAD, hence must already have generated a conflict. + */ + + return latestRemovedXid; +} + +#define ST_SORT qsort_deltids_by_tid +#define ST_ELEMENT_TYPE TM_IndexDelete +#define ST_COMPARE(a, b) (indexdelete_tids_cmp(a, b)) +#define ST_SCOPE static +#define ST_DEFINE +#include "lib/sort_template.h" + +#define ST_SORT qsort_blockgroup +#define ST_ELEMENT_TYPE IndexDeleteCounts +#define ST_COMPARE(a, b) (indexdeletecount_ntids_cmp(a, b)) +#define ST_SCOPE static +#define ST_DEFINE +#include "lib/sort_template.h" + +/* + * heap_index_batch_check() helper function. Sorts deltids array in the + * order needed for useful processing. + * + * Groups heap TIDs from deltids into heap block number groupings. From + * there, sorts each heap block grouping by the total number of "promising" + * TIDs it contains (in desc order). For blocks with the same number of + * promising TIDs, tiebreak on the total heap TID count (also desc order). 
+ * + * heap_index_batch_check() only visits 1 - 3 heap blocks due to the + * speculative nature of the batch index deletion optimization. These heap + * blocks had better be the most promising available. + * + * Returns new size of deltids array (ndeltids). deltids will only have TIDs + * from the 3 most promising heap blocks when we return (which is usually far + * fewer). + */ +static int +heap_index_batch_check_block_sort(TM_IndexDelete *deltids, + TM_IndexDeleteStatus *statusdeltids, + int ndeltids) +{ + IndexDeleteCounts *blockcounts; + TM_IndexDelete *reordereddeltids; + BlockNumber curblock = InvalidBlockNumber; + int nblockgroups = 0; + int ncopied = 0; + + Assert(ndeltids > 0); + + /* First sort caller's array by TID */ + qsort_deltids_by_tid(deltids, ndeltids); + + /* Calculate per-heap-block count of TIDs */ + blockcounts = palloc(sizeof(IndexDeleteCounts) * ndeltids); + for (int i = 0; i < ndeltids; i++) + { + ItemPointer deltid = &deltids[i].tid; + TM_IndexDeleteStatus *status = statusdeltids + deltids[i].id; + bool ispromising = status->ispromising; + + if (curblock != ItemPointerGetBlockNumber(deltid)) + { + /* New block group */ + nblockgroups++; + + curblock = ItemPointerGetBlockNumber(deltid); + blockcounts[nblockgroups - 1].block = curblock; + blockcounts[nblockgroups - 1].ntids = 1; + blockcounts[nblockgroups - 1].npromisingtids = 0; + } + else + { + blockcounts[nblockgroups - 1].ntids++; + } + + if (ispromising) + blockcounts[nblockgroups - 1].npromisingtids++; + } + + qsort_blockgroup(blockcounts, nblockgroups); + reordereddeltids = palloc(ndeltids * sizeof(TM_IndexDelete)); + /* Caller only visits 3 blocks at most, so just copy that many groups */ + nblockgroups = Min(3, nblockgroups); + for (int i = 0; i < nblockgroups; i++) + { + IndexDeleteCounts *blockgroup = blockcounts + i; + + for (int j = 0; j < ndeltids; j++) + { + ItemPointer tid = &deltids[j].tid; + + if (blockgroup->block == ItemPointerGetBlockNumber(tid)) + { + memcpy(reordereddeltids + ncopied, deltids + j, + sizeof(TM_IndexDelete) * blockgroup->ntids); + ncopied += blockgroup->ntids; + break; /* Move on to next heap block group */ + } + } + } + + /* Copy back final sorted array into caller's array */ + memcpy(deltids, reordereddeltids, sizeof(TM_IndexDelete) * ncopied); + + /* be tidy */ + pfree(reordereddeltids); + pfree(blockcounts); + + return ncopied; +} + +/* + * qsort-style comparator used in heap_index_batch_check_block_sort() + */ +static inline int +indexdelete_tids_cmp(TM_IndexDelete *indexdelete1, + TM_IndexDelete *indexdelete2) +{ + ItemPointer tid1 = &indexdelete1->tid; + ItemPointer tid2 = &indexdelete2->tid; + + { + BlockNumber blk1 = ItemPointerGetBlockNumber(tid1); + BlockNumber blk2 = ItemPointerGetBlockNumber(tid2); + + if (blk1 != blk2) + return (blk1 < blk2) ? -1 : 1; + } + { + OffsetNumber pos1 = ItemPointerGetOffsetNumber(tid1); + OffsetNumber pos2 = ItemPointerGetOffsetNumber(tid2); + + if (pos1 != pos2) + return (pos1 < pos2) ? 
-1 : 1; + } + + /* ItemPointer values should never be equal */ + Assert(false); + + return 0; +} + +/* + * qsort-style comparator used in heap_index_batch_check_block_sort() + */ +static inline int +indexdeletecount_ntids_cmp(IndexDeleteCounts *count1, + IndexDeleteCounts *count2) +{ + /* Invert usual order here (desc npromisingtids sort order) */ + if (count1->npromisingtids > count2->npromisingtids) + return -1; + if (count1->npromisingtids < count2->npromisingtids) + return 1; + + /* Tiebreak: desc ntids sort order */ + if (count1->ntids > count2->ntids) + return -1; + if (count1->ntids < count2->ntids) + return 1; + + /* Tiebreak: block number (asc order) */ + if (count1->block > count2->block) + return 1; + if (count1->block < count2->block) + return -1; + + Assert(false); + + return 0; +} + /* * Perform XLogInsert to register a heap cleanup info message. These * messages are sent once per VACUUM and are required because diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index dcaea7135f..f32ed0a5f2 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -314,7 +314,8 @@ static TM_Result heapam_tuple_update(Relation relation, ItemPointer otid, TupleTableSlot *slot, CommandId cid, Snapshot snapshot, Snapshot crosscheck, bool wait, TM_FailureData *tmfd, - LockTupleMode *lockmode, bool *update_indexes) + LockTupleMode *lockmode, bool *update_indexes, + Bitmapset **modified_attrs_hint) { bool shouldFree = true; HeapTuple tuple = ExecFetchSlotHeapTuple(slot, true, &shouldFree); @@ -325,7 +326,7 @@ heapam_tuple_update(Relation relation, ItemPointer otid, TupleTableSlot *slot, tuple->t_tableOid = slot->tts_tableOid; result = heap_update(relation, otid, tuple, cid, crosscheck, wait, - tmfd, lockmode); + tmfd, lockmode, modified_attrs_hint); ItemPointerCopy(&tuple->t_self, &slot->tts_tid); /* diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README index 9692e4cdf6..3ae4551f84 100644 --- a/src/backend/access/nbtree/README +++ b/src/backend/access/nbtree/README @@ -807,7 +807,8 @@ Deduplication in unique indexes helps to prevent these pathological page splits. Storing duplicates in a space efficient manner is not the goal, since in the long run there won't be any duplicates anyway. Rather, we're buying time for standard garbage collection mechanisms to run before a -page split is needed. +page split is needed. (The same principle is behind delete deduplication, +which also targets version churn.) Unique index leaf pages only get a deduplication pass when an insertion (that might have to split the page) observed an existing duplicate on the @@ -874,6 +875,36 @@ that need a page split anyway. Besides, supporting variable "split points" while splitting posting lists won't actually improve overall space utilization. +Delete deduplication +-------------------- + +The deduplication module usually opportunistically deletes whatever +duplicates happen to be present on the page before moving on to +deduplication proper, since in general some duplicates are likely to +already be dead to everybody. This happens before regular merge +deduplication, but only when we receive a hint that optimizations like +heapam's HOT have not worked out -- the incoming tuple must be a logically +unchanged duplicate which is needed for MVCC purposes. 
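As a rough illustration of where that executor hint could come from (a sketch only, not code taken from the patch: the helper name index_unchanged_hint() and its exact call site are assumptions), a non-HOT heap_update() hands back the set of modified columns, and the executor could compare it against each index's columns before choosing the IndexUniqueCheck value for a non-unique index:

/*
 * Hypothetical helper: decide whether an index insert that follows a
 * non-HOT UPDATE can be flagged as a "no logical change" duplicate.
 * Assumes modified_attrs_hint uses attribute numbers offset by
 * FirstLowInvalidHeapAttributeNumber, the way heap_update() builds its
 * modified-columns set.
 */
static IndexUniqueCheck
index_unchanged_hint(IndexInfo *indexInfo, Bitmapset *modified_attrs_hint)
{
	if (modified_attrs_hint == NULL)
		return UNIQUE_CHECK_NO; /* no hint available */

	for (int i = 0; i < indexInfo->ii_NumIndexAttrs; i++)
	{
		AttrNumber	keycol = indexInfo->ii_IndexAttrNumbers[i];

		/* Expression columns are not tracked here; assume a change */
		if (keycol == InvalidAttrNumber)
			return UNIQUE_CHECK_NO;

		if (bms_is_member(keycol - FirstLowInvalidHeapAttributeNumber,
						  modified_attrs_hint))
			return UNIQUE_CHECK_NO; /* an indexed column was modified */
	}

	/* A successor version is needed, but there is no logical change */
	return UNIQUE_CHECK_NO_WITH_UNCHANGED;
}

Only the UNIQUE_CHECK_NO path matters for the hint; unique indexes don't need it, since _bt_findinsertloc() already treats a detected duplicate in a unique index as reason enough to attempt a delete deduplication pass.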
+ +This mechanism is quite similar to on-the-fly deletion of index tuples +(that will already have failed to prevent a page split by the time delete +deduplication is considered). The main difference is that the tuples that +get deleted are not opportunistically marked LP_DEAD by transactions that +had to read the tuples in any case. Rather, we infer that duplicates are +likely to be dead tuples based on heuristics (starting with the hint from +the executor), and look for visibility information about the likely-dead +tuples in the hopes that that inference will work out. There is some risk +that this won't work out, but the upside of avoiding version driven page +splits is so large that it is worth it. + +Negative feedback (such as failing to dedup-delete any tuples) is not +really undesirable. At worst it is an unavoidable part of how the +algorithm works. We require that our various approaches to handling an +overflowing page (due partially or entirely to version churn) compete to +determine how best to handle the problem in a localized fashion. We +expect to converge on a stable and roughly optimal behavior at each part +of the key space in each index affected by version churn. + Notes About Data Representation ------------------------------- diff --git a/src/backend/access/nbtree/nbtdedup.c b/src/backend/access/nbtree/nbtdedup.c index f6be865b17..e9e30a306e 100644 --- a/src/backend/access/nbtree/nbtdedup.c +++ b/src/backend/access/nbtree/nbtdedup.c @@ -15,14 +15,27 @@ #include "postgres.h" #include "access/nbtree.h" +#include "access/heapam.h" #include "access/nbtxlog.h" +#include "access/tableam.h" #include "miscadmin.h" #include "utils/rel.h" -static bool _bt_do_singleval(Relation rel, Page page, BTDedupState state, - OffsetNumber minoff, IndexTuple newitem); +static bool _bt_dedup_delete_pass(Relation rel, Buffer buf, Relation heapRel, + Size newitemsz, bool checkingunique, + bool *merge); +static void _bt_dedup_merge_pass(Relation rel, Buffer buf, Relation heapRel, + IndexTuple newitem, Size newitemsz, + bool checkingunique, bool singleval); +static void _bt_dedup_delete_finish_pending(BTDedupState state, + TM_IndexDelete *deltids, + TM_IndexDeleteStatus *statusdeltids, + int *ndeltids); +static bool _bt_do_singleval(Relation rel, Page page, OffsetNumber minoff, + IndexTuple newitem); static void _bt_singleval_fillfactor(Page page, BTDedupState state, Size newitemsz); +static int _bt_indexdelete_cmp(const void *a, const void *b, void *arg); #ifdef USE_ASSERT_CHECKING static bool _bt_posting_valid(IndexTuple posting); #endif @@ -32,16 +45,12 @@ static bool _bt_posting_valid(IndexTuple posting); * if we cannot successfully free at least newitemsz (we also need space for * newitem's line pointer, which isn't included in caller's newitemsz). * - * The general approach taken here is to perform as much deduplication as - * possible to free as much space as possible. Note, however, that "single - * value" strategy is sometimes used for !checkingunique callers, in which - * case deduplication will leave a few tuples untouched at the end of the - * page. The general idea is to prepare the page for an anticipated page - * split that uses nbtsplitloc.c's "single value" strategy to determine a - * split point. (There is no reason to deduplicate items that will end up on - * the right half of the page after the anticipated page split; better to - * handle those if and when the anticipated right half page gets its own - * deduplication pass, following further inserts of duplicates.) 
+ * There are two types of deduplication pass: The merge deduplication pass, + * where we merge together duplicate index tuples into a new posting list, and + * the delete deduplication pass, where old garbage version index tuples are + * deleted based on visibility information that we fetch from the table. We + * generally expect to perform only one type of deduplication pass per call + * here, but it's possible that we'll end up doing both. * * This function should be called during insertion, when the page doesn't have * enough space to fit an incoming newitem. If the BTP_HAS_GARBAGE page flag @@ -54,27 +63,24 @@ static bool _bt_posting_valid(IndexTuple posting); */ void _bt_dedup_one_page(Relation rel, Buffer buf, Relation heapRel, - IndexTuple newitem, Size newitemsz, bool checkingunique) + IndexTuple newitem, Size newitemsz, bool checkingunique, + bool dedupdelete, bool allequalimage) { OffsetNumber offnum, minoff, maxoff; Page page = BufferGetPage(buf); BTPageOpaque opaque; - Page newpage; OffsetNumber deletable[MaxIndexTuplesPerPage]; - BTDedupState state; int ndeletable = 0; - Size pagesaving = 0; - bool singlevalstrat = false; - int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); + bool singleval; /* * We can't assume that there are no LP_DEAD items. For one thing, VACUUM * will clear the BTP_HAS_GARBAGE hint without reliably removing items * that are marked LP_DEAD. We don't want to unnecessarily unset LP_DEAD - * bits when deduplicating items. Allowing it would be correct, though - * wasteful. + * bits when deduplicating items by merging. Allowing it would be + * correct, though wasteful. */ opaque = (BTPageOpaque) PageGetSpecialPointer(page); minoff = P_FIRSTDATAKEY(opaque); @@ -91,7 +97,9 @@ _bt_dedup_one_page(Relation rel, Buffer buf, Relation heapRel, if (ndeletable > 0) { - _bt_delitems_delete(rel, buf, deletable, ndeletable, heapRel); + _bt_delitems_delete(rel, buf, deletable, ndeletable, + NULL, 0, + heapRel, false, InvalidTransactionId); /* * Return when a split will be avoided. This is equivalent to @@ -100,17 +108,516 @@ _bt_dedup_one_page(Relation rel, Buffer buf, Relation heapRel, if (PageGetFreeSpace(page) >= newitemsz) return; - /* - * Reconsider number of items on page, in case _bt_delitems_delete() - * managed to delete an item or two - */ - minoff = P_FIRSTDATAKEY(opaque); - maxoff = PageGetMaxOffsetNumber(page); + maxoff = InvalidOffsetNumber; /* Invalidate */ } /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */ newitemsz += sizeof(ItemIdData); + /* Determine if page is all one value */ + singleval = _bt_do_singleval(rel, page, minoff, newitem); + + /* + * Perform delete deduplication pass when caller asked for it explicitly, + * or for a unique index. + * + * Don't delete dedup when page contains only one value, unless it's from + * a unique index where we always try to delete. We only get called about + * checkingunique cases when page is known to have at least one or two + * non-NULL duplicates. Clearly duplicates in a unique index are bound to + * become dead-to-all before long, so we should always try to delete them. + */ + if ((dedupdelete && !singleval) || checkingunique) + { + bool merge = true; + + if (_bt_dedup_delete_pass(rel, buf, heapRel, newitemsz, + checkingunique, &merge)) + return; + + /* + * _bt_dedup_delete_pass() may occasionally indicate no duplicates, in + * which case we should give up now + */ + if (!merge) + return; + + /* Fall back on merge deduplication. This happens infrequently. 
*/ + } + + /* + * Perform merge deduplication pass, though only when it is safe to do so. + * Index must be an allequalimage index -- otherwise it's not safe. + */ + if (allequalimage) + _bt_dedup_merge_pass(rel, buf, heapRel, newitem, newitemsz, + checkingunique, singleval); +} + +/* + * Perform a delete deduplication pass. + * + * See if duplicate index tuples are eligible to be deleted, even though they + * don't have their LP_DEAD bit set already. Give up if we have to access + * more than a few heap pages before we can free enough space to fit newitem. + * + * Note: Caller should have already deleted all existing items with their + * LP_DEAD bits set. + */ +static bool +_bt_dedup_delete_pass(Relation rel, Buffer buf, Relation heapRel, + Size newitemsz, bool checkingunique, bool *merge) +{ + OffsetNumber offnum, + minoff, + maxoff, + skippostinglistoffnum = InvalidOffsetNumber; + Page page = BufferGetPage(buf); + BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page); + BTDedupState state; + TM_IndexDelete *deltids; + TM_IndexDeleteStatus *statusdeltids; + int ndeltids, + npromisingdeltids, + ntiddeletes, + ndeletable, + nupdatable; + int finalndeltids; + TransactionId dedupLatestRemovedXid = InvalidTransactionId; + OffsetNumber deletable[MaxIndexTuplesPerPage]; + BTVacuumPosting updatable[MaxIndexTuplesPerPage]; + int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); + + state = (BTDedupState) palloc(sizeof(BTDedupStateData)); + state->deduplicate = true; + state->nmaxitems = 0; + /* Final "posting list" size should not restrict anything */ + state->maxpostingsize = BLCKSZ; + state->base = NULL; + state->baseoff = InvalidOffsetNumber; + state->basetupsize = 0; + state->htids = palloc(state->maxpostingsize); + state->nhtids = 0; + state->nitems = 0; + state->phystupsize = 0; + state->nintervals = 0; + + /* + * deltids and statusdeltids are allocated separately as a performance + * optimization. You can think of them as one array artificially split in + * two for performance reasons. + */ + ndeltids = 0; + npromisingdeltids = 0; + deltids = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete)); + statusdeltids = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDeleteStatus)); + + minoff = P_FIRSTDATAKEY(opaque); + maxoff = PageGetMaxOffsetNumber(page); + for (offnum = minoff; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemid = PageGetItemId(page, offnum); + IndexTuple itup = (IndexTuple) PageGetItem(page, itemid); + + Assert(!ItemIdIsDead(itemid)); + + if (offnum == minoff) + { + _bt_dedup_start_pending(state, itup, offnum); + } + else if (_bt_keep_natts_fast(rel, state->base, itup) > nkeyatts && + _bt_dedup_save_htid(state, itup)) + { + + } + else + { + /* Handle interval, saving TIDs when "non-promising" */ + _bt_dedup_delete_finish_pending(state, deltids, statusdeltids, + &ndeltids); + + /* itup starts new pending interval */ + _bt_dedup_start_pending(state, itup, offnum); + } + } + /* Handle the final interval, saving TIDs when "non-promising" */ + _bt_dedup_delete_finish_pending(state, deltids, statusdeltids, &ndeltids); + + if (state->nintervals == 0) + { + /* No duplicates/promising tuples, don't bother trying */ + pfree(state->htids); + pfree(state); + pfree(deltids); + pfree(statusdeltids); + /* Caller should avoid merge deduplication pass */ + *merge = false; + return false; + } + + /* + * deltids array now contains non-duplicate tuples, all of which are + * marked non-promising. 
+ * + * Add known duplicates to array now by extracting them from the dedup + * intervals we just formed. Tuples are marked promising so that the + * tableam infrastructure can focus its efforts there (actually we don't + * do that for posting list TIDs). See comment block below for a full + * explanation of promising tuples. + */ + for (int i = 0; i < state->nintervals; i++) + { + BTDedupInterval interval = state->intervals[i]; + + for (int j = 0; j < interval.nitems; j++) + { + OffsetNumber dupoffnum = interval.baseoff + j; + ItemId itemid = PageGetItemId(page, dupoffnum); + IndexTuple itup = (IndexTuple) PageGetItem(page, itemid); + + if (!BTreeTupleIsPosting(itup)) + { + /* + * Plain non-pivot tuple duplicate -- TID is promising + */ + deltids[ndeltids].tid = itup->t_tid; + deltids[ndeltids].id = ndeltids; + statusdeltids[ndeltids].ioffnum = dupoffnum; + statusdeltids[ndeltids].ispromising = true; + statusdeltids[ndeltids].isdead = false; /* for now */ + ndeltids++; + npromisingdeltids++; + } + else + { + /* + * Posting list tuple duplicate -- TIDs are not promising, but + * tableam might manage to delete them in passing + */ + int nitem = BTreeTupleGetNPosting(itup); + + Assert(_bt_posting_valid(itup)); + + for (int p = 0; p < nitem; p++) + { + ItemPointer htid = BTreeTupleGetPostingN(itup, p); + + deltids[ndeltids].tid = *htid; + deltids[ndeltids].id = ndeltids; + statusdeltids[ndeltids].ioffnum = dupoffnum; + statusdeltids[ndeltids].ispromising = false; + statusdeltids[ndeltids].isdead = false; /* for now */ + ndeltids++; + } + } + } + } + + /* Done with dedup state */ + pfree(state->htids); + pfree(state); + + /* + * Determine which TIDs are dead in deltids array (if we have any + * duplicates at all, since we only really expect to find dead tuples + * among duplicates). + * + * Each failure to delete a duplicate/promising tuple here is a kind of + * learning experience. It results in caller falling back on splitting + * the page (or on a merge deduplication pass), discouraging future calls + * back here for the same key space range covered by a failed page (or at + * least discouraging processing the original duplicates in case where + * caller falls back on a successful merge deduplication pass). We + * converge on the most effective strategy for each page in the index over + * time, through trial and error. We accept well understood small + * negative outcomes (effort on deleting that didn't immediately pay off) + * as the price of a potentially huge (though uncertain) upside (no + * unnecessary page splits). + * + * We don't mark posting list tuple TIDs as promising specifically so that + * cases where we have a failed delete deduplication pass followed by a + * successfully merge deduplication pass do not go on to waste time on the + * TIDs during any future delete deduplication passes. This is helpful + * when posting list tuples generally point to multiple distinct logical + * rows in the table. If it turns out that they really are all old + * versions of a single logical table row, we still have a pretty good + * chance of being able to delete them in a future dedup delete pass (not + * necessarily the next one for the leaf page). We could easily pick up + * the dead TIDs from such a posting list tuple when the tableam goes on + * to place even more similar version churn tuples on the same table + * block. 
We'll naturally target the newer version churn tuples directly, + * and then discover dead TIDs from posting list tuples that weren't + * dead/possible to delete earlier (i.e. back when we "were under the + * mistaken impression" that the posting list TIDs were for multiple + * logical table rows). + */ + Assert(ndeltids > 0); + ntiddeletes = 0; + if (npromisingdeltids > 0) + { + SnapshotData SnapshotNonVacuumable; + + InitNonVacuumableSnapshot(SnapshotNonVacuumable, + GlobalVisTestFor(heapRel)); + + /* + * We require a score of 25 from the tableam. A promising TID is + * worth 2, while a non-promising TID is worth only 1. + * + * You might wonder why we don't tell the tableam more about our + * preferences (for example, we don't register a callback that figures + * out when tableam has found enough dead TIDs to allow us to free + * just enough leaf page space to avoid a page split -- even though + * that interface is quite feasible). Our precise preferences are + * unimportant because the high level design of delete deduplication + * is highly opportunistic. This generally works out because of the + * asymmetry. The cost of failing to delete even one tuple once per + * page is drastically lower than the potential upside of never having + * to split a page affected by version churn. + * + * We're not trying to prevent just any kind of page split. We're + * trying to delay _unnecessary_ version churn page splits + * indefinitely, which is the same thing as preventing them altogether + * if you look at the situation on a long enough time line (think + * hours, days, or even months). + * + * The tableam will regularly give us a final score which is much + * higher than what we ask for (a score of 25). It often happens to + * be convenient for the tableam to process extra TIDs. Our target + * score is really just a way of balancing costs over time for pages + * that are more or less constantly at risk of unnecessary page + * splits, but are not expected to split for any other reason. (If + * there are even a few inserts of new logical row TIDs on the page + * then a page split is practically inevitable, and will probably + * happen very quickly in practice. We don't worry about these "in + * between" cases because they're too unstable to last and so too rare + * to matter.) + */ + dedupLatestRemovedXid = + heap_index_batch_check(heapRel, deltids, statusdeltids, ndeltids, + &SnapshotNonVacuumable, 25, checkingunique, + &ntiddeletes, &finalndeltids); + + /* + * TODO: Obviously we should be going through a new tableam shim + * function rather than calling into heapam directly like this. We + * don't bother with that for now because the interface is still + * unsettled. See also: WIP table_index_batch_check() shim. + */ + } + + if (ntiddeletes == 0) + { + pfree(deltids); + pfree(statusdeltids); + return false; + } + + /* + * We have at least one dead TID to delete. All that remains is to + * construct a leaf-page-wise description of what to delete that can be + * used by _bt_delitems_delete(). + * + * Sort deltids in useful order, then process sorted array in loop (loop + * expects items in offset number order, or by TID among entries that have + * equal offset numbers -- which happens when there are posting list + * tuples that we want to delete some TIDs from). + * + * Note: We only process the subset of elements at the start of deltids + * that pertain to table blocks actually accessed by the tableam (the + * first finalndeltids elements). This approach saves a few cycles. 
+ */ + Assert(finalndeltids > 0); + qsort_arg(deltids, finalndeltids, sizeof(TM_IndexDelete), + _bt_indexdelete_cmp, statusdeltids); + ndeletable = 0; + nupdatable = 0; + for (int i = 0; i < finalndeltids; i++) + { + TM_IndexDeleteStatus *status = statusdeltids + deltids[i].id; + OffsetNumber savedoffnum = status->ioffnum; + ItemId itemid = PageGetItemId(page, savedoffnum); + IndexTuple itup; + int nitem; + BTVacuumPosting vacposting = NULL; + + itup = (IndexTuple) PageGetItem(page, itemid); + + /* + * If this is a posting list and we already reached the first dead + * tuple in the posting list, skip remaining items + */ + if (skippostinglistoffnum == savedoffnum) + { + Assert(BTreeTupleIsPosting(itup)); + Assert(!status->ispromising); + continue; + } + + if (!BTreeTupleIsPosting(itup)) + { + /* + * For a plain non-pivot tuple, simply being found marked dead + * means we can kill + */ + Assert(ItemPointerEquals(&itup->t_tid, &deltids[i].tid)); + if (status->isdead) + deletable[ndeletable++] = savedoffnum; + continue; + } + + /* + * For a posting list tuple we have to work a bit harder, since we may + * either delete or update (i.e. update to delete a subset of its + * TIDs). + * + * We'll be skipping over future TIDs from this same posting list in + * outer loop, since we want to process all TIDs in tuple together + * once in inner loop. Remember to do that at top of outer loop now. + */ + Assert(_bt_posting_valid(itup)); + skippostinglistoffnum = savedoffnum; + nitem = BTreeTupleGetNPosting(itup); + for (int j = 0; j < nitem; j++) + { + ItemPointer htid; + int cmp; + int tidi = i; + + htid = BTreeTupleGetPostingN(itup, j); + + for (;;) + { + cmp = ItemPointerCompare(htid, &deltids[tidi].tid); + if (cmp == 0) + break; + tidi++; + if (tidi >= finalndeltids || + (statusdeltids + deltids[tidi].id)->ioffnum != savedoffnum) + { + /* + * If this later tidi deltid doesn't even relate to same + * posting list index tuple from page, we're done with + * this TID from itup (posting list tuple). + */ + cmp = -1; + break; + } + } + + /* Final check for exact TID match */ + if (cmp != 0) + continue; + + /* Only interested in dead TIDs */ + if (!(statusdeltids + deltids[tidi].id)->isdead) + continue; + + if (vacposting == NULL) + { + /* + * First dead table TID encountered. + * + * Start maintaining metadata describing how to update + * existing posting list tuple. + */ + vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) + + nitem * sizeof(uint16)); + vacposting->itup = itup; + vacposting->updatedoffset = savedoffnum; + vacposting->ndeletedtids = 0; + } + vacposting->deletetids[vacposting->ndeletedtids++] = j; + } + + /* Decide what to do with TIDs in posting list tuple */ + + if (vacposting == NULL) + { + /* No TIDs to delete in posting list tuple */ + } + else if (vacposting->ndeletedtids == nitem) + { + /* Straight delete -- won't need update state */ + deletable[ndeletable++] = savedoffnum; + pfree(vacposting); + } + else + { + /* Going to delete some but not all TIDs in posting list */ + Assert(vacposting->ndeletedtids > 0 && + vacposting->ndeletedtids < nitem); + updatable[nupdatable++] = vacposting; + } + } + + /* Done with deltids state */ + pfree(deltids); + pfree(statusdeltids); + + /* + * Should not get this far if we don't have at least some TIDs to delete, + * but be paranoid + */ + if (ndeletable == 0 && nupdatable == 0) + { + Assert(false); + return false; + } + + /* + * Go through with deleting TIDs that we found are safe to delete. 
+ * + * No MarkBufferDirtyHint() call is needed here, since we don't ever mark + * line pointers LP_DEAD. Any and all modifications to the page are made + * in the critical section in _bt_delitems_delete(). + */ + _bt_delitems_delete(rel, buf, deletable, ndeletable, + updatable, nupdatable, + heapRel, true, dedupLatestRemovedXid); + + /* be tidy */ + for (int i = 0; i < nupdatable; i++) + pfree(updatable[i]); + + /* Return success when page split (or merge deduplication pass) avoided */ + return PageGetExactFreeSpace(page) >= newitemsz; +} + +/* + * Perform a merge deduplication pass. + * + * The general approach taken here is to perform as much deduplication as + * possible to free as much space as possible. Note, however, that "single + * value" strategy is used for singleval callers, in which case deduplication + * will leave a few tuples untouched at the end of the page. The general idea + * is to prepare the page for an anticipated page split that uses + * nbtsplitloc.c's "single value" strategy to determine a split point. (There + * is no reason to deduplicate items that will end up on the right half of the + * page after the anticipated page split; better to handle those if and when + * the anticipated right half page gets its own deduplication pass, following + * further inserts of duplicates.) + * + * Note: Caller should have already deleted all existing items with their + * LP_DEAD bits set. + */ +static void +_bt_dedup_merge_pass(Relation rel, Buffer buf, Relation heapRel, + IndexTuple newitem, Size newitemsz, bool checkingunique, + bool singleval) +{ + OffsetNumber offnum, + minoff, + maxoff; + Page page = BufferGetPage(buf); + BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page); + Page newpage; + BTDedupState state; + Size pagesaving = 0; + int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); + /* * By here, it's clear that deduplication will definitely be attempted. * Initialize deduplication state. @@ -138,9 +645,8 @@ _bt_dedup_one_page(Relation rel, Buffer buf, Relation heapRel, /* nintervals should be initialized to zero */ state->nintervals = 0; - /* Determine if "single value" strategy should be used */ - if (!checkingunique) - singlevalstrat = _bt_do_singleval(rel, page, state, minoff, newitem); + minoff = P_FIRSTDATAKEY(opaque); + maxoff = PageGetMaxOffsetNumber(page); /* * Deduplicate items from page, and write them to newpage. @@ -203,9 +709,9 @@ _bt_dedup_one_page(Relation rel, Buffer buf, Relation heapRel, * form new posting tuple, and actually update the page. Else * reset the state and move on without modifying the page. */ - pagesaving += _bt_dedup_finish_pending(newpage, state); + pagesaving += _bt_dedup_merge_finish_pending(newpage, state); - if (singlevalstrat) + if (singleval) { /* * Single value strategy's extra steps. @@ -225,7 +731,7 @@ _bt_dedup_one_page(Relation rel, Buffer buf, Relation heapRel, else if (state->nmaxitems == 6) { state->deduplicate = false; - singlevalstrat = false; /* won't be back here */ + singleval = false; /* won't be back here */ } } @@ -235,7 +741,7 @@ _bt_dedup_one_page(Relation rel, Buffer buf, Relation heapRel, } /* Handle the last item */ - pagesaving += _bt_dedup_finish_pending(newpage, state); + pagesaving += _bt_dedup_merge_finish_pending(newpage, state); /* * If no items suitable for deduplication were found, newpage must be @@ -263,6 +769,8 @@ _bt_dedup_one_page(Relation rel, Buffer buf, Relation heapRel, * still falsely set, just to keep things tidy. 
(We can't rely on * _bt_vacuum_one_page() having done this already, and we can't rely on a * page split or VACUUM getting to it in the near future.) + * + * Deliberately don't unset BTP_HAS_DUPS here. */ if (P_HAS_GARBAGE(opaque)) { @@ -317,8 +825,8 @@ _bt_dedup_one_page(Relation rel, Buffer buf, Relation heapRel, * Every tuple processed by deduplication either becomes the base tuple for a * posting list, or gets its heap TID(s) accepted into a pending posting list. * A tuple that starts out as the base tuple for a posting list will only - * actually be rewritten within _bt_dedup_finish_pending() when it turns out - * that there are duplicates that can be merged into the base tuple. + * actually be rewritten within _bt_dedup_merge_finish_pending() when it turns + * out that there are duplicates that can be merged into the base tuple. */ void _bt_dedup_start_pending(BTDedupState state, IndexTuple base, @@ -443,7 +951,7 @@ _bt_dedup_save_htid(BTDedupState state, IndexTuple itup) * where no deduplication was possible. */ Size -_bt_dedup_finish_pending(Page newpage, BTDedupState state) +_bt_dedup_merge_finish_pending(Page newpage, BTDedupState state) { OffsetNumber tupoff; Size tuplesz; @@ -496,10 +1004,76 @@ _bt_dedup_finish_pending(Page newpage, BTDedupState state) return spacesaving; } +/* + * Stripped down version of _bt_dedup_merge_finish_pending() used by + * _bt_dedup_delete_pass(). + * + * Finalize interval of duplicates (duplicate group) without materializing the + * would-be posting list tuple. We store all TIDs on the leaf page in the + * array, but only TIDs that we determine are duplicates are marked as + * promising. (Non-promising TIDs only get considered in passing, when they + * happen to be on the same table am page as promising TIDs.) + */ +static void +_bt_dedup_delete_finish_pending(BTDedupState state, TM_IndexDelete *deltids, + TM_IndexDeleteStatus *statusdeltids, int *ndeltids) +{ + Assert(state->nitems > 0); + Assert(state->nitems <= state->nhtids); + Assert(state->intervals[state->nintervals].baseoff == state->baseoff); + + if (state->nitems == 1) + { + /* Remember non-duplicate's TID, but mark it not promising */ + OffsetNumber offnum = state->baseoff; + IndexTuple itup = state->base; + + if (!BTreeTupleIsPosting(itup)) + { + deltids[*ndeltids].tid = itup->t_tid; + deltids[*ndeltids].id = *ndeltids; + statusdeltids[*ndeltids].ioffnum = offnum; + statusdeltids[*ndeltids].ispromising = false; + statusdeltids[*ndeltids].isdead = false; /* for now */ + (*ndeltids)++; + } + else + { + int nitem = BTreeTupleGetNPosting(itup); + + Assert(_bt_posting_valid(itup)); + + for (int i = 0; i < nitem; i++) + { + ItemPointer htid = BTreeTupleGetPostingN(itup, i); + + deltids[*ndeltids].tid = *htid; + deltids[*ndeltids].id = *ndeltids; + statusdeltids[*ndeltids].ioffnum = offnum; + statusdeltids[*ndeltids].ispromising = false; + statusdeltids[*ndeltids].isdead = false; /* for now */ + (*ndeltids)++; + } + } + } + else + { + /* Dups in interval -- store in deltids later */ + state->intervals[state->nintervals].nitems = state->nitems; + /* Increment nintervals, since we wrote a new posting list tuple */ + state->nintervals++; + } + + /* Reset state for next pending posting list */ + state->nhtids = 0; + state->nitems = 0; + state->phystupsize = 0; +} + /* * Determine if page non-pivot tuples (data items) are all duplicates of the - * same value -- if they are, deduplication's "single value" strategy should - * be applied. 
The general goal of this strategy is to ensure that + * same value -- if they are, merge deduplication's "single value" strategy + * should be applied. The general goal of this strategy is to ensure that * nbtsplitloc.c (which uses its own single value strategy) will find a useful * split point as further duplicates are inserted, and successive rightmost * page splits occur among pages that store the same duplicate value. When @@ -531,8 +1105,8 @@ _bt_dedup_finish_pending(Page newpage, BTDedupState state) * here. */ static bool -_bt_do_singleval(Relation rel, Page page, BTDedupState state, - OffsetNumber minoff, IndexTuple newitem) +_bt_do_singleval(Relation rel, Page page, OffsetNumber minoff, + IndexTuple newitem) { int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); ItemId itemid; @@ -809,6 +1383,57 @@ _bt_swap_posting(IndexTuple newitem, IndexTuple oposting, int postingoff) return nposting; } +/* + * qsort-style comparator used by _bt_dedup_delete_pass() + * + * TM_IndexDelete and TM_IndexDeleteStatus are two different structs for + * performance reasons (initial TID sort needs to be very fast because it + * happens before any of the TIDs have been eliminated). It would be more + * natural if there was only one struct with all the fields from each of the + * two structs. + * + * This qsort_arg comparator deals with finding the TM_IndexDeleteStatus of + * the TM_IndexDelete entries that we sort. + */ +static int +_bt_indexdelete_cmp(const void *a, const void *b, void *arg) +{ + TM_IndexDelete *indexdelete1 = (TM_IndexDelete *) a; + TM_IndexDelete *indexdelete2 = (TM_IndexDelete *) b; + TM_IndexDeleteStatus *statusdeltids = (TM_IndexDeleteStatus *) arg; + + OffsetNumber offset1 = (statusdeltids + indexdelete1->id)->ioffnum; + OffsetNumber offset2 = (statusdeltids + indexdelete2->id)->ioffnum; + ItemPointer tid1 = &indexdelete1->tid; + ItemPointer tid2 = &indexdelete2->tid; + + if (offset1 > offset2) + return 1; + if (offset1 < offset2) + return -1; + + /* Must be posting list tuple -- restore TID order */ + { + BlockNumber blk1 = ItemPointerGetBlockNumber(tid1); + BlockNumber blk2 = ItemPointerGetBlockNumber(tid2); + + if (blk1 != blk2) + return (blk1 < blk2) ? -1 : 1; + } + { + OffsetNumber pos1 = ItemPointerGetOffsetNumber(tid1); + OffsetNumber pos2 = ItemPointerGetOffsetNumber(tid2); + + if (pos1 != pos2) + return (pos1 < pos2) ? -1 : 1; + } + + /* ItemPointer values should never be equal */ + Assert(false); + + return 0; +} + /* * Verify posting list invariants for "posting", which must be a posting list * tuple. Used within assertions. 
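To make the scoring heuristic used by _bt_dedup_delete_pass() and heap_index_batch_check() concrete, consider a hypothetical page: suppose the tableam visits the most promising heap block and finds 11 promising TIDs and 4 non-promising TIDs there to be dead. The running score is 11 * 2 + 4 = 26, which meets the minimum score of 25, so the remaining TIDs on that same heap block are still checked (finishing off a page that is already pinned and locked is cheap), but no second or third heap block is accessed. Conversely, if the first block of a unique index yields nothing deletable, the pass gives up before reading a second block.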
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index d36f7557c8..6377a024ba 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -37,6 +37,7 @@ static TransactionId _bt_check_unique(Relation rel, BTInsertState insertstate, static OffsetNumber _bt_findinsertloc(Relation rel, BTInsertState insertstate, bool checkingunique, + bool logicallymodified, BTStack stack, Relation heapRel); static void _bt_stepright(Relation rel, BTInsertState insertstate, BTStack stack); @@ -86,7 +87,9 @@ _bt_doinsert(Relation rel, IndexTuple itup, BTInsertStateData insertstate; BTScanInsert itup_key; BTStack stack; - bool checkingunique = (checkUnique != UNIQUE_CHECK_NO); + bool checkingunique = (checkUnique != UNIQUE_CHECK_NO && + checkUnique != UNIQUE_CHECK_NO_WITH_UNCHANGED); + bool logicallymodified = (checkUnique != UNIQUE_CHECK_NO_WITH_UNCHANGED); /* we need an insertion scan key to do our search, so build one */ itup_key = _bt_mkscankey(rel, itup); @@ -235,7 +238,7 @@ search: * checkingunique. */ newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique, - stack, heapRel); + logicallymodified, stack, heapRel); _bt_insertonpg(rel, itup_key, insertstate.buf, InvalidBuffer, stack, itup, insertstate.itemsz, newitemoff, insertstate.postingoff, false); @@ -767,6 +770,11 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, * the right, rather than the first page. In that case, this function * moves right to the correct target page. * + * If 'logicallymodified' is false, this is for an UPDATE that didn't + * logically change the indexed value, but must nevertheless have a new + * entry to point to a successor version. This hint from the executor + * influences the behavior of deduplication. + * * (In a !heapkeyspace index, there can be multiple pages with the same * high key, where the new tuple could legitimately be placed on. In * that case, the caller passes the first page containing duplicates, @@ -790,6 +798,7 @@ static OffsetNumber _bt_findinsertloc(Relation rel, BTInsertState insertstate, bool checkingunique, + bool logicallymodified, BTStack stack, Relation heapRel) { @@ -873,14 +882,20 @@ _bt_findinsertloc(Relation rel, /* * If the target page is full, see if we can obtain enough space by * erasing LP_DEAD items. If that fails to free enough space, see if - * we can avoid a page split by performing a deduplication pass over - * the page. + * we can avoid a page split by performing deduplication. Usually + * this means a deduplication merge pass, though a deduplication + * delete pass is preferred when it looks like version churn is the + * source of most of the duplicates. * - * We only perform a deduplication pass for a checkingunique caller - * when the incoming item is a duplicate of an existing item on the - * leaf page. This heuristic avoids wasting cycles -- we only expect - * to benefit from deduplicating a unique index page when most or all - * recently added items are duplicates. See nbtree/README. + * We only consider deduplication for a checkingunique caller when the + * incoming item is a known duplicate of an existing item on the leaf + * page. This heuristic avoids wasting cycles. The overarching goal + * within a unique index is to prevent an unnecessary page split + * altogether by delaying splits again and again (the goal is not to + * save space). 
If even one incoming tuple that gets added to this + * page originates with an INSERT statement then a page split is all + * but inevitable anyway --- that's why it's okay that our heuristic + * only considers the current incoming newitem. See nbtree/README. */ if (PageGetFreeSpace(page) < insertstate->itemsz) { @@ -893,16 +908,24 @@ _bt_findinsertloc(Relation rel, uniquedup = true; } - if (itup_key->allequalimage && BTGetDeduplicateItems(rel) && - (!checkingunique || uniquedup) && + if (BTGetDeduplicateItems(rel) && (!checkingunique || uniquedup) && PageGetFreeSpace(page) < insertstate->itemsz) { + bool dedupdelete = !logicallymodified || P_HAS_DUPS(lpageop); + _bt_dedup_one_page(rel, insertstate->buf, heapRel, insertstate->itup, insertstate->itemsz, - checkingunique); + checkingunique, dedupdelete, + itup_key->allequalimage); insertstate->bounds_valid = false; } } + else if (!logicallymodified && !P_HAS_DUPS(lpageop)) + { + lpageop->btpo_flags |= BTP_HAS_DUPS; + + MarkBufferDirtyHint(insertstate->buf, true); + } } else { @@ -1525,11 +1548,11 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, lopaque = (BTPageOpaque) PageGetSpecialPointer(leftpage); /* - * leftpage won't be the root when we're done. Also, clear the SPLIT_END - * and HAS_GARBAGE flags. + * leftpage won't be the root when we're done. Also, clear the SPLIT_END, + * HAS_GARBAGE, and HAS_DUPS flags. */ lopaque->btpo_flags = oopaque->btpo_flags; - lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE); + lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE | BTP_HAS_DUPS); /* set flag in leftpage indicating that rightpage has no downlink yet */ lopaque->btpo_flags |= BTP_INCOMPLETE_SPLIT; lopaque->btpo_prev = oopaque->btpo_prev; @@ -1712,11 +1735,11 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, lopaque->btpo_cycleid = _bt_vacuum_cycleid(rel); /* - * rightpage won't be the root when we're done. Also, clear the SPLIT_END - * and HAS_GARBAGE flags. + * rightpage won't be the root when we're done. Also, clear the + * SPLIT_END, HAS_GARBAGE and HAS_DUPS flags. */ ropaque->btpo_flags = oopaque->btpo_flags; - ropaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE); + ropaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE | BTP_HAS_DUPS); ropaque->btpo_prev = origpagenumber; ropaque->btpo_next = oopaque->btpo_next; ropaque->btpo.level = oopaque->btpo.level; @@ -2659,7 +2682,8 @@ _bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel) } if (ndeletable > 0) - _bt_delitems_delete(rel, buffer, deletable, ndeletable, heapRel); + _bt_delitems_delete(rel, buffer, deletable, ndeletable, NULL, 0, + heapRel, false, InvalidTransactionId); /* * Note: if we didn't find any LP_DEAD items, then the page's diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index 7f392480ac..3acc31bcce 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -1117,8 +1117,7 @@ _bt_page_recyclable(Page page) * We record VACUUMs and b-tree deletes differently in WAL. Deletes must * generate their own latestRemovedXid by accessing the heap directly, whereas * VACUUMs rely on the initial heap scan taking care of it indirectly. Also, - * only VACUUM can perform granular deletes of individual TIDs in posting list - * tuples. + * we remove the VACUUM cycle ID from pages, which b-tree deletes don't do. 
*/ void _bt_delitems_vacuum(Relation rel, Buffer buf, @@ -1277,36 +1276,114 @@ _bt_delitems_vacuum(Relation rel, Buffer buf, * Delete item(s) from a btree leaf page during single-page cleanup. * * This routine assumes that the caller has pinned and write locked the - * buffer. Also, the given deletable array *must* be sorted in ascending - * order. + * buffer. Also, the given deletable and updatable arrays *must* be sorted in + * ascending order. * * This is nearly the same as _bt_delitems_vacuum as far as what it does to * the page, but it needs to generate its own latestRemovedXid by accessing * the heap. This is used by the REDO routine to generate recovery conflicts. - * Also, it doesn't handle posting list tuples unless the entire tuple can be - * deleted as a whole (since there is only one LP_DEAD bit per line pointer). + * Though note that dedup deletion caller will provide its own + * latestRemovedXid, since it's convenient for it to determine that at the + * same point that it determines that the items are dead (it won't set LP_DEAD + * items). Another difference is that we don't clear page's vacuum cycle ID. */ void _bt_delitems_delete(Relation rel, Buffer buf, OffsetNumber *deletable, int ndeletable, - Relation heapRel) + BTVacuumPosting *updatable, int nupdatable, + Relation heapRel, bool isdedup, + TransactionId dedupLatestRemovedXid) { Page page = BufferGetPage(buf); BTPageOpaque opaque; TransactionId latestRemovedXid = InvalidTransactionId; + Size itemsz; + char *updatedbuf = NULL; + Size updatedbuflen = 0; + OffsetNumber updatedoffsets[MaxIndexTuplesPerPage]; /* Shouldn't be called unless there's something to do */ - Assert(ndeletable > 0); + Assert(ndeletable > 0 || nupdatable > 0); + /* Shouldn't update posting lists unless for dedup caller */ + Assert(isdedup || nupdatable == 0); if (XLogStandbyInfoActive() && RelationNeedsWAL(rel)) - latestRemovedXid = - _bt_xid_horizon(rel, heapRel, page, deletable, ndeletable); + { + if (!isdedup) + latestRemovedXid = + _bt_xid_horizon(rel, heapRel, page, deletable, + ndeletable); + else + latestRemovedXid = dedupLatestRemovedXid; + } + + for (int i = 0; i < nupdatable; i++) + { + /* Replace work area IndexTuple with updated version */ + _bt_update_posting(updatable[i]); + + /* Maintain array of updatable page offsets for WAL record */ + updatedoffsets[i] = updatable[i]->updatedoffset; + } + + /* XLOG stuff -- allocate and fill buffer before critical section */ + if (nupdatable > 0 && RelationNeedsWAL(rel)) + { + Size offset = 0; + + for (int i = 0; i < nupdatable; i++) + { + BTVacuumPosting vacposting = updatable[i]; + + itemsz = SizeOfBtreeUpdate + + vacposting->ndeletedtids * sizeof(uint16); + updatedbuflen += itemsz; + } + + updatedbuf = palloc(updatedbuflen); + for (int i = 0; i < nupdatable; i++) + { + BTVacuumPosting vacposting = updatable[i]; + xl_btree_update update; + + update.ndeletedtids = vacposting->ndeletedtids; + memcpy(updatedbuf + offset, &update.ndeletedtids, + SizeOfBtreeUpdate); + offset += SizeOfBtreeUpdate; + + itemsz = update.ndeletedtids * sizeof(uint16); + memcpy(updatedbuf + offset, vacposting->deletetids, itemsz); + offset += itemsz; + } + } /* No ereport(ERROR) until changes are logged */ START_CRIT_SECTION(); - /* Fix the page */ - PageIndexMultiDelete(page, deletable, ndeletable); + /* + * Handle posting tuple updates. + * + * Deliberately do this before handling simple deletes. If we did it the + * other way around (i.e. 
WAL record order -- simple deletes before + * updates) then we'd have to make compensating changes to the 'updatable' + * array of offset numbers. + */ + for (int i = 0; i < nupdatable; i++) + { + OffsetNumber updatedoffset = updatedoffsets[i]; + IndexTuple itup; + + itup = updatable[i]->itup; + itemsz = MAXALIGN(IndexTupleSize(itup)); + if (!PageIndexTupleOverwrite(page, updatedoffset, (Item) itup, + itemsz)) + elog(PANIC, "failed to update partially dead item in block %u of index \"%s\"", + BufferGetBlockNumber(buf), RelationGetRelationName(rel)); + } + + /* Now handle simple deletes of entire tuples */ + if (ndeletable > 0) + PageIndexMultiDelete(page, deletable, ndeletable); /* * Unlike _bt_delitems_vacuum, we *must not* clear the vacuum cycle ID, @@ -1326,6 +1403,7 @@ _bt_delitems_delete(Relation rel, Buffer buf, xlrec_delete.latestRemovedXid = latestRemovedXid; xlrec_delete.ndeleted = ndeletable; + xlrec_delete.nupdated = nupdatable; XLogBeginInsert(); XLogRegisterBuffer(0, buf, REGBUF_STANDARD); @@ -1336,8 +1414,16 @@ _bt_delitems_delete(Relation rel, Buffer buf, * When XLogInsert stores the whole buffer, the array need not be * stored too. */ - XLogRegisterBufData(0, (char *) deletable, - ndeletable * sizeof(OffsetNumber)); + if (ndeletable > 0) + XLogRegisterBufData(0, (char *) deletable, + ndeletable * sizeof(OffsetNumber)); + + if (nupdatable > 0) + { + XLogRegisterBufData(0, (char *) updatedoffsets, + nupdatable * sizeof(OffsetNumber)); + XLogRegisterBufData(0, updatedbuf, updatedbuflen); + } recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE); @@ -1345,6 +1431,13 @@ _bt_delitems_delete(Relation rel, Buffer buf, } END_CRIT_SECTION(); + + /* can't leak memory here */ + if (updatedbuf != NULL) + pfree(updatedbuf); + /* free tuples generated by calling _bt_update_posting() */ + for (int i = 0; i < nupdatable; i++) + pfree(updatable[i]->itup); } /* @@ -1376,7 +1469,6 @@ _bt_xid_horizon(Relation rel, Relation heapRel, Page page, itemid = PageGetItemId(page, deletable[i]); itup = (IndexTuple) PageGetItem(page, itemid); - Assert(ItemIdIsDead(itemid)); Assert(!BTreeTupleIsPivot(itup)); if (!BTreeTupleIsPosting(itup)) diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index efee86784b..ecfe79badb 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -273,7 +273,7 @@ static void _bt_sortaddtup(Page page, Size itemsize, bool newfirstdataitem); static void _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup, Size truncextra); -static void _bt_sort_dedup_finish_pending(BTWriteState *wstate, +static void _bt_dedup_sort_finish_pending(BTWriteState *wstate, BTPageState *state, BTDedupState dstate); static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state); @@ -1068,11 +1068,11 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup, * Finalize pending posting list tuple, and add it to the index. Final tuple * is based on saved base tuple, and saved list of heap TIDs. * - * This is almost like _bt_dedup_finish_pending(), but it adds a new tuple - * using _bt_buildadd(). + * This is almost like _bt_dedup_merge_finish_pending(), but it adds a new + * tuple using _bt_buildadd(). 
*/ static void -_bt_sort_dedup_finish_pending(BTWriteState *wstate, BTPageState *state, +_bt_dedup_sort_finish_pending(BTWriteState *wstate, BTPageState *state, BTDedupState dstate) { Assert(dstate->nitems > 0); @@ -1371,7 +1371,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) * _bt_dedup_save_htid() opted to not merge current item into * pending posting list. */ - _bt_sort_dedup_finish_pending(wstate, state, dstate); + _bt_dedup_sort_finish_pending(wstate, state, dstate); pfree(dstate->base); /* start new pending posting list with itup copy */ @@ -1390,7 +1390,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) * Handle the last item (there must be a last item when the * tuplesort returned one or more tuples) */ - _bt_sort_dedup_finish_pending(wstate, state, dstate); + _bt_dedup_sort_finish_pending(wstate, state, dstate); pfree(dstate->base); pfree(dstate->htids); } diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index bda9be2348..0a8e1d8295 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -530,12 +530,12 @@ btree_xlog_dedup(XLogReaderState *record) } else { - _bt_dedup_finish_pending(newpage, state); + _bt_dedup_merge_finish_pending(newpage, state); _bt_dedup_start_pending(state, itup, offnum); } } - _bt_dedup_finish_pending(newpage, state); + _bt_dedup_merge_finish_pending(newpage, state); Assert(state->nintervals == xlrec->nintervals); Assert(memcmp(state->intervals, intervals, state->nintervals * sizeof(BTDedupInterval)) == 0); @@ -675,7 +675,56 @@ btree_xlog_delete(XLogReaderState *record) page = (Page) BufferGetPage(buffer); - PageIndexMultiDelete(page, (OffsetNumber *) ptr, xlrec->ndeleted); + if (xlrec->nupdated > 0) + { + OffsetNumber *updatedoffsets; + xl_btree_update *updates; + + updatedoffsets = (OffsetNumber *) + (ptr + xlrec->ndeleted * sizeof(OffsetNumber)); + updates = (xl_btree_update *) ((char *) updatedoffsets + + xlrec->nupdated * + sizeof(OffsetNumber)); + + for (int i = 0; i < xlrec->nupdated; i++) + { + BTVacuumPosting vacposting; + IndexTuple origtuple; + ItemId itemid; + Size itemsz; + + itemid = PageGetItemId(page, updatedoffsets[i]); + origtuple = (IndexTuple) PageGetItem(page, itemid); + + vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) + + updates->ndeletedtids * sizeof(uint16)); + vacposting->updatedoffset = updatedoffsets[i]; + vacposting->itup = origtuple; + vacposting->ndeletedtids = updates->ndeletedtids; + memcpy(vacposting->deletetids, + (char *) updates + SizeOfBtreeUpdate, + updates->ndeletedtids * sizeof(uint16)); + + _bt_update_posting(vacposting); + + /* Overwrite updated version of tuple */ + itemsz = MAXALIGN(IndexTupleSize(vacposting->itup)); + if (!PageIndexTupleOverwrite(page, updatedoffsets[i], + (Item) vacposting->itup, itemsz)) + elog(PANIC, "failed to update partially dead item"); + + pfree(vacposting->itup); + pfree(vacposting); + + /* advance to next xl_btree_update from array */ + updates = (xl_btree_update *) + ((char *) updates + SizeOfBtreeUpdate + + updates->ndeletedtids * sizeof(uint16)); + } + } + + if (xlrec->ndeleted > 0) + PageIndexMultiDelete(page, (OffsetNumber *) ptr, xlrec->ndeleted); /* Mark the page as not containing any LP_DEAD items */ opaque = (BTPageOpaque) PageGetSpecialPointer(page); diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c index 6438c45716..a19b1b78c7 100644 --- a/src/backend/access/table/tableam.c +++ 
b/src/backend/access/table/tableam.c @@ -207,9 +207,9 @@ table_beginscan_parallel(Relation relation, ParallelTableScanDesc parallel_scan) /* * To perform that check simply start an index scan, create the necessary * slot, do the heap lookup, and shut everything down again. This could be - * optimized, but is unlikely to matter from a performance POV. If there - * frequently are live index pointers also matching a unique index key, the - * CPU overhead of this routine is unlikely to matter. + * optimized, but is unlikely to matter from a performance POV. Note that + * table_index_batch_check() is optimized in this way, since it is designed + * as a batch operation. * * Note that *tid may be modified when we return true if the AM supports * storing multiple row versions reachable via a single index entry (like @@ -236,6 +236,28 @@ table_index_fetch_tuple_check(Relation rel, return found; } +/* + * Specialized variant of table_index_fetch_tuple_check() that can be used + * by index AMs to perform "bottom up" deletion of duplicate index tuples. + * This is particularly likely to work well with unique indexes. + * + * Note: This routine sorts the deltids array, but does not modify any + * individual entry except to mark it as dead for caller. + * + * Returns total number of deltids that can be killed in index by caller. + */ +int +table_index_batch_check(Relation rel, TM_IndexDelete *deltids, int ndeltids, + Snapshot snapshot, int npromisingkillsneeded) +{ + /* + * TODO -- call heapam's heap_index_batch_check() function here, and make + * nbtdedup.c call here instead of calling heap_index_batch_check() + * directly + */ + + return 0; +} /* ------------------------------------------------------------------------ * Functions for non-modifying operations on individual tuples @@ -356,7 +378,7 @@ simple_table_tuple_update(Relation rel, ItemPointer otid, GetCurrentCommandId(true), snapshot, InvalidSnapshot, true /* wait for commit */ , - &tmfd, &lockmode, update_indexes); + &tmfd, &lockmode, update_indexes, NULL); switch (result) { diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index 36ddcdccdb..816f3702fb 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -2523,7 +2523,7 @@ CopyMultiInsertBufferFlush(CopyMultiInsertInfo *miinfo, recheckIndexes = ExecInsertIndexTuples(resultRelInfo, buffer->slots[i], estate, false, NULL, - NIL); + NIL, NULL); ExecARInsertTriggers(estate, resultRelInfo, slots[i], recheckIndexes, cstate->transition_capture); @@ -3268,7 +3268,8 @@ CopyFrom(CopyState cstate) estate, false, NULL, - NIL); + NIL, + NULL); } /* AFTER ROW INSERT Triggers */ diff --git a/src/backend/executor/execIndexing.c b/src/backend/executor/execIndexing.c index c6b5bcba7b..d171d26b69 100644 --- a/src/backend/executor/execIndexing.c +++ b/src/backend/executor/execIndexing.c @@ -275,7 +275,8 @@ ExecInsertIndexTuples(ResultRelInfo *resultRelInfo, EState *estate, bool noDupErr, bool *specConflict, - List *arbiterIndexes) + List *arbiterIndexes, + Bitmapset *modified_attrs_hint) { ItemPointer tupleid = &slot->tts_tid; List *result = NIL; @@ -389,6 +390,44 @@ ExecInsertIndexTuples(ResultRelInfo *resultRelInfo, else checkUnique = UNIQUE_CHECK_PARTIAL; + /* + * We may have to hint to the index AM that this is a logically unchanged + * index tuple. This happens when we're inserting a duplicate tuple + * just to represent the successor version.
+ */ + if (checkUnique == UNIQUE_CHECK_NO && modified_attrs_hint) + { + bool logicallyModified = false; + + for (int attr = 0; attr < indexInfo->ii_NumIndexAttrs; attr++) + { + int keycol = indexInfo->ii_IndexAttrNumbers[attr]; + + if (keycol > 0) + { + logicallyModified = + bms_is_member(keycol - FirstLowInvalidHeapAttributeNumber, + modified_attrs_hint); + if (logicallyModified) + break; + } + else + { + /* + * XXX: For now we always assume that expression indexes + * and indexes with whole-row vars were not modified by an + * UPDATE (i.e. they just use the dedup delete + * optimization regardless of the details of the UPDATE). + * Review this decision when the high level design is a + * bit better worked out. + */ + } + } + + if (!logicallyModified) + checkUnique = UNIQUE_CHECK_NO_WITH_UNCHANGED; + } + satisfiesConstraint = index_insert(indexRelation, /* index relation */ values, /* array of index Datums */ diff --git a/src/backend/executor/execReplication.c b/src/backend/executor/execReplication.c index 01d26881e7..e97d05b448 100644 --- a/src/backend/executor/execReplication.c +++ b/src/backend/executor/execReplication.c @@ -445,7 +445,7 @@ ExecSimpleRelationInsert(ResultRelInfo *resultRelInfo, if (resultRelInfo->ri_NumIndices > 0) recheckIndexes = ExecInsertIndexTuples(resultRelInfo, slot, estate, false, NULL, - NIL); + NIL, NULL); /* AFTER ROW INSERT Triggers */ ExecARInsertTriggers(estate, resultRelInfo, slot, @@ -513,7 +513,7 @@ ExecSimpleRelationUpdate(ResultRelInfo *resultRelInfo, if (resultRelInfo->ri_NumIndices > 0 && update_indexes) recheckIndexes = ExecInsertIndexTuples(resultRelInfo, slot, estate, false, NULL, - NIL); + NIL, NULL); /* AFTER ROW UPDATE Triggers */ ExecARUpdateTriggers(estate, resultRelInfo, diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index 29e07b7228..d76e371595 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -601,7 +601,7 @@ ExecInsert(ModifyTableState *mtstate, recheckIndexes = ExecInsertIndexTuples(resultRelInfo, slot, estate, true, &specConflict, - arbiterIndexes); + arbiterIndexes, NULL); /* adjust the tuple's state accordingly */ table_tuple_complete_speculative(resultRelationDesc, slot, @@ -640,7 +640,7 @@ ExecInsert(ModifyTableState *mtstate, if (resultRelInfo->ri_NumIndices > 0) recheckIndexes = ExecInsertIndexTuples(resultRelInfo, slot, estate, false, - NULL, NIL); + NULL, NIL, NULL); } } @@ -1219,6 +1219,7 @@ ExecUpdate(ModifyTableState *mtstate, TM_Result result; TM_FailureData tmfd; List *recheckIndexes = NIL; + Bitmapset *modified_attrs_hint = NULL; /* * abort the operation if not running transactions @@ -1382,7 +1383,8 @@ lreplace:; estate->es_snapshot, estate->es_crosscheck_snapshot, true /* wait for commit */ , - &tmfd, &lockmode, &update_indexes); + &tmfd, &lockmode, &update_indexes, + &modified_attrs_hint); switch (result) { @@ -1513,9 +1515,13 @@ lreplace:; /* insert index entries for tuple if necessary */ if (resultRelInfo->ri_NumIndices > 0 && update_indexes) + { recheckIndexes = ExecInsertIndexTuples(resultRelInfo, slot, estate, false, - NULL, NIL); + NULL, NIL, + modified_attrs_hint); + bms_free(modified_attrs_hint); + } } if (canSetTag) -- 2.25.1
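The executor-side decision in execIndexing.c reduces to a loop over the index's key columns: only when none of them appears in the UPDATE's modified-attributes set is the insert downgraded to the "no logical change" variant. Below is a minimal standalone sketch of that decision, not the patch's code: a plain uint64 bitmask stands in for the Bitmapset, sketch_attr_modified() stands in for bms_is_member(), and FIRST_LOW_INVALID_ATTNO is an illustrative stand-in for FirstLowInvalidHeapAttributeNumber.

/*
 * Sketch of choosing between "no check" and "no logical change" hints.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define FIRST_LOW_INVALID_ATTNO (-7)   /* illustrative stand-in value */

typedef enum SketchUniqueCheck
{
    SKETCH_CHECK_NO,                    /* stand-in for UNIQUE_CHECK_NO */
    SKETCH_CHECK_NO_WITH_UNCHANGED      /* "no logical change" variant */
} SketchUniqueCheck;

/* stand-in for bms_is_member() on the modified-attributes hint */
static bool
sketch_attr_modified(uint64 modified_attrs, int keycol)
{
    return (modified_attrs &
            (UINT64_C(1) << (keycol - FIRST_LOW_INVALID_ATTNO))) != 0;
}

static SketchUniqueCheck
sketch_choose_check(const int *indexkeys, int nkeyatts, uint64 modified_attrs)
{
    for (int attr = 0; attr < nkeyatts; attr++)
    {
        int         keycol = indexkeys[attr];

        /* expression/whole-row columns (keycol <= 0) assumed unchanged */
        if (keycol > 0 && sketch_attr_modified(modified_attrs, keycol))
            return SKETCH_CHECK_NO;     /* an indexed column changed */
    }
    return SKETCH_CHECK_NO_WITH_UNCHANGED;  /* no indexed column changed */
}

int
main(void)
{
    int         indexkeys[] = {2, 5};   /* hypothetical index on columns 2, 5 */
    uint64      modified = 0;

    /* pretend the UPDATE only touched column 3 */
    modified |= UINT64_C(1) << (3 - FIRST_LOW_INVALID_ATTNO);

    printf("%s\n",
           sketch_choose_check(indexkeys, 2, modified) ==
           SKETCH_CHECK_NO_WITH_UNCHANGED ?
           "hint: no logical change" : "no hint");
    return 0;
}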