From 205008ee146ac36801b9810331226a99448027de Mon Sep 17 00:00:00 2001 From: nkey Date: Thu, 6 Mar 2025 14:54:44 +0100 Subject: [PATCH v16 05/12] Allow snapshot resets in concurrent unique index builds Previously, concurrent unique index builds used a fixed snapshot for the entire scan to ensure proper uniqueness checks. This could delay vacuum's ability to clean up dead tuples. Now reset snapshots periodically during concurrent unique index builds, while still maintaining uniqueness by: 1. Ignoring dead tuples during uniqueness checks in tuplesort 2. Adding a uniqueness check in _bt_load that detects multiple alive tuples with the same key values This improves vacuum effectiveness during long-running index builds without compromising index uniqueness enforcement. --- src/backend/access/heap/README.HOT | 12 +- src/backend/access/heap/heapam_handler.c | 6 +- src/backend/access/nbtree/nbtdedup.c | 8 +- src/backend/access/nbtree/nbtsort.c | 192 ++++++++++++++---- src/backend/access/nbtree/nbtsplitloc.c | 12 +- src/backend/access/nbtree/nbtutils.c | 29 ++- src/backend/catalog/index.c | 8 +- src/backend/commands/indexcmds.c | 4 +- src/backend/utils/sort/tuplesortvariants.c | 69 +++++-- src/include/access/nbtree.h | 4 +- src/include/access/tableam.h | 5 +- src/include/utils/tuplesort.h | 1 + .../expected/cic_reset_snapshots.out | 6 + 13 files changed, 263 insertions(+), 93 deletions(-) diff --git a/src/backend/access/heap/README.HOT b/src/backend/access/heap/README.HOT index 74e407f375a..829dad1194e 100644 --- a/src/backend/access/heap/README.HOT +++ b/src/backend/access/heap/README.HOT @@ -386,12 +386,12 @@ have the HOT-safety property enforced before we start to build the new index. After waiting for transactions which had the table open, we build the index -for all rows that are valid in a fresh snapshot. Any tuples visible in the -snapshot will have only valid forward-growing HOT chains. 
(They might have -older HOT updates behind them which are broken, but this is OK for the same -reason it's OK in a regular index build.) As above, we point the index -entry at the root of the HOT-update chain but we use the key value from the -live tuple. +for all rows that are valid in a fresh snapshot, which is updated every so +often. Any tuples visible in the snapshot will have only valid forward-growing +HOT chains. (They might have older HOT updates behind them which are broken, +but this is OK for the same reason it's OK in a regular index build.) +As above, we point the index entry at the root of the HOT-update chain but we +use the key value from the live tuple. We mark the index open for inserts (but still not ready for reads) then we again wait for transactions which have the table open. Then we take diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 22274f095ac..fa582d3e2d6 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -1232,15 +1232,15 @@ heapam_index_build_range_scan(Relation heapRelation, * qual checks (because we have to index RECENTLY_DEAD tuples). In a * concurrent build, or during bootstrap, we take a regular MVCC snapshot * and index whatever's live according to that while that snapshot is reset - * every so often (in case of non-unique index). + * every so often. */ OldestXmin = InvalidTransactionId; /* - * For unique index we need consistent snapshot for the whole scan. + * For concurrent builds of non-system indexes, we may want to periodically + * reset snapshots to allow vacuum to clean up tuples. 
*/ reset_snapshots = indexInfo->ii_Concurrent && - !indexInfo->ii_Unique && !is_system_catalog; /* just for the case */ /* okay to ignore lazy VACUUMs here */ diff --git a/src/backend/access/nbtree/nbtdedup.c b/src/backend/access/nbtree/nbtdedup.c index cbe73675f86..5db6d237c2c 100644 --- a/src/backend/access/nbtree/nbtdedup.c +++ b/src/backend/access/nbtree/nbtdedup.c @@ -148,7 +148,7 @@ _bt_dedup_pass(Relation rel, Buffer buf, IndexTuple newitem, Size newitemsz, _bt_dedup_start_pending(state, itup, offnum); } else if (state->deduplicate && - _bt_keep_natts_fast(rel, state->base, itup) > nkeyatts && + _bt_keep_natts_fast(rel, state->base, itup, NULL) > nkeyatts && _bt_dedup_save_htid(state, itup)) { /* @@ -374,7 +374,7 @@ _bt_bottomupdel_pass(Relation rel, Buffer buf, Relation heapRel, /* itup starts first pending interval */ _bt_dedup_start_pending(state, itup, offnum); } - else if (_bt_keep_natts_fast(rel, state->base, itup) > nkeyatts && + else if (_bt_keep_natts_fast(rel, state->base, itup, NULL) > nkeyatts && _bt_dedup_save_htid(state, itup)) { /* Tuple is equal; just added its TIDs to pending interval */ @@ -789,12 +789,12 @@ _bt_do_singleval(Relation rel, Page page, BTDedupState state, itemid = PageGetItemId(page, minoff); itup = (IndexTuple) PageGetItem(page, itemid); - if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts) + if (_bt_keep_natts_fast(rel, newitem, itup, NULL) > nkeyatts) { itemid = PageGetItemId(page, PageGetMaxOffsetNumber(page)); itup = (IndexTuple) PageGetItem(page, itemid); - if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts) + if (_bt_keep_natts_fast(rel, newitem, itup, NULL) > nkeyatts) return true; } diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 810f80fc8e6..f7914ebb3d0 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -83,6 +83,7 @@ typedef struct BTSpool Relation index; bool isunique; bool nulls_not_distinct; + bool unique_dead_ignored; 
} BTSpool; /* @@ -101,6 +102,7 @@ typedef struct BTShared Oid indexrelid; bool isunique; bool nulls_not_distinct; + bool unique_dead_ignored; bool isconcurrent; int scantuplesortstates; @@ -203,15 +205,13 @@ typedef struct BTLeader */ typedef struct BTBuildState { - bool isunique; - bool nulls_not_distinct; bool havedead; Relation heap; BTSpool *spool; /* - * spool2 is needed only when the index is a unique index. Dead tuples are - * put into spool2 instead of spool in order to avoid uniqueness check. + * spool2 is needed only when the index is a unique index and build non-concurrently. + * Dead tuples are put into spool2 instead of spool in order to avoid uniqueness check. */ BTSpool *spool2; double indtuples; @@ -258,7 +258,7 @@ static double _bt_spools_heapscan(Relation heap, Relation index, static void _bt_spooldestroy(BTSpool *btspool); static void _bt_spool(BTSpool *btspool, ItemPointer self, Datum *values, bool *isnull); -static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2, bool reset_snapshots); +static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2, bool isconcurrent); static void _bt_build_callback(Relation index, ItemPointer tid, Datum *values, bool *isnull, bool tupleIsAlive, void *state); static BulkWriteBuffer _bt_blnewpage(BTWriteState *wstate, uint32 level); @@ -303,8 +303,6 @@ btbuild(Relation heap, Relation index, IndexInfo *indexInfo) ResetUsage(); #endif /* BTREE_BUILD_STATS */ - buildstate.isunique = indexInfo->ii_Unique; - buildstate.nulls_not_distinct = indexInfo->ii_NullsNotDistinct; buildstate.havedead = false; buildstate.heap = heap; buildstate.spool = NULL; @@ -321,20 +319,20 @@ btbuild(Relation heap, Relation index, IndexInfo *indexInfo) RelationGetRelationName(index)); reltuples = _bt_spools_heapscan(heap, index, &buildstate, indexInfo); - Assert(!indexInfo->ii_Concurrent || indexInfo->ii_Unique || !TransactionIdIsValid(MyProc->xmin)); + Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); /* * 
Finish the build by (1) completing the sort of the spool file, (2) * inserting the sorted tuples into btree pages and (3) building the upper * levels. Finally, it may also be necessary to end use of parallelism. */ - _bt_leafbuild(buildstate.spool, buildstate.spool2, !indexInfo->ii_Unique && indexInfo->ii_Concurrent); + _bt_leafbuild(buildstate.spool, buildstate.spool2, indexInfo->ii_Concurrent); _bt_spooldestroy(buildstate.spool); if (buildstate.spool2) _bt_spooldestroy(buildstate.spool2); if (buildstate.btleader) _bt_end_parallel(buildstate.btleader); - Assert(!indexInfo->ii_Concurrent || indexInfo->ii_Unique || !TransactionIdIsValid(MyProc->xmin)); + Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); @@ -381,6 +379,11 @@ _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate, btspool->index = index; btspool->isunique = indexInfo->ii_Unique; btspool->nulls_not_distinct = indexInfo->ii_NullsNotDistinct; + /* + * We need to ignore dead tuples for unique checks in case of concurrent build. + * It is required because or periodic reset of snapshot. + */ + btspool->unique_dead_ignored = indexInfo->ii_Concurrent && indexInfo->ii_Unique; /* Save as primary spool */ buildstate->spool = btspool; @@ -429,8 +432,9 @@ _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate, * the use of parallelism or any other factor. */ buildstate->spool->sortstate = - tuplesort_begin_index_btree(heap, index, buildstate->isunique, - buildstate->nulls_not_distinct, + tuplesort_begin_index_btree(heap, index, btspool->isunique, + btspool->nulls_not_distinct, + btspool->unique_dead_ignored, maintenance_work_mem, coordinate, TUPLESORT_NONE); @@ -438,8 +442,12 @@ _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate, * If building a unique index, put dead tuples in a second spool to keep * them out of the uniqueness check. 
We expect that the second spool (for * dead tuples) won't get very full, so we give it only work_mem. + * + * In case of concurrent build dead tuples are not need to be put into index + * since we wait for all snapshots older than reference snapshot during the + * validation phase. */ - if (indexInfo->ii_Unique) + if (indexInfo->ii_Unique && !indexInfo->ii_Concurrent) { BTSpool *btspool2 = (BTSpool *) palloc0(sizeof(BTSpool)); SortCoordinate coordinate2 = NULL; @@ -470,7 +478,7 @@ _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate, * full, so we give it only work_mem */ buildstate->spool2->sortstate = - tuplesort_begin_index_btree(heap, index, false, false, work_mem, + tuplesort_begin_index_btree(heap, index, false, false, false, work_mem, coordinate2, TUPLESORT_NONE); } @@ -483,7 +491,7 @@ _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate, reltuples = _bt_parallel_heapscan(buildstate, &indexInfo->ii_BrokenHotChain); InvalidateCatalogSnapshot(); - Assert(!indexInfo->ii_Concurrent || indexInfo->ii_Unique || !TransactionIdIsValid(MyProc->xmin)); + Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); /* * Set the progress target for the next phase. Reset the block number @@ -539,7 +547,7 @@ _bt_spool(BTSpool *btspool, ItemPointer self, Datum *values, bool *isnull) * create an entire btree. 
*/ static void -_bt_leafbuild(BTSpool *btspool, BTSpool *btspool2, bool reset_snapshots) +_bt_leafbuild(BTSpool *btspool, BTSpool *btspool2, bool isconcurrent) { BTWriteState wstate; @@ -561,7 +569,7 @@ _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2, bool reset_snapshots) PROGRESS_BTREE_PHASE_PERFORMSORT_2); tuplesort_performsort(btspool2->sortstate); } - Assert(!reset_snapshots || !TransactionIdIsValid(MyProc->xmin)); + Assert(!isconcurrent || !TransactionIdIsValid(MyProc->xmin)); wstate.heap = btspool->heap; wstate.index = btspool->index; @@ -575,7 +583,7 @@ _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2, bool reset_snapshots) pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE, PROGRESS_BTREE_PHASE_LEAF_LOAD); - Assert(!reset_snapshots || !TransactionIdIsValid(MyProc->xmin)); + Assert(!isconcurrent || !TransactionIdIsValid(MyProc->xmin)); _bt_load(&wstate, btspool, btspool2); } @@ -1154,13 +1162,117 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) SortSupport sortKeys; int64 tuples_done = 0; bool deduplicate; + bool fail_on_alive_duplicate; wstate->bulkstate = smgr_bulk_start_rel(wstate->index, MAIN_FORKNUM); deduplicate = wstate->inskey->allequalimage && !btspool->isunique && BTGetDeduplicateItems(wstate->index); + /* + * The unique_dead_ignored does not guarantee absence of multiple alive + * tuples with same values exists in the spool. Such thing may happen if + * alive tuples are located between a few dead tuples, like this: addda. 
+ */ + fail_on_alive_duplicate = btspool->unique_dead_ignored; - if (merge) + if (fail_on_alive_duplicate) + { + bool seen_alive = false, + prev_tested = false; + IndexTuple prev = NULL; + TupleTableSlot *slot = MakeSingleTupleTableSlot(RelationGetDescr(wstate->heap), + &TTSOpsBufferHeapTuple); + IndexFetchTableData *fetch = table_index_fetch_begin(wstate->heap); + + Assert(btspool->isunique); + Assert(!btspool2); + + while ((itup = tuplesort_getindextuple(btspool->sortstate, true)) != NULL) + { + bool tuples_equal = false; + + /* When we see first tuple, create first index page */ + if (state == NULL) + state = _bt_pagestate(wstate, 0); + + if (prev != NULL) /* if is not the first tuple */ + { + bool has_nulls = false, + call_again, /* just to pass something */ + ignored, /* just to pass something */ + now_alive; + ItemPointerData tid; + + /* if this tuples equal to previouse one? */ + if (wstate->inskey->allequalimage) + tuples_equal = _bt_keep_natts_fast(wstate->index, prev, itup, &has_nulls) > keysz; + else + tuples_equal = _bt_keep_natts(wstate->index, prev, itup,wstate->inskey, &has_nulls) > keysz; + + /* handle null values correctly */ + if (has_nulls && !btspool->nulls_not_distinct) + tuples_equal = false; + + if (tuples_equal) + { + /* check previous tuple if not yet */ + if (!prev_tested) + { + call_again = false; + tid = prev->t_tid; + seen_alive = table_index_fetch_tuple(fetch, &tid, SnapshotSelf, slot, &call_again, &ignored); + prev_tested = true; + } + + call_again = false; + tid = itup->t_tid; + now_alive = table_index_fetch_tuple(fetch, &tid, SnapshotSelf, slot, &call_again, &ignored); + + /* are multiple alive tuples detected in equal group? 
*/ + if (seen_alive && now_alive) + { + char *key_desc; + TupleDesc tupDes = RelationGetDescr(wstate->index); + bool isnull[INDEX_MAX_KEYS]; + Datum values[INDEX_MAX_KEYS]; + + index_deform_tuple(itup, tupDes, values, isnull); + + key_desc = BuildIndexValueDescription(wstate->index, values, isnull); + + /* keep this message in sync with the same in comparetup_index_btree_tiebreak */ + ereport(ERROR, + (errcode(ERRCODE_UNIQUE_VIOLATION), + errmsg("could not create unique index \"%s\"", + RelationGetRelationName(wstate->index)), + key_desc ? errdetail("Key %s is duplicated.", key_desc) : + errdetail("Duplicate keys exist."), + errtableconstraint(wstate->heap, + RelationGetRelationName(wstate->index)))); + } + seen_alive |= now_alive; + } + } + + if (!tuples_equal) + { + seen_alive = false; + prev_tested = false; + } + + _bt_buildadd(wstate, state, itup, 0); + if (prev) pfree(prev); + prev = CopyIndexTuple(itup); + + /* Report progress */ + pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE, + ++tuples_done); + } + ExecDropSingleTupleTableSlot(slot); + table_index_fetch_end(fetch); + Assert(!TransactionIdIsValid(MyProc->xmin)); + } + else if (merge) { /* * Another BTSpool for dead tuples exists. Now we have to merge @@ -1321,7 +1433,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) InvalidOffsetNumber); } else if (_bt_keep_natts_fast(wstate->index, dstate->base, - itup) > keysz && + itup, NULL) > keysz && _bt_dedup_save_htid(dstate, itup)) { /* @@ -1418,7 +1530,6 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) BufferUsage *bufferusage; bool leaderparticipates = true; bool need_pop_active_snapshot = true; - bool reset_snapshot; int querylen; #ifdef DISABLE_LEADER_PARTICIPATION @@ -1436,21 +1547,12 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) scantuplesortstates = leaderparticipates ? 
request + 1 : request; - /* - * For concurrent non-unique index builds, we can periodically reset snapshots - * to allow the xmin horizon to advance. This is safe since these builds don't - * require a consistent view across the entire scan. Unique indexes still need - * a stable snapshot to properly enforce uniqueness constraints. - */ - reset_snapshot = isconcurrent && !btspool->isunique; - /* * Prepare for scan of the base relation. In a normal index build, we use * SnapshotAny because we must retrieve all tuples and do our own time * qual checks (because we have to index RECENTLY_DEAD tuples). In a * concurrent build, we take a regular MVCC snapshot and index whatever's - * live according to that, while that snapshot may be reset periodically in - * case of non-unique index. + * live according to that, while that snapshot may be reset periodically. */ if (!isconcurrent) { @@ -1458,16 +1560,16 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) snapshot = SnapshotAny; need_pop_active_snapshot = false; } - else if (reset_snapshot) + else { + /* + * For concurrent index builds, we can periodically reset snapshots to allow + * the xmin horizon to advance. This is safe since these builds don't + * require a consistent view across the entire scan. 
+ */ snapshot = InvalidSnapshot; PushActiveSnapshot(GetTransactionSnapshot()); } - else - { - snapshot = RegisterSnapshot(GetTransactionSnapshot()); - PushActiveSnapshot(snapshot); - } /* * Estimate size for our own PARALLEL_KEY_BTREE_SHARED workspace, and @@ -1537,6 +1639,7 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) btshared->indexrelid = RelationGetRelid(btspool->index); btshared->isunique = btspool->isunique; btshared->nulls_not_distinct = btspool->nulls_not_distinct; + btshared->unique_dead_ignored = btspool->unique_dead_ignored; btshared->isconcurrent = isconcurrent; btshared->scantuplesortstates = scantuplesortstates; btshared->queryid = pgstat_get_my_query_id(); @@ -1551,7 +1654,7 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) table_parallelscan_initialize(btspool->heap, ParallelTableScanFromBTShared(btshared), snapshot, - reset_snapshot); + isconcurrent); /* * Store shared tuplesort-private state, for which we reserved space. @@ -1631,7 +1734,7 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) * In case of concurrent build snapshots are going to be reset periodically. * Wait until all workers imported initial snapshot. */ - if (reset_snapshot) + if (isconcurrent) WaitForParallelWorkersToAttach(pcxt, true); /* Join heap scan ourselves */ @@ -1642,7 +1745,7 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) * Caller needs to wait for all launched workers when we return. Make * sure that the failure-to-start case will not hang forever. 
*/ - if (!reset_snapshot) + if (!isconcurrent) WaitForParallelWorkersToAttach(pcxt, false); if (need_pop_active_snapshot) PopActiveSnapshot(); @@ -1745,6 +1848,7 @@ _bt_leader_participate_as_worker(BTBuildState *buildstate) leaderworker->index = buildstate->spool->index; leaderworker->isunique = buildstate->spool->isunique; leaderworker->nulls_not_distinct = buildstate->spool->nulls_not_distinct; + leaderworker->unique_dead_ignored = buildstate->spool->unique_dead_ignored; /* Initialize second spool, if required */ if (!btleader->btshared->isunique) @@ -1848,11 +1952,12 @@ _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc) btspool->index = indexRel; btspool->isunique = btshared->isunique; btspool->nulls_not_distinct = btshared->nulls_not_distinct; + btspool->unique_dead_ignored = btshared->unique_dead_ignored; /* Look up shared state private to tuplesort.c */ sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false); tuplesort_attach_shared(sharedsort, seg); - if (!btshared->isunique) + if (!btshared->isunique || btshared->isconcurrent) { btspool2 = NULL; sharedsort2 = NULL; @@ -1932,6 +2037,7 @@ _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, btspool->index, btspool->isunique, btspool->nulls_not_distinct, + btspool->unique_dead_ignored, sortmem, coordinate, TUPLESORT_NONE); @@ -1954,14 +2060,12 @@ _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, coordinate2->nParticipants = -1; coordinate2->sharedsort = sharedsort2; btspool2->sortstate = - tuplesort_begin_index_btree(btspool->heap, btspool->index, false, false, + tuplesort_begin_index_btree(btspool->heap, btspool->index, false, false, false, Min(sortmem, work_mem), coordinate2, false); } /* Fill in buildstate for _bt_build_callback() */ - buildstate.isunique = btshared->isunique; - buildstate.nulls_not_distinct = btshared->nulls_not_distinct; buildstate.havedead = false; buildstate.heap = btspool->heap; buildstate.spool = btspool; diff --git 
a/src/backend/access/nbtree/nbtsplitloc.c b/src/backend/access/nbtree/nbtsplitloc.c index e6c9aaa0454..7cb1f3e1bc6 100644 --- a/src/backend/access/nbtree/nbtsplitloc.c +++ b/src/backend/access/nbtree/nbtsplitloc.c @@ -687,7 +687,7 @@ _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff, { itemid = PageGetItemId(state->origpage, maxoff); tup = (IndexTuple) PageGetItem(state->origpage, itemid); - keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem); + keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem, NULL); if (keepnatts > 1 && keepnatts <= nkeyatts) { @@ -718,7 +718,7 @@ _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff, !_bt_adjacenthtid(&tup->t_tid, &state->newitem->t_tid)) return false; /* Check same conditions as rightmost item case, too */ - keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem); + keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem, NULL); if (keepnatts > 1 && keepnatts <= nkeyatts) { @@ -967,7 +967,7 @@ _bt_strategy(FindSplitData *state, SplitPoint *leftpage, * avoid appending a heap TID in new high key, we're done. Finish split * with default strategy and initial split interval. */ - perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost); + perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost, NULL); if (perfectpenalty <= indnkeyatts) return perfectpenalty; @@ -988,7 +988,7 @@ _bt_strategy(FindSplitData *state, SplitPoint *leftpage, * If page is entirely full of duplicates, a single value strategy split * will be performed. 
*/ - perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost); + perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost, NULL); if (perfectpenalty <= indnkeyatts) { *strategy = SPLIT_MANY_DUPLICATES; @@ -1027,7 +1027,7 @@ _bt_strategy(FindSplitData *state, SplitPoint *leftpage, itemid = PageGetItemId(state->origpage, P_HIKEY); hikey = (IndexTuple) PageGetItem(state->origpage, itemid); perfectpenalty = _bt_keep_natts_fast(state->rel, hikey, - state->newitem); + state->newitem, NULL); if (perfectpenalty <= indnkeyatts) *strategy = SPLIT_SINGLE_VALUE; else @@ -1149,7 +1149,7 @@ _bt_split_penalty(FindSplitData *state, SplitPoint *split) lastleft = _bt_split_lastleft(state, split); firstright = _bt_split_firstright(state, split); - return _bt_keep_natts_fast(state->rel, lastleft, firstright); + return _bt_keep_natts_fast(state->rel, lastleft, firstright, NULL); } /* diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index 693e43c674b..f9695fba8b5 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -51,8 +51,6 @@ static bool _bt_check_rowcompare(ScanKey skey, ScanDirection dir, bool *continuescan); static void _bt_checkkeys_look_ahead(IndexScanDesc scan, BTReadPageState *pstate, int tupnatts, TupleDesc tupdesc); -static int _bt_keep_natts(Relation rel, IndexTuple lastleft, - IndexTuple firstright, BTScanInsert itup_key); /* @@ -2828,7 +2826,7 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, Assert(!BTreeTupleIsPivot(lastleft) && !BTreeTupleIsPivot(firstright)); /* Determine how many attributes must be kept in truncated tuple */ - keepnatts = _bt_keep_natts(rel, lastleft, firstright, itup_key); + keepnatts = _bt_keep_natts(rel, lastleft, firstright, itup_key, NULL); #ifdef DEBUG_NO_TRUNCATE /* Force truncation to be ineffective for testing purposes */ @@ -2946,17 +2944,24 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple 
firstright, /* * _bt_keep_natts - how many key attributes to keep when truncating. * + * This is exported to be used as comparison function during concurrent + * unique index build in case _bt_keep_natts_fast is not suitable because + * collation is not "allequalimage"/deduplication-safe. + * * Caller provides two tuples that enclose a split point. Caller's insertion * scankey is used to compare the tuples; the scankey's argument values are * not considered here. * + * hasnulls value set to true in case of any null column in any tuple. + * * This can return a number of attributes that is one greater than the * number of key attributes for the index relation. This indicates that the * caller must use a heap TID as a unique-ifier in new pivot tuple. */ -static int +int _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, - BTScanInsert itup_key) + BTScanInsert itup_key, + bool *hasnulls) { int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); TupleDesc itupdesc = RelationGetDescr(rel); @@ -2982,6 +2987,8 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, datum1 = index_getattr(lastleft, attnum, itupdesc, &isNull1); datum2 = index_getattr(firstright, attnum, itupdesc, &isNull2); + if (hasnulls) + (*hasnulls) |= (isNull1 || isNull2); if (isNull1 != isNull2) break; @@ -3001,7 +3008,7 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, * expected in an allequalimage index. */ Assert(!itup_key->allequalimage || - keepnatts == _bt_keep_natts_fast(rel, lastleft, firstright)); + keepnatts == _bt_keep_natts_fast(rel, lastleft, firstright, NULL)); return keepnatts; } @@ -3012,7 +3019,8 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, * This is exported so that a candidate split point can have its effect on * suffix truncation inexpensively evaluated ahead of time when finding a * split location. A naive bitwise approach to datum comparisons is used to - * save cycles. 
+ * save cycles. Also, it may be used as comparison function during concurrent + * build of unique index. * * The approach taken here usually provides the same answer as _bt_keep_natts * will (for the same pair of tuples from a heapkeyspace index), since the @@ -3021,6 +3029,8 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, * "equal image" columns, routine is guaranteed to give the same result as * _bt_keep_natts would. * + * hasnulls value set to true in case of any null column in any tuple. + * * Callers can rely on the fact that attributes considered equal here are * definitely also equal according to _bt_keep_natts, even when the index uses * an opclass or collation that is not "allequalimage"/deduplication-safe. @@ -3029,7 +3039,8 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, * more balanced split point. */ int -_bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright) +_bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright, + bool *hasnulls) { TupleDesc itupdesc = RelationGetDescr(rel); int keysz = IndexRelationGetNumberOfKeyAttributes(rel); @@ -3046,6 +3057,8 @@ _bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright) datum1 = index_getattr(lastleft, attnum, itupdesc, &isNull1); datum2 = index_getattr(firstright, attnum, itupdesc, &isNull2); + if (hasnulls) + *hasnulls |= (isNull1 | isNull2); att = TupleDescCompactAttr(itupdesc, attnum - 1); if (isNull1 != isNull2) diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index 482d9a1786d..e369ad0b723 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -1531,7 +1531,7 @@ index_concurrently_build(Oid heapRelationId, /* Invalidate catalog snapshot just for assert */ InvalidateCatalogSnapshot(); - Assert(indexInfo->ii_Unique || !TransactionIdIsValid(MyProc->xmin)); + Assert(!TransactionIdIsValid(MyProc->xmin)); /* Roll back any GUC changes executed by 
index functions */ AtEOXact_GUC(false, save_nestlevel); @@ -3301,9 +3301,9 @@ IndexCheckExclusion(Relation heapRelation, * if we used HeapTupleSatisfiesVacuum). This leaves us with an index that * does not contain any tuples added to the table while we built the index. * - * Furthermore, in case of non-unique index we set SO_RESET_SNAPSHOT for the - * scan, which causes new snapshot to be set as active every so often. The reason - * for that is to propagate the xmin horizon forward. + * Furthermore, we set SO_RESET_SNAPSHOT for the scan, which causes new + * snapshot to be set as active every so often. The reason for that is to + * propagate the xmin horizon forward. * * Next, we mark the index "indisready" (but still not "indisvalid") and * commit the second transaction and start a third. Again we wait for all diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index 36b875945d3..4b50d6ee8cf 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -1700,8 +1700,8 @@ DefineIndex(Oid tableId, * chains can be created where the new tuple and the old tuple in the * chain have different index keys. * - * We build the index using all tuples that are visible using single or - * multiple refreshing snapshots. We can be sure that any HOT updates to + * We build the index using all tuples that are visible using multiple + * refreshing snapshots. We can be sure that any HOT updates to * these tuples will be compatible with the index, since any updates made * by transactions that didn't know about the index are now committed or * rolled back. 
Thus, each visible tuple is either the end of its diff --git a/src/backend/utils/sort/tuplesortvariants.c b/src/backend/utils/sort/tuplesortvariants.c index eb8601e2257..18f90d46a73 100644 --- a/src/backend/utils/sort/tuplesortvariants.c +++ b/src/backend/utils/sort/tuplesortvariants.c @@ -32,6 +32,7 @@ #include "utils/guc.h" #include "utils/lsyscache.h" #include "utils/tuplesort.h" +#include "storage/proc.h" /* sort-type codes for sort__start probes */ @@ -133,6 +134,7 @@ typedef struct bool enforceUnique; /* complain if we find duplicate tuples */ bool uniqueNullsNotDistinct; /* unique constraint null treatment */ + bool uniqueDeadIgnored; /* ignore dead tuples in unique check */ } TuplesortIndexBTreeArg; /* @@ -359,6 +361,7 @@ tuplesort_begin_index_btree(Relation heapRel, Relation indexRel, bool enforceUnique, bool uniqueNullsNotDistinct, + bool uniqueDeadIgnored, int workMem, SortCoordinate coordinate, int sortopt) @@ -401,6 +404,7 @@ tuplesort_begin_index_btree(Relation heapRel, arg->index.indexRel = indexRel; arg->enforceUnique = enforceUnique; arg->uniqueNullsNotDistinct = uniqueNullsNotDistinct; + arg->uniqueDeadIgnored = uniqueDeadIgnored; indexScanKey = _bt_mkscankey(indexRel, NULL); @@ -1655,6 +1659,7 @@ comparetup_index_btree_tiebreak(const SortTuple *a, const SortTuple *b, Datum values[INDEX_MAX_KEYS]; bool isnull[INDEX_MAX_KEYS]; char *key_desc; + bool uniqueCheckFail = true; /* * Some rather brain-dead implementations of qsort (such as the one in @@ -1664,18 +1669,58 @@ comparetup_index_btree_tiebreak(const SortTuple *a, const SortTuple *b, */ Assert(tuple1 != tuple2); - index_deform_tuple(tuple1, tupDes, values, isnull); - - key_desc = BuildIndexValueDescription(arg->index.indexRel, values, isnull); - - ereport(ERROR, - (errcode(ERRCODE_UNIQUE_VIOLATION), - errmsg("could not create unique index \"%s\"", - RelationGetRelationName(arg->index.indexRel)), - key_desc ? 
errdetail("Key %s is duplicated.", key_desc) : - errdetail("Duplicate keys exist."), - errtableconstraint(arg->index.heapRel, - RelationGetRelationName(arg->index.indexRel)))); + /* This is fail-fast check, see _bt_load for details. */ + if (arg->uniqueDeadIgnored) + { + bool any_tuple_dead, + call_again = false, + ignored; + + TupleTableSlot *slot = MakeSingleTupleTableSlot(RelationGetDescr(arg->index.heapRel), + &TTSOpsBufferHeapTuple); + ItemPointerData tid = tuple1->t_tid; + + IndexFetchTableData *fetch = table_index_fetch_begin(arg->index.heapRel); + any_tuple_dead = !table_index_fetch_tuple(fetch, &tid, SnapshotSelf, slot, &call_again, &ignored); + + if (!any_tuple_dead) + { + call_again = false; + tid = tuple2->t_tid; + any_tuple_dead = !table_index_fetch_tuple(fetch, &tuple2->t_tid, SnapshotSelf, slot, &call_again, + &ignored); + } + + if (any_tuple_dead) + { + elog(DEBUG5, "skipping duplicate values because some of them are dead: (%u,%u) vs (%u,%u)", + ItemPointerGetBlockNumber(&tuple1->t_tid), + ItemPointerGetOffsetNumber(&tuple1->t_tid), + ItemPointerGetBlockNumber(&tuple2->t_tid), + ItemPointerGetOffsetNumber(&tuple2->t_tid)); + + uniqueCheckFail = false; + } + ExecDropSingleTupleTableSlot(slot); + table_index_fetch_end(fetch); + Assert(!TransactionIdIsValid(MyProc->xmin)); + } + if (uniqueCheckFail) + { + index_deform_tuple(tuple1, tupDes, values, isnull); + + key_desc = BuildIndexValueDescription(arg->index.indexRel, values, isnull); + + /* keep this error message in sync with the same in _bt_load */ + ereport(ERROR, + (errcode(ERRCODE_UNIQUE_VIOLATION), + errmsg("could not create unique index \"%s\"", + RelationGetRelationName(arg->index.indexRel)), + key_desc ? 
errdetail("Key %s is duplicated.", key_desc) : + errdetail("Duplicate keys exist."), + errtableconstraint(arg->index.heapRel, + RelationGetRelationName(arg->index.indexRel)))); + } } /* diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index e4fdeca3402..d22a9797ad0 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -1314,8 +1314,10 @@ extern bool btproperty(Oid index_oid, int attno, extern char *btbuildphasename(int64 phasenum); extern IndexTuple _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, BTScanInsert itup_key); +extern int _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, + BTScanInsert itup_key, bool *hasnulls); extern int _bt_keep_natts_fast(Relation rel, IndexTuple lastleft, - IndexTuple firstright); + IndexTuple firstright, bool *hasnulls); extern bool _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum); extern void _bt_check_third_page(Relation rel, Relation heap, diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 313394d92c6..b1920999f12 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -1800,9 +1800,8 @@ table_scan_analyze_next_tuple(TableScanDesc scan, TransactionId OldestXmin, * This only really makes sense for heap AM, it might need to be generalized * for other AMs later. * - * In case of non-unique concurrent index build SO_RESET_SNAPSHOT is applied - * for the scan. That leads for changing snapshots on the fly to allow xmin - * horizon propagate. + * In case of a concurrent index build, SO_RESET_SNAPSHOT is applied for the scan. + * That leads to changing snapshots on the fly to allow the xmin horizon to propagate. 
*/ static inline double table_index_build_scan(Relation table_rel, diff --git a/src/include/utils/tuplesort.h b/src/include/utils/tuplesort.h index ef79f259f93..eb9bc30e5da 100644 --- a/src/include/utils/tuplesort.h +++ b/src/include/utils/tuplesort.h @@ -429,6 +429,7 @@ extern Tuplesortstate *tuplesort_begin_index_btree(Relation heapRel, Relation indexRel, bool enforceUnique, bool uniqueNullsNotDistinct, + bool uniqueDeadIgnored, int workMem, SortCoordinate coordinate, int sortopt); extern Tuplesortstate *tuplesort_begin_index_hash(Relation heapRel, diff --git a/src/test/modules/injection_points/expected/cic_reset_snapshots.out b/src/test/modules/injection_points/expected/cic_reset_snapshots.out index 595a4000ce0..9f03fa3033c 100644 --- a/src/test/modules/injection_points/expected/cic_reset_snapshots.out +++ b/src/test/modules/injection_points/expected/cic_reset_snapshots.out @@ -41,7 +41,11 @@ END; $$; ---------------- ALTER TABLE cic_reset_snap.tbl SET (parallel_workers=0); CREATE UNIQUE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots @@ -86,7 +90,9 @@ SELECT injection_points_detach('heap_reset_scan_snapshot_effective'); (1 row) CREATE UNIQUE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +NOTICE: notice triggered for injection point table_parallelscan_initialize REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_parallelscan_initialize DROP INDEX CONCURRENTLY 
cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); NOTICE: notice triggered for injection point table_parallelscan_initialize -- 2.43.0