From 27aa9e530fc5cb322efc7ada388ac635532f01a9 Mon Sep 17 00:00:00 2001
From: Mikhail Nikalayeu
Date: Wed, 14 Jan 2026 17:02:34 +0100
Subject: [PATCH v5 4/4] Support snapshot resets in concurrent builds of unique indexes

Previously, concurrent builds of unique indexes used a fixed snapshot for
the entire scan to ensure proper uniqueness checks. Now snapshots are reset
periodically during concurrent unique index builds, while uniqueness is
still maintained by:

- ignoring SnapshotSelf-dead tuples during uniqueness checks in tuplesort,
  not as a guarantee but as a fail-fast mechanism
- adding a uniqueness check in _bt_load that detects multiple live tuples
  with the same key values, as the actual guarantee of correctness

Tuples are tested against SnapshotSelf only when index key values are
equal; otherwise _bt_load works as before.
---
 src/backend/access/heap/README.HOT         |  12 +-
 src/backend/access/heap/heapam_handler.c   |   7 +-
 src/backend/access/nbtree/nbtdedup.c       |   8 +-
 src/backend/access/nbtree/nbtreadpage.c    |   2 +-
 src/backend/access/nbtree/nbtsort.c        | 175 +++++++++++++++---
 src/backend/access/nbtree/nbtsplitloc.c    |  12 +-
 src/backend/access/nbtree/nbtutils.c       |  30 ++-
 src/backend/catalog/index.c                |   6 +-
 src/backend/commands/indexcmds.c           |   4 +-
 src/backend/utils/sort/tuplesortvariants.c |  80 ++++++--
 src/include/access/nbtree.h                |   4 +-
 src/include/access/tableam.h               |   7 +-
 src/include/catalog/index.h                |   2 -
 src/include/utils/tuplesort.h              |   1 +
 .../expected/cic_reset_snapshots.out       |   6 +
 15 files changed, 274 insertions(+), 82 deletions(-)

diff --git a/src/backend/access/heap/README.HOT b/src/backend/access/heap/README.HOT
index 74e407f375a..829dad1194e 100644
--- a/src/backend/access/heap/README.HOT
+++ b/src/backend/access/heap/README.HOT
@@ -386,12 +386,12 @@ have the HOT-safety property enforced before we start to build the new
 index.
 
 After waiting for transactions which had the table open, we build the index
-for all rows that are valid in a fresh snapshot. Any tuples visible in the
-snapshot will have only valid forward-growing HOT chains. (They might have
-older HOT updates behind them which are broken, but this is OK for the same
-reason it's OK in a regular index build.) As above, we point the index
-entry at the root of the HOT-update chain but we use the key value from the
-live tuple.
+for all rows that are valid in a fresh snapshot, which is updated every so
+often. Any tuples visible in the snapshot will have only valid forward-growing
+HOT chains. (They might have older HOT updates behind them which are broken,
+but this is OK for the same reason it's OK in a regular index build.)
+As above, we point the index entry at the root of the HOT-update chain but we
+use the key value from the live tuple.
 
 We mark the index open for inserts (but still not ready for reads) then we
 again wait for transactions which have the table open. Then we take
diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index fb2d61360c9..00b1b5ac1fb 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -1165,13 +1165,14 @@ heapam_index_build_range_scan(Relation heapRelation,
	 * qual checks (because we have to index RECENTLY_DEAD tuples). In a
	 * concurrent build, or during bootstrap, we take a regular MVCC snapshot
	 * and index whatever's live according to that while that snapshot is reset
-	 * every so often (in the case of non-unique index).
+	 * every so often.
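+	 * (See IndexBuildResetsSnapshots for the exact conditions.)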
*/ OldestXmin = InvalidTransactionId; /* - * For unique indexes we need a consistent snapshot for the whole scan. - * Resetting snapshots also doesn't work in xact-snapshot isolation modes, + * For concurrent builds of non-system indexes, we want to periodically + * reset snapshots. + * Resetting snapshots doesn't work in xact-snapshot isolation modes, * because those keep a registered transaction snapshot for the whole xact. */ reset_snapshots = IndexBuildResetsSnapshots(indexInfo) && diff --git a/src/backend/access/nbtree/nbtdedup.c b/src/backend/access/nbtree/nbtdedup.c index af7affdf409..10f4f7eeba9 100644 --- a/src/backend/access/nbtree/nbtdedup.c +++ b/src/backend/access/nbtree/nbtdedup.c @@ -149,7 +149,7 @@ _bt_dedup_pass(Relation rel, Buffer buf, IndexTuple newitem, Size newitemsz, _bt_dedup_start_pending(state, itup, offnum); } else if (state->deduplicate && - _bt_keep_natts_fast(rel, state->base, itup) > nkeyatts && + _bt_keep_natts_fast(rel, state->base, itup, NULL) > nkeyatts && _bt_dedup_save_htid(state, itup)) { /* @@ -376,7 +376,7 @@ _bt_bottomupdel_pass(Relation rel, Buffer buf, Relation heapRel, /* itup starts first pending interval */ _bt_dedup_start_pending(state, itup, offnum); } - else if (_bt_keep_natts_fast(rel, state->base, itup) > nkeyatts && + else if (_bt_keep_natts_fast(rel, state->base, itup, NULL) > nkeyatts && _bt_dedup_save_htid(state, itup)) { /* Tuple is equal; just added its TIDs to pending interval */ @@ -789,12 +789,12 @@ _bt_do_singleval(Relation rel, Page page, BTDedupState state, itemid = PageGetItemId(page, minoff); itup = (IndexTuple) PageGetItem(page, itemid); - if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts) + if (_bt_keep_natts_fast(rel, newitem, itup, NULL) > nkeyatts) { itemid = PageGetItemId(page, PageGetMaxOffsetNumber(page)); itup = (IndexTuple) PageGetItem(page, itemid); - if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts) + if (_bt_keep_natts_fast(rel, newitem, itup, NULL) > nkeyatts) return true; } diff --git a/src/backend/access/nbtree/nbtreadpage.c b/src/backend/access/nbtree/nbtreadpage.c index 2ba1ca66023..c4c278e777a 100644 --- a/src/backend/access/nbtree/nbtreadpage.c +++ b/src/backend/access/nbtree/nbtreadpage.c @@ -623,7 +623,7 @@ _bt_set_startikey(IndexScanDesc scan, BTReadPageState *pstate) lasttup = (IndexTuple) PageGetItem(pstate->page, iid); /* Determine the first attribute whose values change on caller's page */ - firstchangingattnum = _bt_keep_natts_fast(rel, firsttup, lasttup); + firstchangingattnum = _bt_keep_natts_fast(rel, firsttup, lasttup, NULL); for (; startikey < so->numberOfKeys; startikey++) { diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 6bb365c951d..e02a4a60a8b 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -87,6 +87,7 @@ typedef struct BTSpool Relation index; bool isunique; bool nulls_not_distinct; + bool unique_dead_ignored; } BTSpool; /* @@ -105,6 +106,7 @@ typedef struct BTShared Oid indexrelid; bool isunique; bool nulls_not_distinct; + bool unique_dead_ignored; bool isconcurrent; int scantuplesortstates; @@ -207,15 +209,14 @@ typedef struct BTLeader */ typedef struct BTBuildState { - bool isunique; - bool nulls_not_distinct; bool havedead; Relation heap; BTSpool *spool; /* - * spool2 is needed only when the index is a unique index. Dead tuples are - * put into spool2 instead of spool in order to avoid uniqueness check. 
+	 * spool2 is needed only when the index is a unique index and is built
+	 * non-concurrently. Dead tuples are put into spool2 instead of spool
+	 * in order to avoid the uniqueness check.
	 */
	BTSpool    *spool2;
	double		indtuples;
@@ -262,7 +263,7 @@ static double _bt_spools_heapscan(Relation heap, Relation index,
 static void _bt_spooldestroy(BTSpool *btspool);
 static void _bt_spool(BTSpool *btspool, const ItemPointerData *self,
					   const Datum *values, const bool *isnull);
-static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2, bool reset_snapshots);
+static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2, bool isconcurrent);
 static void _bt_build_callback(Relation index, ItemPointer tid, Datum *values,
							    bool *isnull, bool tupleIsAlive, void *state);
 static BulkWriteBuffer _bt_blnewpage(BTWriteState *wstate, uint32 level);
@@ -307,8 +308,6 @@ btbuild(Relation heap, Relation index, IndexInfo *indexInfo)
	ResetUsage();
 #endif							/* BTREE_BUILD_STATS */
 
-	buildstate.isunique = indexInfo->ii_Unique;
-	buildstate.nulls_not_distinct = indexInfo->ii_NullsNotDistinct;
	buildstate.havedead = false;
	buildstate.heap = heap;
	buildstate.spool = NULL;
@@ -385,6 +384,12 @@ _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate,
	btspool->index = index;
	btspool->isunique = indexInfo->ii_Unique;
	btspool->nulls_not_distinct = indexInfo->ii_NullsNotDistinct;
+	/*
+	 * Dead tuples need to be ignored during uniqueness checks only when a
+	 * concurrent unique build actually uses periodic snapshot resets.
+	 */
+	btspool->unique_dead_ignored = IndexBuildResetsSnapshots(indexInfo) &&
+		indexInfo->ii_Unique;
 
	/* Save as primary spool */
	buildstate->spool = btspool;
@@ -433,8 +438,9 @@ _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate,
	 * the use of parallelism or any other factor.
	 */
	buildstate->spool->sortstate =
-		tuplesort_begin_index_btree(heap, index, buildstate->isunique,
-									buildstate->nulls_not_distinct,
+		tuplesort_begin_index_btree(heap, index, btspool->isunique,
+									btspool->nulls_not_distinct,
+									btspool->unique_dead_ignored,
									maintenance_work_mem, coordinate,
									TUPLESORT_NONE);
 
	/*
	 * If building a unique index, put dead tuples in a second spool to keep
	 * them out of the uniqueness check. We expect that the second spool (for
	 * dead tuples) won't get very full, so we give it only work_mem.
+	 *
+	 * In the case of a concurrent build, dead tuples do not need to be put
+	 * into the index at all, since during the validation phase we wait for
+	 * all snapshots older than the reference snapshot.
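+	 * (After that wait, no transaction can see those dead tuples anymore,
+	 * so the index does not need entries for them.)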
	 */
-	if (indexInfo->ii_Unique)
+	if (indexInfo->ii_Unique && !IndexBuildResetsSnapshots(indexInfo))
	{
		BTSpool    *btspool2 = palloc0_object(BTSpool);
		SortCoordinate coordinate2 = NULL;
@@ -474,7 +484,7 @@ _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate,
		 * full, so we give it only work_mem
		 */
		buildstate->spool2->sortstate =
-			tuplesort_begin_index_btree(heap, index, false, false, work_mem,
+			tuplesort_begin_index_btree(heap, index, false, false, false, work_mem,
										coordinate2, TUPLESORT_NONE);
	}
 
@@ -1156,13 +1166,118 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
	SortSupport sortKeys;
	int64		tuples_done = 0;
	bool		deduplicate;
+	bool		fail_on_alive_duplicate;
 
	wstate->bulkstate = smgr_bulk_start_rel(wstate->index, MAIN_FORKNUM);
 
	deduplicate = wstate->inskey->allequalimage && !btspool->isunique &&
		BTGetDeduplicateItems(wstate->index);
+	/*
+	 * unique_dead_ignored does not guarantee the absence of multiple live
+	 * tuples with the same key values in the spool: live tuples can be
+	 * separated by dead ones, as in the pattern "addda" (a=alive, d=dead).
+	 */
+	fail_on_alive_duplicate = btspool->unique_dead_ignored;
 
-	if (merge)
+	if (fail_on_alive_duplicate)
+	{
+		bool		seen_alive = false,
+					prev_tested = false;
+		IndexTuple	prev = NULL;
+		TupleTableSlot *slot = MakeSingleTupleTableSlot(RelationGetDescr(wstate->heap),
+														&TTSOpsBufferHeapTuple);
+		IndexFetchTableData *fetch = table_index_fetch_begin(wstate->heap, SO_NONE);
+
+		Assert(btspool->isunique);
+		Assert(!btspool2);
+
+		while ((itup = tuplesort_getindextuple(btspool->sortstate, true)) != NULL)
+		{
+			bool		tuples_equal = false;
+
+			/* When we see first tuple, create first index page */
+			if (state == NULL)
+				state = _bt_pagestate(wstate, 0);
+
+			if (prev != NULL)	/* not the first tuple */
+			{
+				bool		has_nulls = false,
+							call_again = false,
+							ignored = false,
+							now_alive;
+				ItemPointerData tid;
+
+				/* is this tuple equal to the previous one? */
+				if (wstate->inskey->allequalimage)
+					tuples_equal = _bt_keep_natts_fast(wstate->index, prev, itup, &has_nulls) > keysz;
+				else
+					tuples_equal = _bt_keep_natts(wstate->index, prev, itup, wstate->inskey, &has_nulls) > keysz;
+
+				/* NULL key values compare as distinct unless nulls_not_distinct */
+				if (has_nulls && !btspool->nulls_not_distinct)
+					tuples_equal = false;
+
+				if (tuples_equal)
+				{
+					/* check the previous tuple's liveness, if not done yet */
+					if (!prev_tested)
+					{
+						call_again = false;
+						tid = prev->t_tid;
+						seen_alive = table_index_fetch_tuple(fetch, &tid, SnapshotSelf, slot, &call_again, &ignored);
+						prev_tested = true;
+					}
+
+					call_again = false;
+					tid = itup->t_tid;
+					now_alive = table_index_fetch_tuple(fetch, &tid, SnapshotSelf, slot, &call_again, &ignored);
+
+					/* have we seen multiple live tuples in this group of equal keys? */
+					if (seen_alive && now_alive)
+					{
+						char	   *key_desc;
+						TupleDesc	tupDes = RelationGetDescr(wstate->index);
+						bool		isnull[INDEX_MAX_KEYS];
+						Datum		values[INDEX_MAX_KEYS];
+
+						index_deform_tuple(itup, tupDes, values, isnull);
+
+						key_desc = BuildIndexValueDescription(wstate->index, values, isnull);
+
+						/* keep this message in sync with the copy in comparetup_index_btree_tiebreak */
+						ereport(ERROR,
+								(errcode(ERRCODE_UNIQUE_VIOLATION),
+								 errmsg("could not create unique index \"%s\"",
+										RelationGetRelationName(wstate->index)),
+								 key_desc ?
+								 errdetail("Key %s is duplicated.", key_desc) :
+								 errdetail("Duplicate keys exist."),
+								 errtableconstraint(wstate->heap,
+													RelationGetRelationName(wstate->index))));
+					}
+					seen_alive |= now_alive;
+				}
+			}
+
+			if (!tuples_equal)
+			{
+				seen_alive = false;
+				prev_tested = false;
+			}
+
+			_bt_buildadd(wstate, state, itup, 0);
+			if (prev)
+				pfree(prev);
+			prev = CopyIndexTuple(itup);
+
+			/* Report progress */
+			pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
+										 ++tuples_done);
+		}
+		if (prev)
+			pfree(prev);
+		ExecDropSingleTupleTableSlot(slot);
+		table_index_fetch_end(fetch);
+		Assert(!TransactionIdIsValid(MyProc->xmin));
+	}
+	else if (merge)
	{
		/*
		 * Another BTSpool for dead tuples exists. Now we have to merge
@@ -1322,7 +1437,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
											InvalidOffsetNumber);
				}
				else if (_bt_keep_natts_fast(wstate->index, dstate->base,
-											 itup) > keysz &&
+											 itup, NULL) > keysz &&
						 _bt_dedup_save_htid(dstate, itup))
				{
					/*
@@ -1438,15 +1553,13 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request)
	scantuplesortstates = leaderparticipates ? request + 1 : request;
 
	/*
-	 * For concurrent non-unique index builds, we can periodically reset
-	 * snapshots to allow the xmin horizon to advance. This is safe since
-	 * these builds don't require a consistent view across the entire scan.
-	 * Unique indexes still need a stable snapshot to properly enforce
-	 * uniqueness constraints. Isolation modes where
+	 * For concurrent index builds, we can periodically reset snapshots to
+	 * allow the xmin horizon to advance; uniqueness of live tuples is
+	 * enforced separately, in _bt_load. Isolation modes where
	 * IsolationUsesXactSnapshot() is true also prevent resetting because
	 * they keep a registered transaction snapshot for the whole transaction.
	 */
-	reset_snapshot = isconcurrent && !IsolationUsesXactSnapshot() && !btspool->isunique;
+	reset_snapshot = isconcurrent && !IsolationUsesXactSnapshot();
 
	/*
	 * Prepare for scan of the base relation. In a normal index build, we use
@@ -1454,7 +1567,7 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request)
	 * qual checks (because we have to index RECENTLY_DEAD tuples). In a
	 * concurrent build, we take a regular MVCC snapshot and index whatever's
	 * live according to that, while that snapshot may be reset periodically
-	 * for non-unique indexes in non-xact-snapshot isolation modes.
+	 * in non-xact-snapshot isolation modes.
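+	 * (The reset behavior is requested via SO_RESET_SNAPSHOT; see
+	 * table_index_build_scan in tableam.h.)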
*/ if (!isconcurrent) { @@ -1483,10 +1596,10 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) shm_toc_estimate_chunk(&pcxt->estimator, estsort); /* - * Unique case requires a second spool, and so we may have to account for - * another shared workspace for that -- PARALLEL_KEY_TUPLESORT_SPOOL2 + * Non-concurrent unique case requires a second spool, and so we may have + * to account for another shared workspace -- PARALLEL_KEY_TUPLESORT_SPOOL2 */ - if (!btspool->isunique) + if (!btspool->isunique || reset_snapshot) shm_toc_estimate_keys(&pcxt->estimator, 2); else { @@ -1541,6 +1654,7 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) btshared->indexrelid = RelationGetRelid(btspool->index); btshared->isunique = btspool->isunique; btshared->nulls_not_distinct = btspool->nulls_not_distinct; + btshared->unique_dead_ignored = btspool->unique_dead_ignored; btshared->isconcurrent = isconcurrent; btshared->scantuplesortstates = scantuplesortstates; btshared->queryid = pgstat_get_my_query_id(); @@ -1568,8 +1682,8 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) shm_toc_insert(pcxt->toc, PARALLEL_KEY_BTREE_SHARED, btshared); shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort); - /* Unique case requires a second spool, and associated shared state */ - if (!btspool->isunique) + /* Non-concurrent unique case requires a second spool and shared state */ + if (!btspool->isunique || reset_snapshot) sharedsort2 = NULL; else { @@ -1752,9 +1866,10 @@ _bt_leader_participate_as_worker(BTBuildState *buildstate) leaderworker->index = buildstate->spool->index; leaderworker->isunique = buildstate->spool->isunique; leaderworker->nulls_not_distinct = buildstate->spool->nulls_not_distinct; + leaderworker->unique_dead_ignored = buildstate->spool->unique_dead_ignored; /* Initialize second spool, if required */ - if (!btleader->btshared->isunique) + if (!btleader->btshared->isunique || (btleader->btshared->isconcurrent && !IsolationUsesXactSnapshot())) leaderworker2 = NULL; else { @@ -1855,11 +1970,12 @@ _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc) btspool->index = indexRel; btspool->isunique = btshared->isunique; btspool->nulls_not_distinct = btshared->nulls_not_distinct; + btspool->unique_dead_ignored = btshared->unique_dead_ignored; /* Look up shared state private to tuplesort.c */ sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false); tuplesort_attach_shared(sharedsort, seg); - if (!btshared->isunique) + if (!btshared->isunique || (btshared->isconcurrent && !IsolationUsesXactSnapshot())) { btspool2 = NULL; sharedsort2 = NULL; @@ -1939,6 +2055,7 @@ _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, btspool->index, btspool->isunique, btspool->nulls_not_distinct, + btspool->unique_dead_ignored, sortmem, coordinate, TUPLESORT_NONE); @@ -1961,14 +2078,12 @@ _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, coordinate2->nParticipants = -1; coordinate2->sharedsort = sharedsort2; btspool2->sortstate = - tuplesort_begin_index_btree(btspool->heap, btspool->index, false, false, + tuplesort_begin_index_btree(btspool->heap, btspool->index, false, false, false, Min(sortmem, work_mem), coordinate2, false); } /* Fill in buildstate for _bt_build_callback() */ - buildstate.isunique = btshared->isunique; - buildstate.nulls_not_distinct = btshared->nulls_not_distinct; buildstate.havedead = false; buildstate.heap = btspool->heap; buildstate.spool = btspool; diff --git 
a/src/backend/access/nbtree/nbtsplitloc.c b/src/backend/access/nbtree/nbtsplitloc.c index de9eca3c8b2..b174fb8d3ee 100644 --- a/src/backend/access/nbtree/nbtsplitloc.c +++ b/src/backend/access/nbtree/nbtsplitloc.c @@ -688,7 +688,7 @@ _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff, { itemid = PageGetItemId(state->origpage, maxoff); tup = (IndexTuple) PageGetItem(state->origpage, itemid); - keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem); + keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem, NULL); if (keepnatts > 1 && keepnatts <= nkeyatts) { @@ -719,7 +719,7 @@ _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff, !_bt_adjacenthtid(&tup->t_tid, &state->newitem->t_tid)) return false; /* Check same conditions as rightmost item case, too */ - keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem); + keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem, NULL); if (keepnatts > 1 && keepnatts <= nkeyatts) { @@ -968,7 +968,7 @@ _bt_strategy(FindSplitData *state, SplitPoint *leftpage, * avoid appending a heap TID in new high key, we're done. Finish split * with default strategy and initial split interval. */ - perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost); + perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost, NULL); if (perfectpenalty <= indnkeyatts) return perfectpenalty; @@ -989,7 +989,7 @@ _bt_strategy(FindSplitData *state, SplitPoint *leftpage, * If page is entirely full of duplicates, a single value strategy split * will be performed. */ - perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost); + perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost, NULL); if (perfectpenalty <= indnkeyatts) { *strategy = SPLIT_MANY_DUPLICATES; @@ -1028,7 +1028,7 @@ _bt_strategy(FindSplitData *state, SplitPoint *leftpage, itemid = PageGetItemId(state->origpage, P_HIKEY); hikey = (IndexTuple) PageGetItem(state->origpage, itemid); perfectpenalty = _bt_keep_natts_fast(state->rel, hikey, - state->newitem); + state->newitem, NULL); if (perfectpenalty <= indnkeyatts) *strategy = SPLIT_SINGLE_VALUE; else @@ -1150,7 +1150,7 @@ _bt_split_penalty(FindSplitData *state, SplitPoint *split) lastleft = _bt_split_lastleft(state, split); firstright = _bt_split_firstright(state, split); - return _bt_keep_natts_fast(state->rel, lastleft, firstright); + return _bt_keep_natts_fast(state->rel, lastleft, firstright, NULL); } /* diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index 014faa1622f..b69bb61ce42 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -32,9 +32,6 @@ static int _bt_compare_int(const void *va, const void *vb); -static int _bt_keep_natts(Relation rel, IndexTuple lastleft, - IndexTuple firstright, BTScanInsert itup_key); - /* * _bt_mkscankey @@ -707,7 +704,7 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, Assert(!BTreeTupleIsPivot(lastleft) && !BTreeTupleIsPivot(firstright)); /* Determine how many attributes must be kept in truncated tuple */ - keepnatts = _bt_keep_natts(rel, lastleft, firstright, itup_key); + keepnatts = _bt_keep_natts(rel, lastleft, firstright, itup_key, NULL); #ifdef DEBUG_NO_TRUNCATE /* Force truncation to be ineffective for testing purposes */ @@ -825,17 +822,24 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, /* * _bt_keep_natts - how many key attributes to keep when truncating. 
 *
+ * This is exported for use as a comparison function during concurrent
+ * unique index builds, in case _bt_keep_natts_fast is not suitable because
+ * the collation is not "allequalimage"/deduplication-safe.
+ *
 * Caller provides two tuples that enclose a split point. Caller's insertion
 * scankey is used to compare the tuples; the scankey's argument values are
 * not considered here.
 *
+ * *hasnulls is set to true if any key column is null in either tuple.
+ *
 * This can return a number of attributes that is one greater than the
 * number of key attributes for the index relation. This indicates that the
 * caller must use a heap TID as a unique-ifier in new pivot tuple.
 */
-static int
+int
 _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright,
-			   BTScanInsert itup_key)
+			   BTScanInsert itup_key,
+			   bool *hasnulls)
 {
	int			nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
	TupleDesc	itupdesc = RelationGetDescr(rel);
@@ -861,6 +865,8 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright,
		datum1 = index_getattr(lastleft, attnum, itupdesc, &isNull1);
		datum2 = index_getattr(firstright, attnum, itupdesc, &isNull2);
 
+		if (hasnulls)
+			*hasnulls |= isNull1 | isNull2;
		if (isNull1 != isNull2)
			break;
 
@@ -880,7 +886,7 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright,
	 * expected in an allequalimage index.
	 */
	Assert(!itup_key->allequalimage ||
-		   keepnatts == _bt_keep_natts_fast(rel, lastleft, firstright));
+		   keepnatts == _bt_keep_natts_fast(rel, lastleft, firstright, NULL));
 
	return keepnatts;
 }
@@ -891,7 +897,8 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright,
 * This is exported so that a candidate split point can have its effect on
 * suffix truncation inexpensively evaluated ahead of time when finding a
 * split location. A naive bitwise approach to datum comparisons is used to
- * save cycles.
+ * save cycles. It may also be used as a comparison function during
+ * concurrent builds of unique indexes.
 *
 * The approach taken here usually provides the same answer as _bt_keep_natts
 * will (for the same pair of tuples from a heapkeyspace index), since the
 * "equal image" columns, routine is guaranteed to give the same result as
 * _bt_keep_natts would.
 *
+ * *hasnulls is set to true if any key column is null in either tuple.
+ *
 * Callers can rely on the fact that attributes considered equal here are
 * definitely also equal according to _bt_keep_natts, even when the index uses
 * an opclass or collation that is not "allequalimage"/deduplication-safe.
@@ -908,7 +917,8 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright,
 * more balanced split point.
 */
 int
-_bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright)
+_bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright,
+					bool *hasnulls)
 {
	TupleDesc	itupdesc = RelationGetDescr(rel);
	int			keysz = IndexRelationGetNumberOfKeyAttributes(rel);
@@ -925,6 +935,8 @@ _bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright)
		datum1 = index_getattr(lastleft, attnum, itupdesc, &isNull1);
		datum2 = index_getattr(firstright, attnum, itupdesc, &isNull2);
 
+		if (hasnulls)
+			*hasnulls |= isNull1 | isNull2;
		att = TupleDescCompactAttr(itupdesc, attnum - 1);
 
		if (isNull1 != isNull2)
diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c
index ce017e836be..fa073a33b65 100644
--- a/src/backend/catalog/index.c
+++ b/src/backend/catalog/index.c
@@ -3344,9 +3344,9 @@ IndexCheckExclusion(Relation heapRelation,
 * if we used HeapTupleSatisfiesVacuum). This leaves us with an index that
 * does not contain any tuples added to the table while we built the index.
 *
- * Furthermore, in case of non-unique index we set SO_RESET_SNAPSHOT for the
- * scan, which causes new snapshot to be set as active every so often. The reason
- * for that is to propagate the xmin horizon forward.
+ * Furthermore, we set SO_RESET_SNAPSHOT for the scan, which causes a new
+ * snapshot to be set as active every so often. The reason for that is to
+ * propagate the xmin horizon forward.
 *
 * Next, we mark the index "indisready" (but still not "indisvalid") and
 * commit the second transaction and start a third. Again we wait for all
diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c
index be3dc5e8d28..9fff8635d3d 100644
--- a/src/backend/commands/indexcmds.c
+++ b/src/backend/commands/indexcmds.c
@@ -1704,8 +1704,8 @@ DefineIndex(ParseState *pstate,
	 * chains can be created where the new tuple and the old tuple in the
	 * chain have different index keys.
	 *
-	 * We build the index using all tuples that are visible using single or
-	 * multiple refreshing snapshots. We can be sure that any HOT updates to
+	 * We build the index using all tuples that are visible under periodically
+	 * refreshed snapshots. We can be sure that any HOT updates to
	 * these tuples will be compatible with the index, since any updates made
	 * by transactions that didn't know about the index are now committed or
	 * rolled back. Thus, each visible tuple is either the end of its
diff --git a/src/backend/utils/sort/tuplesortvariants.c b/src/backend/utils/sort/tuplesortvariants.c
index 2509ac3e3a4..1a2a8b64aa7 100644
--- a/src/backend/utils/sort/tuplesortvariants.c
+++ b/src/backend/utils/sort/tuplesortvariants.c
@@ -25,6 +25,8 @@
 #include "access/hash.h"
 #include "access/htup_details.h"
 #include "access/nbtree.h"
+#include "access/relscan.h"
+#include "access/tableam.h"
 #include "catalog/index.h"
 #include "catalog/pg_collation.h"
 #include "executor/executor.h"
@@ -35,6 +37,7 @@
 #include "utils/lsyscache.h"
 #include "utils/rel.h"
 #include "utils/tuplesort.h"
+#include "storage/proc.h"
 
 
 /* sort-type codes for sort__start probes */
@@ -136,6 +139,7 @@ typedef struct
	bool		enforceUnique;	/* complain if we find duplicate tuples */
	bool		uniqueNullsNotDistinct; /* unique constraint null treatment */
+	bool		uniqueDeadIgnored;	/* ignore dead tuples in unique check */
 } TuplesortIndexBTreeArg;
 
 /*
@@ -361,6 +365,7 @@ tuplesort_begin_index_btree(Relation heapRel,
							Relation indexRel,
							bool enforceUnique,
							bool uniqueNullsNotDistinct,
+							bool uniqueDeadIgnored,
							int workMem, SortCoordinate coordinate,
							int sortopt)
@@ -403,6 +408,7 @@ tuplesort_begin_index_btree(Relation heapRel,
	arg->index.indexRel = indexRel;
	arg->enforceUnique = enforceUnique;
	arg->uniqueNullsNotDistinct = uniqueNullsNotDistinct;
+	arg->uniqueDeadIgnored = uniqueDeadIgnored;
 
	indexScanKey = _bt_mkscankey(indexRel, NULL);
 
@@ -1670,6 +1676,7 @@ comparetup_index_btree_tiebreak(const SortTuple *a, const SortTuple *b,
	Datum		values[INDEX_MAX_KEYS];
	bool		isnull[INDEX_MAX_KEYS];
	char	   *key_desc;
+	bool		uniqueCheckFail = true;
 
	/*
	 * Some rather brain-dead implementations of qsort (such as the one in
@@ -1679,18 +1686,67 @@ comparetup_index_btree_tiebreak(const SortTuple *a, const SortTuple *b,
	 */
	Assert(tuple1 != tuple2);
 
-	index_deform_tuple(tuple1, tupDes, values, isnull);
-
-	key_desc = BuildIndexValueDescription(arg->index.indexRel, values, isnull);
-
-	ereport(ERROR,
-			(errcode(ERRCODE_UNIQUE_VIOLATION),
-			 errmsg("could not create unique index \"%s\"",
-					RelationGetRelationName(arg->index.indexRel)),
-			 key_desc ? errdetail("Key %s is duplicated.", key_desc) :
-			 errdetail("Duplicate keys exist."),
-			 errtableconstraint(arg->index.heapRel,
-								RelationGetRelationName(arg->index.indexRel))));
+	/* This is a fail-fast check; see _bt_load for details. */
+	if (arg->uniqueDeadIgnored)
+	{
+		bool		any_tuple_dead,
+					call_again = false,
+					ignored;
+		/*
+		 * Keep the slot/fetch lifetime local to this duplicate check so the
+		 * xmin propagation assertion below continues to validate cleanup.
+		 *
+		 * XXX: This fail-fast check performs heap fetches from the sort
+		 * comparator. That can be expensive when many equal keys are
+		 * compared. Revisit whether this optimization is worth keeping, or
+		 * whether it should be replaced with a bounded/probabilistic check
+		 * or deferred entirely to _bt_load().
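+		 * A miss here only delays detection, though: _bt_load still
+		 * performs the authoritative duplicate check among live tuples.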
+ */ + TupleTableSlot *slot = MakeSingleTupleTableSlot(RelationGetDescr(arg->index.heapRel), + &TTSOpsBufferHeapTuple); + ItemPointerData tid = tuple1->t_tid; + + IndexFetchTableData *fetch = table_index_fetch_begin(arg->index.heapRel, SO_NONE); + any_tuple_dead = !table_index_fetch_tuple(fetch, &tid, SnapshotSelf, slot, &call_again, &ignored); + + if (!any_tuple_dead) + { + call_again = false; + tid = tuple2->t_tid; + any_tuple_dead = !table_index_fetch_tuple(fetch, &tid, SnapshotSelf, slot, &call_again, + &ignored); + } + + if (any_tuple_dead) + { + elog(DEBUG5, "skipping duplicate values because some of them are dead: (%u,%u) vs (%u,%u)", + ItemPointerGetBlockNumber(&tuple1->t_tid), + ItemPointerGetOffsetNumber(&tuple1->t_tid), + ItemPointerGetBlockNumber(&tuple2->t_tid), + ItemPointerGetOffsetNumber(&tuple2->t_tid)); + + uniqueCheckFail = false; + } + ExecDropSingleTupleTableSlot(slot); + table_index_fetch_end(fetch); + Assert(!TransactionIdIsValid(MyProc->xmin)); + } + if (uniqueCheckFail) + { + index_deform_tuple(tuple1, tupDes, values, isnull); + + key_desc = BuildIndexValueDescription(arg->index.indexRel, values, isnull); + + /* keep this error message in sync with the same in _bt_load */ + ereport(ERROR, + (errcode(ERRCODE_UNIQUE_VIOLATION), + errmsg("could not create unique index \"%s\"", + RelationGetRelationName(arg->index.indexRel)), + key_desc ? errdetail("Key %s is duplicated.", key_desc) : + errdetail("Duplicate keys exist."), + errtableconstraint(arg->index.heapRel, + RelationGetRelationName(arg->index.indexRel)))); + } } /* diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 3097e9bb1af..1266c8eecdf 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -1307,8 +1307,10 @@ extern bool btproperty(Oid index_oid, int attno, extern char *btbuildphasename(int64 phasenum); extern IndexTuple _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, BTScanInsert itup_key); +extern int _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, + BTScanInsert itup_key, bool *hasnulls); extern int _bt_keep_natts_fast(Relation rel, IndexTuple lastleft, - IndexTuple firstright); + IndexTuple firstright, bool *hasnulls); extern bool _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum); extern void _bt_check_third_page(Relation rel, Relation heap, diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 02c31caed2b..7e3e3242a16 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -1853,9 +1853,10 @@ table_scan_analyze_next_tuple(TableScanDesc scan, * This only really makes sense for heap AM, it might need to be generalized * for other AMs later. * - * In case of non-unique concurrent build, - * concurrent_index_reset_snapshot_every_n_pages is applied for the scan. - * That leads to changing snapshots on the fly to allow xmin horizon propagate. + * In case of concurrent index build, + * concurrent_index_reset_snapshot_every_n_pages is applied for the + * scan. That leads to changing snapshots on the fly to allow xmin + * horizon propagate. 
*/ static inline double table_index_build_scan(Relation table_rel, diff --git a/src/include/catalog/index.h b/src/include/catalog/index.h index 60f8356ed82..ed000bf10d5 100644 --- a/src/include/catalog/index.h +++ b/src/include/catalog/index.h @@ -32,13 +32,11 @@ typedef struct AttrMap AttrMap; * * Snapshot resetting is only applicable when all of: * - the build is concurrent (ii_Concurrent) - * - the index is non-unique (unique needs consistent snapshot) * - isolation level is not REPEATABLE READ/SERIALIZABLE (those keep a * registered transaction snapshot) */ #define IndexBuildResetsSnapshots(indexInfo) \ ((indexInfo)->ii_Concurrent && \ - !(indexInfo)->ii_Unique && \ !IsolationUsesXactSnapshot()) /* Action code for index_set_state_flags */ diff --git a/src/include/utils/tuplesort.h b/src/include/utils/tuplesort.h index da68f45acf2..448dc83aa58 100644 --- a/src/include/utils/tuplesort.h +++ b/src/include/utils/tuplesort.h @@ -396,6 +396,7 @@ extern Tuplesortstate *tuplesort_begin_index_btree(Relation heapRel, Relation indexRel, bool enforceUnique, bool uniqueNullsNotDistinct, + bool uniqueDeadIgnored, int workMem, SortCoordinate coordinate, int sortopt); extern Tuplesortstate *tuplesort_begin_index_hash(Relation heapRel, diff --git a/src/test/modules/injection_points/expected/cic_reset_snapshots.out b/src/test/modules/injection_points/expected/cic_reset_snapshots.out index e5a8a7f3a79..7835a796b88 100644 --- a/src/test/modules/injection_points/expected/cic_reset_snapshots.out +++ b/src/test/modules/injection_points/expected/cic_reset_snapshots.out @@ -41,7 +41,11 @@ END; $$; ---------------- ALTER TABLE cic_reset_snap.tbl SET (parallel_workers=0); CREATE UNIQUE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots @@ -86,7 +90,9 @@ SELECT injection_points_detach('heap_reset_scan_snapshot_effective'); (1 row) CREATE UNIQUE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +NOTICE: notice triggered for injection point table_parallelscan_initialize REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_parallelscan_initialize DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); NOTICE: notice triggered for injection point table_parallelscan_initialize -- 2.43.0
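For reviewers who prefer the invariant in isolation, here is a minimal
standalone sketch of the group-wise live-duplicate detection that _bt_load
implements above. This is not patch or PostgreSQL code: the SortedTuple
struct, the alive flag, and the int key are simplified stand-ins for the
index tuple stream and the SnapshotSelf visibility test.

/*
 * Tuples arrive sorted by key; dead tuples are ignored, and an error is
 * raised only when two *live* tuples share a key.
 */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct
{
	int		key;			/* stands in for the index key columns */
	bool	alive;			/* stands in for a SnapshotSelf visibility test */
} SortedTuple;

/* Returns false (reporting the key) if two live duplicates are found. */
static bool
check_live_duplicates(const SortedTuple *tuples, int ntuples)
{
	bool	seen_alive = false;

	for (int i = 0; i < ntuples; i++)
	{
		/* a key change closes the current group of equal keys */
		if (i == 0 || tuples[i].key != tuples[i - 1].key)
			seen_alive = false;

		if (tuples[i].alive)
		{
			if (seen_alive)
			{
				fprintf(stderr, "duplicate live key: %d\n", tuples[i].key);
				return false;
			}
			seen_alive = true;
		}
	}
	return true;
}

int
main(void)
{
	/* the "addda" pattern: alive, dead, dead, dead, alive -- same key */
	SortedTuple addda[] = {
		{42, true}, {42, false}, {42, false}, {42, false}, {42, true},
	};

	/*
	 * A pairwise check of neighbors (like the tuplesort fail-fast path)
	 * would pass every adjacent pair here, since each pair contains a
	 * dead tuple; the group-wise scan still reports the violation.
	 */
	return check_live_duplicates(addda, 5) ? EXIT_SUCCESS : EXIT_FAILURE;
}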