From a57c5b4548d9d9f2e4ac4d1eee3f3f0b648255d2 Mon Sep 17 00:00:00 2001
From: Mikhail Nikalayeu
Date: Mon, 21 Apr 2025 14:18:32 +0200
Subject: [PATCH v22 11/12] Refresh snapshot periodically during index
 validation

Enhances validation phase of concurrently built indexes by periodically
refreshing snapshots rather than using a single reference snapshot.
This addresses issues with xmin propagation during long-running
validations.

The validation now takes a fresh snapshot every few pages, allowing the
xmin horizon to advance.

This restores the feature of commit d9d076222f5b, which was reverted in
commit e28bb8851969. The new STIR-based approach does not depend on a
single reference snapshot anymore.
---
 doc/src/sgml/ref/create_index.sgml       | 11 +++-
 doc/src/sgml/ref/reindex.sgml            | 11 ++--
 src/backend/access/heap/README.HOT       |  4 +-
 src/backend/access/heap/heapam_handler.c | 73 +++++++++++++++++++++---
 src/backend/access/nbtree/nbtsort.c      |  2 +-
 src/backend/access/spgist/spgvacuum.c    | 12 +++-
 src/backend/catalog/index.c              | 42 ++++++++++----
 src/backend/commands/indexcmds.c         | 50 ++--------------
 src/include/access/tableam.h             |  7 +--
 src/include/access/transam.h             | 15 +++++
 src/include/catalog/index.h              |  2 +-
 11 files changed, 146 insertions(+), 83 deletions(-)

diff --git a/doc/src/sgml/ref/create_index.sgml b/doc/src/sgml/ref/create_index.sgml
index cf14f474946..1626cee7a03 100644
--- a/doc/src/sgml/ref/create_index.sgml
+++ b/doc/src/sgml/ref/create_index.sgml
@@ -881,9 +881,14 @@ Indexes:
-    Like any long-running transaction, CREATE INDEX on a
-    table can affect which tuples can be removed by concurrent
-    VACUUM on any other table.
+    Due to the improved implementation using periodically refreshed snapshots and
+    auxiliary indexes, concurrent index builds have minimal impact on concurrent
+    VACUUM operations. 
The system automatically advances its
+    internal transaction horizon during the build process, allowing
+    VACUUM to remove dead tuples on other tables without
+    having to wait for the entire index build to complete. Only during very brief
+    periods when snapshots are being refreshed might there be any temporary effect
+    on concurrent VACUUM operations.
diff --git a/doc/src/sgml/ref/reindex.sgml b/doc/src/sgml/ref/reindex.sgml
index d62791ff9c3..60f4d0d680f 100644
--- a/doc/src/sgml/ref/reindex.sgml
+++ b/doc/src/sgml/ref/reindex.sgml
@@ -502,10 +502,13 @@ Indexes:
-    Like any long-running transaction, REINDEX on a table
-    can affect which tuples can be removed by concurrent
-    VACUUM on any other table.
-
+    REINDEX CONCURRENTLY has minimal
+    impact on which tuples can be removed by concurrent VACUUM
+    operations on other tables. This is achieved through periodic snapshot
+    refreshes and the use of auxiliary indexes during the rebuild process,
+    allowing the system to advance its transaction horizon regularly rather than
+    maintaining a single long-running snapshot.
+
     REINDEX SYSTEM does not support
diff --git a/src/backend/access/heap/README.HOT b/src/backend/access/heap/README.HOT
index 6f718feb6d5..d41609c97cd 100644
--- a/src/backend/access/heap/README.HOT
+++ b/src/backend/access/heap/README.HOT
@@ -401,12 +401,12 @@ use the key value from the live tuple.
 
 We mark the index open for inserts (but still not ready for reads) then
 we again wait for transactions which have the table open.  Then validate
 the index.  This searches for tuples missing from the index in auxiliary
-index, and inserts any missing ones if them visible to reference snapshot.
+index, and inserts any missing ones if they are visible to a fresh snapshot.
 Again, the index entries have to have TIDs equal to HOT-chain root TIDs,
 but the value to be inserted is the one from the live tuple.
 
 Then we wait until every transaction that could have a snapshot older than
-the second reference snapshot is finished. 
This ensures that nobody is +the latest used snapshot is finished. This ensures that nobody is alive any longer who could need to see any tuples that might be missing from the index, as well as ensuring that no one can see any inconsistent rows in a broken HOT chain (the first condition is stronger than the diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index f592b09ec68..236e216d170 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -2034,23 +2034,26 @@ heapam_index_validate_scan_read_stream_next( return result; } -static void +static TransactionId heapam_index_validate_scan(Relation heapRelation, Relation indexRelation, IndexInfo *indexInfo, - Snapshot snapshot, ValidateIndexState *state, ValidateIndexState *auxState) { + TransactionId limitXmin; + Datum values[INDEX_MAX_KEYS]; bool isnull[INDEX_MAX_KEYS]; + Snapshot snapshot; TupleTableSlot *slot; EState *estate; ExprContext *econtext; BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_BULKREAD); - int num_to_check; + int num_to_check, + page_read_counter = 1; /* set to 1 to skip snapshot reset at start */ Tuplestorestate *tuples_for_check; ValidateIndexScanState callback_private_data; @@ -2061,14 +2064,16 @@ heapam_index_validate_scan(Relation heapRelation, /* Use 10% of memory for tuple store. */ int store_work_mem_part = maintenance_work_mem / 10; - /* - * Encode TIDs as int8 values for the sort, rather than directly sorting - * item pointers. This can be significantly faster, primarily because TID - * is a pass-by-reference type on all platforms, whereas int8 is - * pass-by-value on most platforms. 
- */
+	PushActiveSnapshot(GetTransactionSnapshot());
+
 	tuples_for_check = tuplestore_begin_datum(INT8OID, false, false,
 											  store_work_mem_part);
+	PopActiveSnapshot();
+	InvalidateCatalogSnapshot();
+
+	Assert(!HaveRegisteredOrActiveSnapshot());
+	Assert(!TransactionIdIsValid(MyProc->xmin));
+
 	/*
 	 * sanity checks
 	 */
@@ -2084,6 +2089,29 @@ heapam_index_validate_scan(Relation heapRelation,
 
 	state->tuplesort = auxState->tuplesort = NULL;
 
+	/*
+	 * Now take the first snapshot that will be used to filter candidate
+	 * tuples.  We are going to replace it with a newer snapshot every so
+	 * often to propagate the horizon.
+	 *
+	 * Beware! There might still be snapshots in use that treat some transaction
+	 * as in-progress that our temporary snapshot treats as committed.
+	 *
+	 * If such a recently-committed transaction deleted tuples in the table,
+	 * we will not include them in the index; yet those transactions which
+	 * see the deleting one as still-in-progress will expect such tuples to
+	 * be there once we mark the index as valid.
+	 *
+	 * We solve this by waiting for all endangered transactions to exit before
+	 * we mark the index as valid; that is why limitXmin is returned.
+	 *
+	 * We also set ActiveSnapshot to this snap, since functions in indexes may
+	 * need a snapshot. 
*/
+	snapshot = RegisterSnapshot(GetLatestSnapshot());
+	PushActiveSnapshot(snapshot);
+	limitXmin = snapshot->xmin;
+
 	estate = CreateExecutorState();
 	econtext = GetPerTupleExprContext(estate);
 	slot = MakeSingleTupleTableSlot(RelationGetDescr(heapRelation),
@@ -2117,6 +2145,7 @@ heapam_index_validate_scan(Relation heapRelation,
 		LockBuffer(buf, BUFFER_LOCK_SHARE);
 
 		block_number = BufferGetBlockNumber(buf);
+		page_read_counter++;
 
 		i = 0;
 		while ((off = tuples[i]) != InvalidOffsetNumber)
@@ -2172,6 +2201,20 @@ heapam_index_validate_scan(Relation heapRelation,
 		}
 
 		ReleaseBuffer(buf);
+
+#define VALIDATE_INDEX_RESET_SNAPSHOT_EACH_N_PAGE 4096
+		if (page_read_counter % VALIDATE_INDEX_RESET_SNAPSHOT_EACH_N_PAGE == 0)
+		{
+			PopActiveSnapshot();
+			UnregisterSnapshot(snapshot);
+			/* to make sure we propagate xmin */
+			InvalidateCatalogSnapshot();
+			Assert(!TransactionIdIsValid(MyProc->xmin));
+
+			snapshot = RegisterSnapshot(GetLatestSnapshot());
+			PushActiveSnapshot(snapshot);
+			/* xmin should not go backwards, but just in case */
+			limitXmin = TransactionIdNewer(limitXmin, snapshot->xmin);
+		}
 	}
 
 	ExecDropSingleTupleTableSlot(slot);
@@ -2181,9 +2224,21 @@ heapam_index_validate_scan(Relation heapRelation,
 	read_stream_end(read_stream);
 	tuplestore_end(tuples_for_check);
 
+	/*
+	 * Drop the latest snapshot.  We must do this before waiting out other
+	 * snapshot holders, else we will deadlock against other processes also
+	 * doing CREATE INDEX CONCURRENTLY, which would see our snapshot as one
+	 * they must wait for. 
+ */ + PopActiveSnapshot(); + UnregisterSnapshot(snapshot); + InvalidateCatalogSnapshot(); + Assert(MyProc->xmin == InvalidTransactionId); /* These may have been pointing to the now-gone estate */ indexInfo->ii_ExpressionsState = NIL; indexInfo->ii_PredicateState = NULL; + + return limitXmin; } /* diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 08a3cb28348..250d9d59b9a 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -444,7 +444,7 @@ _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate, * dead tuples) won't get very full, so we give it only work_mem. * * In case of concurrent build dead tuples are not need to be put into index - * since we wait for all snapshots older than reference snapshot during the + * since we wait for all snapshots older than latest snapshot during the * validation phase. */ if (indexInfo->ii_Unique && !indexInfo->ii_Concurrent) diff --git a/src/backend/access/spgist/spgvacuum.c b/src/backend/access/spgist/spgvacuum.c index 2678f7ab782..968a8f7725c 100644 --- a/src/backend/access/spgist/spgvacuum.c +++ b/src/backend/access/spgist/spgvacuum.c @@ -191,14 +191,16 @@ vacuumLeafPage(spgBulkDeleteState *bds, Relation index, Buffer buffer, * Add target TID to pending list if the redirection could have * happened since VACUUM started. (If xid is invalid, assume it * must have happened before VACUUM started, since REINDEX - * CONCURRENTLY locks out VACUUM.) + * CONCURRENTLY locks out VACUUM, if myXmin is invalid it is + * validation scan.) * * Note: we could make a tighter test by seeing if the xid is * "running" according to the active snapshot; but snapmgr.c * doesn't currently export a suitable API, and it's not entirely * clear that a tighter test is worth the cycles anyway. 
*/
-			if (TransactionIdFollowsOrEquals(dt->xid, bds->myXmin))
+			if (!TransactionIdIsValid(bds->myXmin) ||
+				TransactionIdFollowsOrEquals(dt->xid, bds->myXmin))
 				spgAddPendingTID(bds, &dt->pointer);
 		}
 		else
@@ -808,7 +810,6 @@ spgvacuumscan(spgBulkDeleteState *bds)
 	/* Finish setting up spgBulkDeleteState */
 	initSpGistState(&bds->spgstate, index);
 	bds->pendingList = NULL;
-	bds->myXmin = GetActiveSnapshot()->xmin;
 	bds->lastFilledBlock = SPGIST_LAST_FIXED_BLKNO;
 
 	/*
@@ -959,6 +960,10 @@ spgbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 	bds.stats = stats;
 	bds.callback = callback;
 	bds.callback_state = callback_state;
+	if (info->validate_index)
+		bds.myXmin = InvalidTransactionId;
+	else
+		bds.myXmin = GetActiveSnapshot()->xmin;
 
 	spgvacuumscan(&bds);
 
@@ -999,6 +1004,7 @@ spgvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 	bds.stats = stats;
 	bds.callback = dummy_callback;
 	bds.callback_state = NULL;
+	bds.myXmin = GetActiveSnapshot()->xmin;
 
 	spgvacuumscan(&bds);
 }
diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c
index b37684309e3..e20d8a60357 100644
--- a/src/backend/catalog/index.c
+++ b/src/backend/catalog/index.c
@@ -3535,8 +3535,9 @@ IndexCheckExclusion(Relation heapRelation,
 * insert their new tuples into it. At the same moment we clear "indisready" for
 * auxiliary index, since it is no more required to be updated.
 *
- * We then take a new reference snapshot, any tuples that are valid according
- * to this snap, but are not in the index, must be added to the index.
+ * We then take a new snapshot, any tuples that are valid according
+ * to this snap, but are not in the index, must be added to the index. In
+ * order to propagate xmin we reset that snapshot every so often.
 * (Any tuples committed live after the snap will be inserted into the
 * index by their originating transaction. 
Any tuples committed dead before * the snap need not be indexed, because we will wait out all transactions @@ -3549,7 +3550,7 @@ IndexCheckExclusion(Relation heapRelation, * TIDs of both auxiliary and target indexes, and doing a "merge join" against * the TID lists to see which tuples from auxiliary index are missing from the * target index. Thus we will ensure that all tuples valid according to the - * reference snapshot are in the index. Notice we need to do bulkdelete in the + * latest snapshot are in the index. Notice we need to do bulkdelete in the * particular order: auxiliary first, target last. * * Building a unique index this way is tricky: we might try to insert a @@ -3570,13 +3571,14 @@ IndexCheckExclusion(Relation heapRelation, * * Also, some actions to concurrent drop the auxiliary index are performed. */ -void -validate_index(Oid heapId, Oid indexId, Oid auxIndexId, Snapshot snapshot) +TransactionId +validate_index(Oid heapId, Oid indexId, Oid auxIndexId) { Relation heapRelation, indexRelation, auxIndexRelation; IndexInfo *indexInfo; + TransactionId limitXmin; IndexVacuumInfo ivinfo, auxivinfo; ValidateIndexState state, auxState; Oid save_userid; @@ -3626,8 +3628,12 @@ validate_index(Oid heapId, Oid indexId, Oid auxIndexId, Snapshot snapshot) * Fetch info needed for index_insert. (You might think this should be * passed in from DefineIndex, but its copy is long gone due to having * been built in a previous transaction.) + * + * We might need snapshot for index expressions or predicates. 
*/ + PushActiveSnapshot(GetTransactionSnapshot()); indexInfo = BuildIndexInfo(indexRelation); + PopActiveSnapshot(); /* mark build is concurrent just for consistency */ indexInfo->ii_Concurrent = true; @@ -3663,6 +3669,9 @@ validate_index(Oid heapId, Oid indexId, Oid auxIndexId, Snapshot snapshot) NULL, TUPLESORT_NONE); auxState.htups = auxState.itups = auxState.tups_inserted = 0; + /* tuplesort_begin_datum may require catalog snapshot */ + InvalidateCatalogSnapshot(); + (void) index_bulk_delete(&auxivinfo, NULL, validate_index_callback, &auxState); /* If aux index is empty, merge may be skipped */ @@ -3697,6 +3706,9 @@ validate_index(Oid heapId, Oid indexId, Oid auxIndexId, Snapshot snapshot) NULL, TUPLESORT_NONE); state.htups = state.itups = state.tups_inserted = 0; + /* tuplesort_begin_datum may require catalog snapshot */ + InvalidateCatalogSnapshot(); + /* ambulkdelete updates progress metrics */ (void) index_bulk_delete(&ivinfo, NULL, validate_index_callback, &state); @@ -3716,19 +3728,24 @@ validate_index(Oid heapId, Oid indexId, Oid auxIndexId, Snapshot snapshot) pgstat_progress_update_multi_param(3, progress_index, progress_vals); } tuplesort_performsort(state.tuplesort); + /* tuplesort_performsort may require catalog snapshot */ + InvalidateCatalogSnapshot(); + tuplesort_performsort(auxState.tuplesort); + /* tuplesort_performsort may require catalog snapshot */ + InvalidateCatalogSnapshot(); + Assert(!TransactionIdIsValid(MyProc->xmin)); /* * Now merge both indexes */ pgstat_progress_update_param(PROGRESS_CREATEIDX_PHASE, PROGRESS_CREATEIDX_PHASE_VALIDATE_IDXMERGE); - table_index_validate_scan(heapRelation, - indexRelation, - indexInfo, - snapshot, - &state, - &auxState); + limitXmin = table_index_validate_scan(heapRelation, + indexRelation, + indexInfo, + &state, + &auxState); /* Tuple sort closed by table_index_validate_scan */ Assert(state.tuplesort == NULL && auxState.tuplesort == NULL); @@ -3751,6 +3768,9 @@ validate_index(Oid heapId, Oid indexId, 
Oid auxIndexId, Snapshot snapshot) index_close(auxIndexRelation, NoLock); index_close(indexRelation, NoLock); table_close(heapRelation, NoLock); + + Assert(!TransactionIdIsValid(MyProc->xmin)); + return limitXmin; } /* diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index e5259d1d82e..e6260f7011e 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -592,7 +592,6 @@ DefineIndex(Oid tableId, LockRelId heaprelid; LOCKTAG heaplocktag; LOCKMODE lockmode; - Snapshot snapshot; Oid root_save_userid; int root_save_sec_context; int root_save_nestlevel; @@ -1794,32 +1793,11 @@ DefineIndex(Oid tableId, /* Tell concurrent index builds to ignore us, if index qualifies */ if (safe_index) set_indexsafe_procflags(); - - /* - * Now take the "reference snapshot" that will be used by validate_index() - * to filter candidate tuples. Beware! There might still be snapshots in - * use that treat some transaction as in-progress that our reference - * snapshot treats as committed. If such a recently-committed transaction - * deleted tuples in the table, we will not include them in the index; yet - * those transactions which see the deleting one as still-in-progress will - * expect such tuples to be there once we mark the index as valid. - * - * We solve this by waiting for all endangered transactions to exit before - * we mark the index as valid. - * - * We also set ActiveSnapshot to this snap, since functions in indexes may - * need a snapshot. - */ - snapshot = RegisterSnapshot(GetTransactionSnapshot()); - PushActiveSnapshot(snapshot); /* * Merge content of auxiliary and target indexes - insert any missing index entries. 
*/ - validate_index(tableId, indexRelationId, auxIndexRelationId, snapshot); - limitXmin = snapshot->xmin; + limitXmin = validate_index(tableId, indexRelationId, auxIndexRelationId); - PopActiveSnapshot(); - UnregisterSnapshot(snapshot); /* * The snapshot subsystem could still contain registered snapshots that * are holding back our process's advertised xmin; in particular, if @@ -1841,8 +1819,8 @@ DefineIndex(Oid tableId, /* * The index is now valid in the sense that it contains all currently * interesting tuples. But since it might not contain tuples deleted just - * before the reference snap was taken, we have to wait out any - * transactions that might have older snapshots. + * before the last snapshot during validating was taken, we have to wait + * out any transactions that might have older snapshots. */ INJECTION_POINT("define_index_before_set_valid", NULL); pgstat_progress_update_param(PROGRESS_CREATEIDX_PHASE, @@ -4382,7 +4360,6 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein { ReindexIndexInfo *newidx = lfirst(lc); TransactionId limitXmin; - Snapshot snapshot; StartTransactionCommand(); @@ -4397,13 +4374,6 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein if (newidx->safe) set_indexsafe_procflags(); - /* - * Take the "reference snapshot" that will be used by validate_index() - * to filter candidate tuples. - */ - snapshot = RegisterSnapshot(GetTransactionSnapshot()); - PushActiveSnapshot(snapshot); - /* * Update progress for the index to build, with the correct parent * table involved. 
@@ -4415,16 +4385,8 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein progress_vals[3] = newidx->amId; pgstat_progress_update_multi_param(4, progress_index, progress_vals); - validate_index(newidx->tableId, newidx->indexId, newidx->auxIndexId, snapshot); - - /* - * We can now do away with our active snapshot, we still need to save - * the xmin limit to wait for older snapshots. - */ - limitXmin = snapshot->xmin; - - PopActiveSnapshot(); - UnregisterSnapshot(snapshot); + limitXmin = validate_index(newidx->tableId, newidx->indexId, newidx->auxIndexId); + Assert(!TransactionIdIsValid(MyProc->xmin)); /* * To ensure no deadlocks, we must commit and start yet another @@ -4437,7 +4399,7 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein /* * The index is now valid in the sense that it contains all currently * interesting tuples. But since it might not contain tuples deleted - * just before the reference snap was taken, we have to wait out any + * just before the latest snap was taken, we have to wait out any * transactions that might have older snapshots. * * Because we don't take a snapshot or Xid in this transaction, diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 6e280aa4e6a..c0aac2dab77 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -708,10 +708,9 @@ typedef struct TableAmRoutine TableScanDesc scan); /* see table_index_validate_scan for reference about parameters */ - void (*index_validate_scan) (Relation table_rel, + TransactionId (*index_validate_scan) (Relation table_rel, Relation index_rel, struct IndexInfo *index_info, - Snapshot snapshot, struct ValidateIndexState *state, struct ValidateIndexState *aux_state); @@ -1825,18 +1824,16 @@ table_index_build_range_scan(Relation table_rel, * Note: it is responsibility of that function to close sortstates in * both `state` and `auxstate`. 
*/ -static inline void +static inline TransactionId table_index_validate_scan(Relation table_rel, Relation index_rel, struct IndexInfo *index_info, - Snapshot snapshot, struct ValidateIndexState *state, struct ValidateIndexState *auxstate) { return table_rel->rd_tableam->index_validate_scan(table_rel, index_rel, index_info, - snapshot, state, auxstate); } diff --git a/src/include/access/transam.h b/src/include/access/transam.h index 7d82cd2eb56..15e345c7a19 100644 --- a/src/include/access/transam.h +++ b/src/include/access/transam.h @@ -355,6 +355,21 @@ NormalTransactionIdOlder(TransactionId a, TransactionId b) return b; } +/* return the newer of the two IDs */ +static inline TransactionId +TransactionIdNewer(TransactionId a, TransactionId b) +{ + if (!TransactionIdIsValid(a)) + return b; + + if (!TransactionIdIsValid(b)) + return a; + + if (TransactionIdFollows(a, b)) + return a; + return b; +} + /* return the newer of the two IDs */ static inline FullTransactionId FullTransactionIdNewer(FullTransactionId a, FullTransactionId b) diff --git a/src/include/catalog/index.h b/src/include/catalog/index.h index d51b4e8cd13..6c780681967 100644 --- a/src/include/catalog/index.h +++ b/src/include/catalog/index.h @@ -152,7 +152,7 @@ extern void index_build(Relation heapRelation, bool isreindex, bool parallel); -extern void validate_index(Oid heapId, Oid indexId, Oid auxIndexId, Snapshot snapshot); +extern TransactionId validate_index(Oid heapId, Oid indexId, Oid auxIndexId); extern void index_set_state_flags(Oid indexId, IndexStateFlagsAction action); -- 2.43.0