From e579bfb2df7bfa0c87d721e05c68c8013915d441 Mon Sep 17 00:00:00 2001 From: nkey Date: Sat, 25 Jan 2025 17:21:29 +0100 Subject: [PATCH v16 09/12] Concurrently built index validation uses fresh snapshots This commit modifies the validation process for concurrently built indexes to use fresh snapshots instead of a single reference snapshot. The previous approach of using a single reference snapshot could lead to issues with xmin propagation. Specifically, if the index build took a long time, the reference snapshot's xmin could become outdated, causing the index to miss tuples that were deleted by transactions that committed after the reference snapshot was taken. To address this, the validation process now periodically replaces the snapshot with a newer one. This ensures that the index's xmin is kept up-to-date and that all relevant tuples are included in the index. --- doc/src/sgml/ref/create_index.sgml | 11 +++- doc/src/sgml/ref/reindex.sgml | 11 ++-- src/backend/access/heap/heapam_handler.c | 77 +++++++++++++++--------- src/backend/access/nbtree/nbtsort.c | 2 +- src/backend/access/spgist/spgvacuum.c | 12 +++- src/backend/catalog/index.c | 14 +++-- src/backend/commands/indexcmds.c | 2 +- src/include/access/transam.h | 15 +++++ 8 files changed, 97 insertions(+), 47 deletions(-) diff --git a/doc/src/sgml/ref/create_index.sgml b/doc/src/sgml/ref/create_index.sgml index e33345f6a34..54566223cb0 100644 --- a/doc/src/sgml/ref/create_index.sgml +++ b/doc/src/sgml/ref/create_index.sgml @@ -868,9 +868,14 @@ Indexes: - Like any long-running transaction, CREATE INDEX on a - table can affect which tuples can be removed by concurrent - VACUUM on any other table. + Due to the improved implementation using periodically refreshed snapshots and + auxiliary indexes, concurrent index builds have minimal impact on concurrent + VACUUM operations. The system automatically advances its + internal transaction horizon during the build process, allowing + VACUUM to remove dead tuples on other tables without + having to wait for the entire index build to complete. Only during very brief + periods when snapshots are being refreshed might there be any temporary effect + on concurrent VACUUM operations. diff --git a/doc/src/sgml/ref/reindex.sgml b/doc/src/sgml/ref/reindex.sgml index 6a05620bd67..64c633e0398 100644 --- a/doc/src/sgml/ref/reindex.sgml +++ b/doc/src/sgml/ref/reindex.sgml @@ -495,10 +495,13 @@ Indexes: - Like any long-running transaction, REINDEX on a table - can affect which tuples can be removed by concurrent - VACUUM on any other table. - + REINDEX CONCURRENTLY has minimal + impact on which tuples can be removed by concurrent VACUUM + operations on other tables. This is achieved through periodic snapshot + refreshes and the use of auxiliary indexes during the rebuild process, + allowing the system to advance its transaction horizon regularly rather than + maintaining a single long-running snapshot. + REINDEX SYSTEM does not support diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 2e3e8a678c9..a596fc9920a 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -1791,8 +1791,8 @@ heapam_index_build_range_scan(Relation heapRelation, */ static int heapam_index_validate_tuplesort_difference(Tuplesortstate *main, - Tuplesortstate *aux, - Tuplestorestate *store) + Tuplesortstate *aux, + Tuplestorestate *store) { int num = 0; /* state variables for the merge */ @@ -2048,7 +2048,8 @@ heapam_index_validate_scan(Relation heapRelation, ExprContext *econtext; BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_BULKREAD); - int num_to_check; + int num_to_check, + page_read_counter = 1; /* set to 1 to skip snapshot resert at start */ Tuplestorestate *tuples_for_check; ValidateIndexScanState callback_private_data; @@ -2059,9 +2060,35 @@ heapam_index_validate_scan(Relation heapRelation, /* Use 10% of memory for tuple store. */ int store_work_mem_part = maintenance_work_mem / 10; + PushActiveSnapshot(GetTransactionSnapshot()); + + tuples_for_check = tuplestore_begin_datum(INT8OID, false, false, store_work_mem_part); + + PopActiveSnapshot(); + InvalidateCatalogSnapshot(); + + Assert(!HaveRegisteredOrActiveSnapshot()); + Assert(!TransactionIdIsValid(MyProc->xmin)); + /* - * Now take the snapshot that will be used by to filter candidate - * tuples. + * sanity checks + */ + Assert(OidIsValid(indexRelation->rd_rel->relam)); + + num_to_check = heapam_index_validate_tuplesort_difference(state->tuplesort, + auxState->tuplesort, + tuples_for_check); + + /* It is our responsibility to sloe tuple sort as fast as we can */ + tuplesort_end(state->tuplesort); + tuplesort_end(auxState->tuplesort); + + state->tuplesort = auxState->tuplesort = NULL; + + /* + * Now take the first snapshot that will be used by to filter candidate + * tuples. We are going to replace it by newer snapshot every so often + * to propagate horizon. * * Beware! There might still be snapshots in use that treat some transaction * as in-progress that our temporary snapshot treats as committed. @@ -2077,33 +2104,10 @@ heapam_index_validate_scan(Relation heapRelation, * We also set ActiveSnapshot to this snap, since functions in indexes may * need a snapshot. */ - snapshot = RegisterSnapshot(GetTransactionSnapshot()); + snapshot = RegisterSnapshot(GetLatestSnapshot()); PushActiveSnapshot(snapshot); limitXmin = snapshot->xmin; - /* - * Encode TIDs as int8 values for the sort, rather than directly sorting - * item pointers. This can be significantly faster, primarily because TID - * is a pass-by-reference type on all platforms, whereas int8 is - * pass-by-value on most platforms. - */ - tuples_for_check = tuplestore_begin_datum(INT8OID, false, false, store_work_mem_part); - - /* - * sanity checks - */ - Assert(OidIsValid(indexRelation->rd_rel->relam)); - - num_to_check = heapam_index_validate_tuplesort_difference(state->tuplesort, - auxState->tuplesort, - tuples_for_check); - - /* It is our responsibility to sloe tuple sort as fast as we can */ - tuplesort_end(state->tuplesort); - tuplesort_end(auxState->tuplesort); - - state->tuplesort = auxState->tuplesort = NULL; - estate = CreateExecutorState(); econtext = GetPerTupleExprContext(estate); slot = MakeSingleTupleTableSlot(RelationGetDescr(heapRelation), @@ -2140,6 +2144,7 @@ heapam_index_validate_scan(Relation heapRelation, LockBuffer(buf, BUFFER_LOCK_SHARE); block_number = BufferGetBlockNumber(buf); + page_read_counter++; i = 0; while ((off = tuples[i]) != InvalidOffsetNumber) @@ -2194,6 +2199,20 @@ heapam_index_validate_scan(Relation heapRelation, } ReleaseBuffer(buf); + + if (page_read_counter % SO_RESET_SNAPSHOT_EACH_N_PAGE == 0) + { + PopActiveSnapshot(); + UnregisterSnapshot(snapshot); + /* to make sure we propagate xmin */ + InvalidateCatalogSnapshot(); + Assert(!TransactionIdIsValid(MyProc->xmin)); + + snapshot = RegisterSnapshot(GetLatestSnapshot()); + PushActiveSnapshot(snapshot); + /* xmin should not go backwards, but just for the case*/ + limitXmin = TransactionIdNewer(limitXmin, snapshot->xmin); + } } ExecDropSingleTupleTableSlot(slot); diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index f7914ebb3d0..bb4e0fbb675 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -444,7 +444,7 @@ _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate, * dead tuples) won't get very full, so we give it only work_mem. * * In case of concurrent build dead tuples are not need to be put into index - * since we wait for all snapshots older than reference snapshot during the + * since we wait for all snapshots older than latest snapshot during the * validation phase. */ if (indexInfo->ii_Unique && !indexInfo->ii_Concurrent) diff --git a/src/backend/access/spgist/spgvacuum.c b/src/backend/access/spgist/spgvacuum.c index eeddacd0d52..4130e49dd98 100644 --- a/src/backend/access/spgist/spgvacuum.c +++ b/src/backend/access/spgist/spgvacuum.c @@ -190,14 +190,16 @@ vacuumLeafPage(spgBulkDeleteState *bds, Relation index, Buffer buffer, * Add target TID to pending list if the redirection could have * happened since VACUUM started. (If xid is invalid, assume it * must have happened before VACUUM started, since REINDEX - * CONCURRENTLY locks out VACUUM.) + * CONCURRENTLY locks out VACUUM, if myXmin is invalid it is + * validation scan.) * * Note: we could make a tighter test by seeing if the xid is * "running" according to the active snapshot; but snapmgr.c * doesn't currently export a suitable API, and it's not entirely * clear that a tighter test is worth the cycles anyway. */ - if (TransactionIdFollowsOrEquals(dt->xid, bds->myXmin)) + if (!TransactionIdIsValid(bds->myXmin) || + TransactionIdFollowsOrEquals(dt->xid, bds->myXmin)) spgAddPendingTID(bds, &dt->pointer); } else @@ -811,7 +813,6 @@ spgvacuumscan(spgBulkDeleteState *bds) /* Finish setting up spgBulkDeleteState */ initSpGistState(&bds->spgstate, index); bds->pendingList = NULL; - bds->myXmin = GetActiveSnapshot()->xmin; bds->lastFilledBlock = SPGIST_LAST_FIXED_BLKNO; /* @@ -925,6 +926,10 @@ spgbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, bds.stats = stats; bds.callback = callback; bds.callback_state = callback_state; + if (info->validate_index) + bds.myXmin = InvalidTransactionId; + else + bds.myXmin = GetActiveSnapshot()->xmin; spgvacuumscan(&bds); @@ -965,6 +970,7 @@ spgvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) bds.stats = stats; bds.callback = dummy_callback; bds.callback_state = NULL; + bds.myXmin = GetActiveSnapshot()->xmin; spgvacuumscan(&bds); } diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index 8df0b472e88..39d2f474865 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -3485,8 +3485,9 @@ IndexCheckExclusion(Relation heapRelation, * insert their new tuples into it. At the same moment we clear "indisready" for * auxiliary index, since it is no more required. * - * We then take a new reference snapshot, any tuples that are valid according - * to this snap, but are not in the index, must be added to the index. + * We then take a new snapshot, any tuples that are valid according + * to this snap, but are not in the index, must be added to the index. In + * order to propagate xmin we reset that snapshot every few so often. * (Any tuples committed live after the snap will be inserted into the * index by their originating transaction. Any tuples committed dead before * the snap need not be indexed, because we will wait out all transactions @@ -3499,7 +3500,7 @@ IndexCheckExclusion(Relation heapRelation, * TIDs of both auxiliary and target indexes, and doing a "merge join" against * the TID lists to see which tuples from auxiliary index are missing from the * target index. Thus we will ensure that all tuples valid according to the - * reference snapshot are in the index. Notice we need to do bulkdelete in the + * latest snapshot are in the index. Notice we need to do bulkdelete in the * particular order: auxiliary first, target last. * * Building a unique index this way is tricky: we might try to insert a @@ -3582,6 +3583,7 @@ validate_index(Oid heapId, Oid indexId, Oid auxIndexId) */ PushActiveSnapshot(GetTransactionSnapshot()); indexInfo = BuildIndexInfo(indexRelation); + PopActiveSnapshot(); /* mark build is concurrent just for consistency */ indexInfo->ii_Concurrent = true; @@ -3617,6 +3619,9 @@ validate_index(Oid heapId, Oid indexId, Oid auxIndexId) NULL, TUPLESORT_NONE); auxState.htups = auxState.itups = auxState.tups_inserted = 0; + /* tuplesort_begin_datum may require catalog snapshot */ + InvalidateCatalogSnapshot(); + (void) index_bulk_delete(&auxivinfo, NULL, validate_index_callback, &auxState); @@ -3646,9 +3651,6 @@ validate_index(Oid heapId, Oid indexId, Oid auxIndexId) } tuplesort_performsort(state.tuplesort); tuplesort_performsort(auxState.tuplesort); - - PopActiveSnapshot(); - InvalidateCatalogSnapshot(); Assert(!TransactionIdIsValid(MyProc->xmin)); /* diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index 51db7f23378..85f83a97a1f 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -4349,7 +4349,7 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein /* * The index is now valid in the sense that it contains all currently * interesting tuples. But since it might not contain tuples deleted - * just before the reference snap was taken, we have to wait out any + * just before the latest snap was taken, we have to wait out any * transactions that might have older snapshots. * * Because we don't take a snapshot or Xid in this transaction, diff --git a/src/include/access/transam.h b/src/include/access/transam.h index 7d82cd2eb56..15e345c7a19 100644 --- a/src/include/access/transam.h +++ b/src/include/access/transam.h @@ -355,6 +355,21 @@ NormalTransactionIdOlder(TransactionId a, TransactionId b) return b; } +/* return the newer of the two IDs */ +static inline TransactionId +TransactionIdNewer(TransactionId a, TransactionId b) +{ + if (!TransactionIdIsValid(a)) + return b; + + if (!TransactionIdIsValid(b)) + return a; + + if (TransactionIdFollows(a, b)) + return a; + return b; +} + /* return the newer of the two IDs */ static inline FullTransactionId FullTransactionIdNewer(FullTransactionId a, FullTransactionId b) -- 2.43.0