diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 7695ec1..49b9892 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -2002,6 +2002,42 @@ include_dir 'conf.d' + + + old_snapshot_threshold (integer) + + old_snapshot_threshold configuration parameter + + + + + Sets the minimum time that a snapshot can be used without risk of a + snapshot too old error occurring when using the snapshot. + This parameter can only be set at server start. + + + + Beyond the threshold, old data may be vacuumed away. This can help + prevent bloat in the face of snapshots which remain in use for a + long time. To prevent incorrect results due to cleanup of data which + would otherwise be visible to the snapshot, an error is generated + when the snapshot is older than this threshold and the snapshot is + used to read a page which has been modified since the snapshot was + built. + + + + A value of -1 disables this feature, and is the default. + Useful values for production work probably range from a small number + of hours to a few days. The setting will be coerced to a granularity + of minutes, and small numbers (such as 0 or + 1min) are only allowed because they may sometimes be + useful for testing. While a setting as high as 60d is + allowed, please note that in many workloads extreme bloat or + transaction ID wraparound may occur in much shorter time frames. + + + @@ -2979,6 +3015,10 @@ include_dir 'conf.d' You should also consider setting hot_standby_feedback on standby server(s) as an alternative to using this parameter. + + This does not prevent cleanup of dead rows which have reached the age + specified by old_snapshot_threshold. + @@ -3126,6 +3166,16 @@ include_dir 'conf.d' until it eventually reaches the primary. Standbys make no other use of feedback they receive other than to pass upstream. 
+ + This setting does not override the behavior of + old_snapshot_threshold on the primary; a snapshot on the + standby which exceeds the primary's age threshold can become invalid, + resulting in cancellation of transactions on the standby. This is + because old_snapshot_threshold is intended to provide an + absolute limit on the time which dead rows can contribute to bloat, + which would otherwise be violated because of the configuration of a + standby. + diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c index c740952..89bad05 100644 --- a/src/backend/access/brin/brin.c +++ b/src/backend/access/brin/brin.c @@ -135,7 +135,7 @@ brininsert(Relation idxRel, Datum *values, bool *nulls, MemoryContext tupcxt = NULL; MemoryContext oldcxt = NULL; - revmap = brinRevmapInitialize(idxRel, &pagesPerRange); + revmap = brinRevmapInitialize(idxRel, &pagesPerRange, NULL); for (;;) { @@ -152,7 +152,7 @@ brininsert(Relation idxRel, Datum *values, bool *nulls, /* normalize the block number to be the first block in the range */ heapBlk = (heapBlk / pagesPerRange) * pagesPerRange; brtup = brinGetTupleForHeapBlock(revmap, heapBlk, &buf, &off, NULL, - BUFFER_LOCK_SHARE); + BUFFER_LOCK_SHARE, NULL); /* if range is unsummarized, there's nothing to do */ if (!brtup) @@ -284,7 +284,8 @@ brinbeginscan(Relation r, int nkeys, int norderbys) scan = RelationGetIndexScan(r, nkeys, norderbys); opaque = (BrinOpaque *) palloc(sizeof(BrinOpaque)); - opaque->bo_rmAccess = brinRevmapInitialize(r, &opaque->bo_pagesPerRange); + opaque->bo_rmAccess = brinRevmapInitialize(r, &opaque->bo_pagesPerRange, + scan->xs_snapshot); opaque->bo_bdesc = brin_build_desc(r); scan->opaque = opaque; @@ -367,7 +368,8 @@ bringetbitmap(IndexScanDesc scan, TIDBitmap *tbm) MemoryContextResetAndDeleteChildren(perRangeCxt); tup = brinGetTupleForHeapBlock(opaque->bo_rmAccess, heapBlk, &buf, - &off, &size, BUFFER_LOCK_SHARE); + &off, &size, BUFFER_LOCK_SHARE, + scan->xs_snapshot); if (tup) { tup = 
brin_copy_tuple(tup, size); @@ -645,7 +647,7 @@ brinbuild(Relation heap, Relation index, IndexInfo *indexInfo) /* * Initialize our state, including the deformed tuple state. */ - revmap = brinRevmapInitialize(index, &pagesPerRange); + revmap = brinRevmapInitialize(index, &pagesPerRange, NULL); state = initialize_brin_buildstate(index, revmap, pagesPerRange); /* @@ -1040,7 +1042,8 @@ summarize_range(IndexInfo *indexInfo, BrinBuildState *state, Relation heapRel, * the same.) */ phtup = brinGetTupleForHeapBlock(state->bs_rmAccess, heapBlk, &phbuf, - &offset, &phsz, BUFFER_LOCK_SHARE); + &offset, &phsz, BUFFER_LOCK_SHARE, + NULL); /* the placeholder tuple must exist */ if (phtup == NULL) elog(ERROR, "missing placeholder tuple"); @@ -1075,7 +1078,7 @@ brinsummarize(Relation index, Relation heapRel, double *numSummarized, BlockNumber pagesPerRange; Buffer buf; - revmap = brinRevmapInitialize(index, &pagesPerRange); + revmap = brinRevmapInitialize(index, &pagesPerRange, NULL); /* * Scan the revmap to find unsummarized items. @@ -1090,7 +1093,7 @@ brinsummarize(Relation index, Relation heapRel, double *numSummarized, CHECK_FOR_INTERRUPTS(); tup = brinGetTupleForHeapBlock(revmap, heapBlk, &buf, &off, NULL, - BUFFER_LOCK_SHARE); + BUFFER_LOCK_SHARE, NULL); if (tup == NULL) { /* no revmap entry for this heap range. Summarize it. */ diff --git a/src/backend/access/brin/brin_revmap.c b/src/backend/access/brin/brin_revmap.c index b2c273d..812f76c 100644 --- a/src/backend/access/brin/brin_revmap.c +++ b/src/backend/access/brin/brin_revmap.c @@ -68,15 +68,19 @@ static void revmap_physical_extend(BrinRevmap *revmap); * brinRevmapTerminate when caller is done with it. 
*/ BrinRevmap * -brinRevmapInitialize(Relation idxrel, BlockNumber *pagesPerRange) +brinRevmapInitialize(Relation idxrel, BlockNumber *pagesPerRange, + Snapshot snapshot) { BrinRevmap *revmap; Buffer meta; BrinMetaPageData *metadata; + Page page; meta = ReadBuffer(idxrel, BRIN_METAPAGE_BLKNO); LockBuffer(meta, BUFFER_LOCK_SHARE); - metadata = (BrinMetaPageData *) PageGetContents(BufferGetPage(meta)); + page = BufferGetPage(meta); + TestForOldSnapshot(snapshot, idxrel, page); + metadata = (BrinMetaPageData *) PageGetContents(page); revmap = palloc(sizeof(BrinRevmap)); revmap->rm_irel = idxrel; @@ -185,7 +189,8 @@ brinSetHeapBlockItemptr(Buffer buf, BlockNumber pagesPerRange, */ BrinTuple * brinGetTupleForHeapBlock(BrinRevmap *revmap, BlockNumber heapBlk, - Buffer *buf, OffsetNumber *off, Size *size, int mode) + Buffer *buf, OffsetNumber *off, Size *size, int mode, + Snapshot snapshot) { Relation idxRel = revmap->rm_irel; BlockNumber mapBlk; @@ -262,6 +267,7 @@ brinGetTupleForHeapBlock(BrinRevmap *revmap, BlockNumber heapBlk, } LockBuffer(*buf, mode); page = BufferGetPage(*buf); + TestForOldSnapshot(snapshot, idxRel, page); /* If we land on a revmap page, start over */ if (BRIN_IS_REGULAR_PAGE(page)) diff --git a/src/backend/access/gin/ginbtree.c b/src/backend/access/gin/ginbtree.c index 06ba9cb..dc593c2 100644 --- a/src/backend/access/gin/ginbtree.c +++ b/src/backend/access/gin/ginbtree.c @@ -71,7 +71,7 @@ ginTraverseLock(Buffer buffer, bool searchMode) * is share-locked, and stack->parent is NULL. 
*/ GinBtreeStack * -ginFindLeafPage(GinBtree btree, bool searchMode) +ginFindLeafPage(GinBtree btree, bool searchMode, Snapshot snapshot) { GinBtreeStack *stack; @@ -90,6 +90,7 @@ ginFindLeafPage(GinBtree btree, bool searchMode) stack->off = InvalidOffsetNumber; page = BufferGetPage(stack->buffer); + TestForOldSnapshot(snapshot, btree->index, page); access = ginTraverseLock(stack->buffer, searchMode); @@ -116,6 +117,7 @@ ginFindLeafPage(GinBtree btree, bool searchMode) stack->buffer = ginStepRight(stack->buffer, btree->index, access); stack->blkno = rightlink; page = BufferGetPage(stack->buffer); + TestForOldSnapshot(snapshot, btree->index, page); if (!searchMode && GinPageIsIncompleteSplit(page)) ginFinishSplit(btree, stack, false, NULL); diff --git a/src/backend/access/gin/gindatapage.c b/src/backend/access/gin/gindatapage.c index a55bb4a..ab14b35 100644 --- a/src/backend/access/gin/gindatapage.c +++ b/src/backend/access/gin/gindatapage.c @@ -1820,7 +1820,7 @@ ginInsertItemPointers(Relation index, BlockNumber rootBlkno, { /* search for the leaf page where the first item should go to */ btree.itemptr = insertdata.items[insertdata.curitem]; - stack = ginFindLeafPage(&btree, false); + stack = ginFindLeafPage(&btree, false, NULL); ginInsertValue(&btree, stack, &insertdata, buildStats); } @@ -1830,7 +1830,8 @@ ginInsertItemPointers(Relation index, BlockNumber rootBlkno, * Starts a new scan on a posting tree. 
*/ GinBtreeStack * -ginScanBeginPostingTree(GinBtree btree, Relation index, BlockNumber rootBlkno) +ginScanBeginPostingTree(GinBtree btree, Relation index, BlockNumber rootBlkno, + Snapshot snapshot) { GinBtreeStack *stack; @@ -1838,7 +1839,7 @@ ginScanBeginPostingTree(GinBtree btree, Relation index, BlockNumber rootBlkno) btree->fullScan = TRUE; - stack = ginFindLeafPage(btree, TRUE); + stack = ginFindLeafPage(btree, TRUE, snapshot); return stack; } diff --git a/src/backend/access/gin/ginget.c b/src/backend/access/gin/ginget.c index 53290a4..f07e05a 100644 --- a/src/backend/access/gin/ginget.c +++ b/src/backend/access/gin/ginget.c @@ -19,6 +19,7 @@ #include "miscadmin.h" #include "utils/datum.h" #include "utils/memutils.h" +#include "utils/rel.h" /* GUC parameter */ int GinFuzzySearchLimit = 0; @@ -63,7 +64,7 @@ moveRightIfItNeeded(GinBtreeData *btree, GinBtreeStack *stack) */ static void scanPostingTree(Relation index, GinScanEntry scanEntry, - BlockNumber rootPostingTree) + BlockNumber rootPostingTree, Snapshot snapshot) { GinBtreeData btree; GinBtreeStack *stack; @@ -71,7 +72,7 @@ scanPostingTree(Relation index, GinScanEntry scanEntry, Page page; /* Descend to the leftmost leaf page */ - stack = ginScanBeginPostingTree(&btree, index, rootPostingTree); + stack = ginScanBeginPostingTree(&btree, index, rootPostingTree, snapshot); buffer = stack->buffer; IncrBufferRefCount(buffer); /* prevent unpin in freeGinBtreeStack */ @@ -114,7 +115,7 @@ scanPostingTree(Relation index, GinScanEntry scanEntry, */ static bool collectMatchBitmap(GinBtreeData *btree, GinBtreeStack *stack, - GinScanEntry scanEntry) + GinScanEntry scanEntry, Snapshot snapshot) { OffsetNumber attnum; Form_pg_attribute attr; @@ -145,6 +146,7 @@ collectMatchBitmap(GinBtreeData *btree, GinBtreeStack *stack, return true; page = BufferGetPage(stack->buffer); + TestForOldSnapshot(snapshot, btree->index, page); itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stack->off)); /* @@ -224,7 +226,7 @@ 
collectMatchBitmap(GinBtreeData *btree, GinBtreeStack *stack, LockBuffer(stack->buffer, GIN_UNLOCK); /* Collect all the TIDs in this entry's posting tree */ - scanPostingTree(btree->index, scanEntry, rootPostingTree); + scanPostingTree(btree->index, scanEntry, rootPostingTree, snapshot); /* * We lock again the entry page and while it was unlocked insert @@ -291,7 +293,7 @@ collectMatchBitmap(GinBtreeData *btree, GinBtreeStack *stack, * Start* functions setup beginning state of searches: finds correct buffer and pins it. */ static void -startScanEntry(GinState *ginstate, GinScanEntry entry) +startScanEntry(GinState *ginstate, GinScanEntry entry, Snapshot snapshot) { GinBtreeData btreeEntry; GinBtreeStack *stackEntry; @@ -318,7 +320,7 @@ restartScanEntry: ginPrepareEntryScan(&btreeEntry, entry->attnum, entry->queryKey, entry->queryCategory, ginstate); - stackEntry = ginFindLeafPage(&btreeEntry, true); + stackEntry = ginFindLeafPage(&btreeEntry, true, snapshot); page = BufferGetPage(stackEntry->buffer); needUnlock = TRUE; @@ -335,7 +337,7 @@ restartScanEntry: * for the entry type. */ btreeEntry.findItem(&btreeEntry, stackEntry); - if (collectMatchBitmap(&btreeEntry, stackEntry, entry) == false) + if (!collectMatchBitmap(&btreeEntry, stackEntry, entry, snapshot)) { /* * GIN tree was seriously restructured, so we will cleanup all @@ -383,7 +385,7 @@ restartScanEntry: needUnlock = FALSE; stack = ginScanBeginPostingTree(&entry->btree, ginstate->index, - rootPostingTree); + rootPostingTree, snapshot); entry->buffer = stack->buffer; /* @@ -535,7 +537,7 @@ startScan(IndexScanDesc scan) uint32 i; for (i = 0; i < so->totalentries; i++) - startScanEntry(ginstate, so->entries[i]); + startScanEntry(ginstate, so->entries[i], scan->xs_snapshot); if (GinFuzzySearchLimit > 0) { @@ -580,7 +582,8 @@ startScan(IndexScanDesc scan) * keep it pinned to prevent interference with vacuum. 
*/ static void -entryLoadMoreItems(GinState *ginstate, GinScanEntry entry, ItemPointerData advancePast) +entryLoadMoreItems(GinState *ginstate, GinScanEntry entry, + ItemPointerData advancePast, Snapshot snapshot) { Page page; int i; @@ -624,7 +627,7 @@ entryLoadMoreItems(GinState *ginstate, GinScanEntry entry, ItemPointerData advan entry->btree.itemptr.ip_posid++; } entry->btree.fullScan = false; - stack = ginFindLeafPage(&entry->btree, true); + stack = ginFindLeafPage(&entry->btree, true, snapshot); /* we don't need the stack, just the buffer. */ entry->buffer = stack->buffer; @@ -734,7 +737,7 @@ entryLoadMoreItems(GinState *ginstate, GinScanEntry entry, ItemPointerData advan */ static void entryGetItem(GinState *ginstate, GinScanEntry entry, - ItemPointerData advancePast) + ItemPointerData advancePast, Snapshot snapshot) { Assert(!entry->isFinished); @@ -857,7 +860,7 @@ entryGetItem(GinState *ginstate, GinScanEntry entry, /* If we've processed the current batch, load more items */ while (entry->offset >= entry->nlist) { - entryLoadMoreItems(ginstate, entry, advancePast); + entryLoadMoreItems(ginstate, entry, advancePast, snapshot); if (entry->isFinished) { @@ -896,7 +899,7 @@ entryGetItem(GinState *ginstate, GinScanEntry entry, */ static void keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key, - ItemPointerData advancePast) + ItemPointerData advancePast, Snapshot snapshot) { ItemPointerData minItem; ItemPointerData curPageLossy; @@ -943,7 +946,7 @@ keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key, */ if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0) { - entryGetItem(ginstate, entry, advancePast); + entryGetItem(ginstate, entry, advancePast, snapshot); if (entry->isFinished) continue; } @@ -1001,7 +1004,7 @@ keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key, if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0) { - entryGetItem(ginstate, entry, advancePast); + entryGetItem(ginstate, 
entry, advancePast, snapshot); if (entry->isFinished) continue; } @@ -1210,7 +1213,8 @@ scanGetItem(IndexScanDesc scan, ItemPointerData advancePast, GinScanKey key = so->keys + i; /* Fetch the next item for this key that is > advancePast. */ - keyGetItem(&so->ginstate, so->tempCtx, key, advancePast); + keyGetItem(&so->ginstate, so->tempCtx, key, advancePast, + scan->xs_snapshot); if (key->isFinished) return false; @@ -1332,6 +1336,7 @@ scanGetCandidate(IndexScanDesc scan, pendingPosition *pos) for (;;) { page = BufferGetPage(pos->pendingBuffer); + TestForOldSnapshot(scan->xs_snapshot, scan->indexRelation, page); maxoff = PageGetMaxOffsetNumber(page); if (pos->firstOffset > maxoff) @@ -1512,6 +1517,7 @@ collectMatchesForHeapRow(IndexScanDesc scan, pendingPosition *pos) sizeof(bool) * (pos->lastOffset - pos->firstOffset)); page = BufferGetPage(pos->pendingBuffer); + TestForOldSnapshot(scan->xs_snapshot, scan->indexRelation, page); for (i = 0; i < so->nkeys; i++) { @@ -1698,12 +1704,15 @@ scanPendingInsert(IndexScanDesc scan, TIDBitmap *tbm, int64 *ntids) int i; pendingPosition pos; Buffer metabuffer = ReadBuffer(scan->indexRelation, GIN_METAPAGE_BLKNO); + Page page; BlockNumber blkno; *ntids = 0; LockBuffer(metabuffer, GIN_SHARE); - blkno = GinPageGetMeta(BufferGetPage(metabuffer))->head; + page = BufferGetPage(metabuffer); + TestForOldSnapshot(scan->xs_snapshot, scan->indexRelation, page); + blkno = GinPageGetMeta(page)->head; /* * fetch head of list before unlocking metapage. 
head page must be pinned diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index cd21e0e..7a9c67a 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -192,7 +192,7 @@ ginEntryInsert(GinState *ginstate, ginPrepareEntryScan(&btree, attnum, key, category, ginstate); - stack = ginFindLeafPage(&btree, false); + stack = ginFindLeafPage(&btree, false, NULL); page = BufferGetPage(stack->buffer); if (btree.findItem(&btree, stack)) diff --git a/src/backend/access/gist/gistget.c b/src/backend/access/gist/gistget.c index 8138383..affd635 100644 --- a/src/backend/access/gist/gistget.c +++ b/src/backend/access/gist/gistget.c @@ -337,6 +337,7 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, double *myDistances, LockBuffer(buffer, GIST_SHARE); gistcheckpage(scan->indexRelation, buffer); page = BufferGetPage(buffer); + TestForOldSnapshot(scan->xs_snapshot, r, page); opaque = GistPageGetOpaque(page); /* diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c index 3d48c4f..8c89ee7 100644 --- a/src/backend/access/hash/hash.c +++ b/src/backend/access/hash/hash.c @@ -279,6 +279,7 @@ hashgettuple(IndexScanDesc scan, ScanDirection dir) buf = so->hashso_curbuf; Assert(BufferIsValid(buf)); page = BufferGetPage(buf); + TestForOldSnapshot(scan->xs_snapshot, rel, page); maxoffnum = PageGetMaxOffsetNumber(page); for (offnum = ItemPointerGetOffsetNumber(current); offnum <= maxoffnum; diff --git a/src/backend/access/hash/hashsearch.c b/src/backend/access/hash/hashsearch.c index 6025a3f..eb8c9cd 100644 --- a/src/backend/access/hash/hashsearch.c +++ b/src/backend/access/hash/hashsearch.c @@ -188,7 +188,9 @@ _hash_first(IndexScanDesc scan, ScanDirection dir) /* Read the metapage */ metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE); - metap = HashPageGetMeta(BufferGetPage(metabuf)); + page = BufferGetPage(metabuf); + TestForOldSnapshot(scan->xs_snapshot, rel, page); + metap 
= HashPageGetMeta(page); /* * Loop until we get a lock on the correct target bucket. @@ -241,6 +243,7 @@ _hash_first(IndexScanDesc scan, ScanDirection dir) /* Fetch the primary bucket page for the bucket */ buf = _hash_getbuf(rel, blkno, HASH_READ, LH_BUCKET_PAGE); page = BufferGetPage(buf); + TestForOldSnapshot(scan->xs_snapshot, rel, page); opaque = (HashPageOpaque) PageGetSpecialPointer(page); Assert(opaque->hasho_bucket == bucket); @@ -347,6 +350,7 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) _hash_readnext(rel, &buf, &page, &opaque); if (BufferIsValid(buf)) { + TestForOldSnapshot(scan->xs_snapshot, rel, page); maxoff = PageGetMaxOffsetNumber(page); offnum = _hash_binsearch(page, so->hashso_sk_hash); } @@ -388,6 +392,7 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) _hash_readprev(rel, &buf, &page, &opaque); if (BufferIsValid(buf)) { + TestForOldSnapshot(scan->xs_snapshot, rel, page); maxoff = PageGetMaxOffsetNumber(page); offnum = _hash_binsearch_last(page, so->hashso_sk_hash); } diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 34ba385..7007acf 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -395,6 +395,7 @@ heapgetpage(HeapScanDesc scan, BlockNumber page) LockBuffer(buffer, BUFFER_LOCK_SHARE); dp = (Page) BufferGetPage(buffer); + TestForOldSnapshot(snapshot, scan->rs_rd, dp); lines = PageGetMaxOffsetNumber(dp); ntup = 0; @@ -538,6 +539,7 @@ heapgettup(HeapScanDesc scan, LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE); dp = (Page) BufferGetPage(scan->rs_cbuf); + TestForOldSnapshot(snapshot, scan->rs_rd, dp); lines = PageGetMaxOffsetNumber(dp); /* page and lineoff now reference the physically next tid */ @@ -583,6 +585,7 @@ heapgettup(HeapScanDesc scan, LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE); dp = (Page) BufferGetPage(scan->rs_cbuf); + TestForOldSnapshot(snapshot, scan->rs_rd, dp); lines = PageGetMaxOffsetNumber(dp); if (!scan->rs_inited) @@ 
-617,6 +620,7 @@ heapgettup(HeapScanDesc scan, /* Since the tuple was previously fetched, needn't lock page here */ dp = (Page) BufferGetPage(scan->rs_cbuf); + TestForOldSnapshot(snapshot, scan->rs_rd, dp); lineoff = ItemPointerGetOffsetNumber(&(tuple->t_self)); lpp = PageGetItemId(dp, lineoff); Assert(ItemIdIsNormal(lpp)); @@ -746,6 +750,7 @@ heapgettup(HeapScanDesc scan, LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE); dp = (Page) BufferGetPage(scan->rs_cbuf); + TestForOldSnapshot(snapshot, scan->rs_rd, dp); lines = PageGetMaxOffsetNumber((Page) dp); linesleft = lines; if (backward) @@ -833,6 +838,7 @@ heapgettup_pagemode(HeapScanDesc scan, } dp = (Page) BufferGetPage(scan->rs_cbuf); + TestForOldSnapshot(scan->rs_snapshot, scan->rs_rd, dp); lines = scan->rs_ntuples; /* page and lineindex now reference the next visible tid */ @@ -876,6 +882,7 @@ heapgettup_pagemode(HeapScanDesc scan, } dp = (Page) BufferGetPage(scan->rs_cbuf); + TestForOldSnapshot(scan->rs_snapshot, scan->rs_rd, dp); lines = scan->rs_ntuples; if (!scan->rs_inited) @@ -909,6 +916,7 @@ heapgettup_pagemode(HeapScanDesc scan, /* Since the tuple was previously fetched, needn't lock page here */ dp = (Page) BufferGetPage(scan->rs_cbuf); + TestForOldSnapshot(scan->rs_snapshot, scan->rs_rd, dp); lineoff = ItemPointerGetOffsetNumber(&(tuple->t_self)); lpp = PageGetItemId(dp, lineoff); Assert(ItemIdIsNormal(lpp)); @@ -1028,6 +1036,7 @@ heapgettup_pagemode(HeapScanDesc scan, heapgetpage(scan, page); dp = (Page) BufferGetPage(scan->rs_cbuf); + TestForOldSnapshot(scan->rs_snapshot, scan->rs_rd, dp); lines = scan->rs_ntuples; linesleft = lines; if (backward) @@ -1872,6 +1881,7 @@ heap_fetch(Relation relation, */ LockBuffer(buffer, BUFFER_LOCK_SHARE); page = BufferGetPage(buffer); + TestForOldSnapshot(snapshot, relation, page); /* * We'd better check for out-of-range offnum in case of VACUUM since the @@ -2201,6 +2211,7 @@ heap_get_latest_tid(Relation relation, buffer = ReadBuffer(relation, 
ItemPointerGetBlockNumber(&ctid)); LockBuffer(buffer, BUFFER_LOCK_SHARE); page = BufferGetPage(buffer); + TestForOldSnapshot(snapshot, relation, page); /* * Check for bogus item number. This is not treated as an error diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c index 59beadd..eb7ae8f 100644 --- a/src/backend/access/heap/pruneheap.c +++ b/src/backend/access/heap/pruneheap.c @@ -92,12 +92,21 @@ heap_page_prune_opt(Relation relation, Buffer buffer) * need to use the horizon that includes slots, otherwise the data-only * horizon can be used. Note that the toast relation of user defined * relations are *not* considered catalog relations. + * + * It is OK to apply the old snapshot limit before acquiring the cleanup + * lock because the worst that can happen is that we are not quite as + * aggressive about the cleanup (by however many transaction IDs are + * consumed between this point and acquiring the lock). This allows us to + * save significant overhead in the case where the page is found not to be + * prunable. */ if (IsCatalogRelation(relation) || RelationIsAccessibleInLogicalDecoding(relation)) OldestXmin = RecentGlobalXmin; else - OldestXmin = RecentGlobalDataXmin; + OldestXmin = + TransactionIdLimitedForOldSnapshots(RecentGlobalDataXmin, + relation); Assert(TransactionIdIsValid(OldestXmin)); diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index e3c55eb..66966e0 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -119,7 +119,7 @@ _bt_doinsert(Relation rel, IndexTuple itup, top: /* find the first page containing this key */ - stack = _bt_search(rel, natts, itup_scankey, false, &buf, BT_WRITE); + stack = _bt_search(rel, natts, itup_scankey, false, &buf, BT_WRITE, NULL); offset = InvalidOffsetNumber; @@ -135,7 +135,7 @@ top: * precise description. 
*/ buf = _bt_moveright(rel, buf, natts, itup_scankey, false, - true, stack, BT_WRITE); + true, stack, BT_WRITE, NULL); /* * If we're not allowing duplicates, make sure the key isn't already in @@ -1671,7 +1671,8 @@ _bt_insert_parent(Relation rel, elog(DEBUG2, "concurrent ROOT page split"); lpageop = (BTPageOpaque) PageGetSpecialPointer(page); /* Find the leftmost page at the next level up */ - pbuf = _bt_get_endpoint(rel, lpageop->btpo.level + 1, false); + pbuf = _bt_get_endpoint(rel, lpageop->btpo.level + 1, false, + NULL); /* Set up a phony stack entry pointing there */ stack = &fakestack; stack->bts_blkno = BufferGetBlockNumber(pbuf); diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index 67755d7..390bd1a 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -1255,7 +1255,7 @@ _bt_pagedel(Relation rel, Buffer buf) itup_scankey = _bt_mkscankey(rel, targetkey); /* find the leftmost leaf page containing this key */ stack = _bt_search(rel, rel->rd_rel->relnatts, itup_scankey, - false, &lbuf, BT_READ); + false, &lbuf, BT_READ, NULL); /* don't need a pin on the page */ _bt_relbuf(rel, lbuf); diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index 14dffe0..32efb6d 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -30,7 +30,7 @@ static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, static void _bt_saveitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, IndexTuple itup); static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir); -static Buffer _bt_walk_left(Relation rel, Buffer buf); +static Buffer _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot); static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir); static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp); @@ -79,6 +79,10 @@ _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp) * address 
of the leaf-page buffer, which is read-locked and pinned. * No locks are held on the parent pages, however! * + * If the snapshot parameter is not NULL, "old snapshot" checking will take + * place during the descent through the tree. This is not needed when + * positioning for an insert or delete, so NULL is used for those cases. + * * NOTE that the returned buffer is read-locked regardless of the access * parameter. However, access = BT_WRITE will allow an empty root page * to be created and returned. When access = BT_READ, an empty index @@ -87,7 +91,7 @@ _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp) */ BTStack _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey, - Buffer *bufP, int access) + Buffer *bufP, int access, Snapshot snapshot) { BTStack stack_in = NULL; @@ -96,7 +100,9 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey, /* If index is empty and access = BT_READ, no root page is created. */ if (!BufferIsValid(*bufP)) + { return (BTStack) NULL; + } /* Loop iterates once per level descended in the tree */ for (;;) @@ -124,7 +130,7 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey, */ *bufP = _bt_moveright(rel, *bufP, keysz, scankey, nextkey, (access == BT_WRITE), stack_in, - BT_READ); + BT_READ, snapshot); /* if this is a leaf page, we're done */ page = BufferGetPage(*bufP); @@ -197,6 +203,10 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey, * On entry, we have the buffer pinned and a lock of the type specified by * 'access'. If we move right, we release the buffer and lock and acquire * the same on the right sibling. Return value is the buffer we stop at. + * + * If the snapshot parameter is not NULL, "old snapshot" checking will take + * place during the descent through the tree. This is not needed when + * positioning for an insert or delete, so NULL is used for those cases. 
*/ Buffer _bt_moveright(Relation rel, @@ -206,7 +216,8 @@ _bt_moveright(Relation rel, bool nextkey, bool forupdate, BTStack stack, - int access) + int access, + Snapshot snapshot) { Page page; BTPageOpaque opaque; @@ -232,6 +243,7 @@ _bt_moveright(Relation rel, for (;;) { page = BufferGetPage(buf); + TestForOldSnapshot(snapshot, rel, page); opaque = (BTPageOpaque) PageGetSpecialPointer(page); if (P_RIGHTMOST(opaque)) @@ -970,7 +982,8 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) * Use the manufactured insertion scan key to descend the tree and * position ourselves on the target leaf page. */ - stack = _bt_search(rel, keysCount, scankeys, nextkey, &buf, BT_READ); + stack = _bt_search(rel, keysCount, scankeys, nextkey, &buf, BT_READ, + scan->xs_snapshot); /* don't need to keep the stack around... */ _bt_freestack(stack); @@ -1336,6 +1349,7 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir) so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ); /* check for deleted page */ page = BufferGetPage(so->currPos.buf); + TestForOldSnapshot(scan->xs_snapshot, rel, page); opaque = (BTPageOpaque) PageGetSpecialPointer(page); if (!P_IGNORE(opaque)) { @@ -1394,7 +1408,8 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir) } /* Step to next physical page */ - so->currPos.buf = _bt_walk_left(rel, so->currPos.buf); + so->currPos.buf = _bt_walk_left(rel, so->currPos.buf, + scan->xs_snapshot); /* if we're physically at end of index, return failure */ if (so->currPos.buf == InvalidBuffer) @@ -1409,6 +1424,7 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir) * and do it all again. */ page = BufferGetPage(so->currPos.buf); + TestForOldSnapshot(scan->xs_snapshot, rel, page); opaque = (BTPageOpaque) PageGetSpecialPointer(page); if (!P_IGNORE(opaque)) { @@ -1442,7 +1458,7 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir) * again if it's important. 
*/ static Buffer -_bt_walk_left(Relation rel, Buffer buf) +_bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot) { Page page; BTPageOpaque opaque; @@ -1472,6 +1488,7 @@ _bt_walk_left(Relation rel, Buffer buf) CHECK_FOR_INTERRUPTS(); buf = _bt_getbuf(rel, blkno, BT_READ); page = BufferGetPage(buf); + TestForOldSnapshot(snapshot, rel, page); opaque = (BTPageOpaque) PageGetSpecialPointer(page); /* @@ -1498,12 +1515,14 @@ _bt_walk_left(Relation rel, Buffer buf) blkno = opaque->btpo_next; buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ); page = BufferGetPage(buf); + TestForOldSnapshot(snapshot, rel, page); opaque = (BTPageOpaque) PageGetSpecialPointer(page); } /* Return to the original page to see what's up */ buf = _bt_relandgetbuf(rel, buf, obknum, BT_READ); page = BufferGetPage(buf); + TestForOldSnapshot(snapshot, rel, page); opaque = (BTPageOpaque) PageGetSpecialPointer(page); if (P_ISDELETED(opaque)) { @@ -1521,6 +1540,7 @@ _bt_walk_left(Relation rel, Buffer buf) blkno = opaque->btpo_next; buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ); page = BufferGetPage(buf); + TestForOldSnapshot(snapshot, rel, page); opaque = (BTPageOpaque) PageGetSpecialPointer(page); if (!P_ISDELETED(opaque)) break; @@ -1557,7 +1577,8 @@ _bt_walk_left(Relation rel, Buffer buf) * The returned buffer is pinned and read-locked. 
*/ Buffer -_bt_get_endpoint(Relation rel, uint32 level, bool rightmost) +_bt_get_endpoint(Relation rel, uint32 level, bool rightmost, + Snapshot snapshot) { Buffer buf; Page page; @@ -1580,6 +1601,7 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost) return InvalidBuffer; page = BufferGetPage(buf); + TestForOldSnapshot(snapshot, rel, page); opaque = (BTPageOpaque) PageGetSpecialPointer(page); for (;;) @@ -1599,6 +1621,7 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost) RelationGetRelationName(rel)); buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ); page = BufferGetPage(buf); + TestForOldSnapshot(snapshot, rel, page); opaque = (BTPageOpaque) PageGetSpecialPointer(page); } @@ -1651,7 +1674,7 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) * version of _bt_search(). We don't maintain a stack since we know we * won't need it. */ - buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir)); + buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir), scan->xs_snapshot); if (!BufferIsValid(buf)) { diff --git a/src/backend/access/spgist/spgscan.c b/src/backend/access/spgist/spgscan.c index 620e746..a9f837f 100644 --- a/src/backend/access/spgist/spgscan.c +++ b/src/backend/access/spgist/spgscan.c @@ -295,7 +295,7 @@ spgLeafTest(Relation index, SpGistScanOpaque so, */ static void spgWalk(Relation index, SpGistScanOpaque so, bool scanWholeIndex, - storeRes_func storeRes) + storeRes_func storeRes, Snapshot snapshot) { Buffer buffer = InvalidBuffer; bool reportedSome = false; @@ -336,6 +336,7 @@ redirect: /* else new pointer points to the same page, no work needed */ page = BufferGetPage(buffer); + TestForOldSnapshot(snapshot, index, page); isnull = SpGistPageStoresNulls(page) ? 
true : false; @@ -558,7 +559,7 @@ spggetbitmap(IndexScanDesc scan, TIDBitmap *tbm) so->tbm = tbm; so->ntids = 0; - spgWalk(scan->indexRelation, so, true, storeBitmap); + spgWalk(scan->indexRelation, so, true, storeBitmap, scan->xs_snapshot); return so->ntids; } @@ -617,7 +618,8 @@ spggettuple(IndexScanDesc scan, ScanDirection dir) } so->iPtr = so->nPtrs = 0; - spgWalk(scan->indexRelation, so, false, storeGettuple); + spgWalk(scan->indexRelation, so, false, storeGettuple, + scan->xs_snapshot); if (so->nPtrs == 0) break; /* must have completed scan */ diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index 4cb4acf..93361a0 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -489,7 +489,8 @@ vacuum_set_xid_limits(Relation rel, * working on a particular table at any time, and that each vacuum is * always an independent transaction. */ - *oldestXmin = GetOldestXmin(rel, true); + *oldestXmin = + TransactionIdLimitedForOldSnapshots(GetOldestXmin(rel, true), rel); Assert(TransactionIdIsNormal(*oldestXmin)); diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c index 52e19b3..426e756 100644 --- a/src/backend/commands/vacuumlazy.c +++ b/src/backend/commands/vacuumlazy.c @@ -1660,7 +1660,8 @@ should_attempt_truncation(LVRelStats *vacrelstats) possibly_freeable = vacrelstats->rel_pages - vacrelstats->nonempty_pages; if (possibly_freeable > 0 && (possibly_freeable >= REL_TRUNCATE_MINIMUM || - possibly_freeable >= vacrelstats->rel_pages / REL_TRUNCATE_FRACTION)) + possibly_freeable >= vacrelstats->rel_pages / REL_TRUNCATE_FRACTION) && + old_snapshot_threshold < 0) return true; else return false; diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index 36a04fc..c04b17f 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -43,6 +43,7 @@ #include "storage/procsignal.h" #include "storage/sinvaladt.h" #include "storage/spin.h" +#include 
"utils/snapmgr.h" shmem_startup_hook_type shmem_startup_hook = NULL; @@ -136,6 +137,7 @@ CreateSharedMemoryAndSemaphores(bool makePrivate, int port) size = add_size(size, ReplicationOriginShmemSize()); size = add_size(size, WalSndShmemSize()); size = add_size(size, WalRcvShmemSize()); + size = add_size(size, SnapMgrShmemSize()); size = add_size(size, BTreeShmemSize()); size = add_size(size, SyncScanShmemSize()); size = add_size(size, AsyncShmemSize()); @@ -247,6 +249,7 @@ CreateSharedMemoryAndSemaphores(bool makePrivate, int port) /* * Set up other modules that need some shared memory space */ + SnapMgrInit(); BTreeShmemInit(); SyncScanShmemInit(); AsyncShmemInit(); diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index 740beb6..fd221ff 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -1759,6 +1759,15 @@ GetSnapshotData(Snapshot snapshot) snapshot->regd_count = 0; snapshot->copied = false; + /* + * Capture the current time and WAL stream location in case this snapshot + * becomes old enough to need to fall back on the special "old snapshot" + * logic. 
+ */ + snapshot->lsn = GetXLogInsertRecPtr(); + snapshot->whenTaken = GetSnapshotCurrentTimestamp(); + MaintainOldSnapshotTimeMapping(snapshot->whenTaken, xmin); + return snapshot; } diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt index c557cb6..f8996cd 100644 --- a/src/backend/storage/lmgr/lwlocknames.txt +++ b/src/backend/storage/lmgr/lwlocknames.txt @@ -46,3 +46,4 @@ CommitTsControlLock 38 CommitTsLock 39 ReplicationOriginLock 40 MultiXactTruncationLock 41 +OldSnapshotTimeMapLock 42 diff --git a/src/backend/utils/errcodes.txt b/src/backend/utils/errcodes.txt index 1a920e8..a962308 100644 --- a/src/backend/utils/errcodes.txt +++ b/src/backend/utils/errcodes.txt @@ -414,6 +414,10 @@ Section: Class 58 - System Error (errors external to PostgreSQL itself) 58P01 E ERRCODE_UNDEFINED_FILE undefined_file 58P02 E ERRCODE_DUPLICATE_FILE duplicate_file +Section: Class 72 - Snapshot Failure +# (class borrowed from Oracle) +72000 E ERRCODE_SNAPSHOT_TOO_OLD snapshot_too_old + Section: Class F0 - Configuration File Error # (PostgreSQL-specific error class) diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index a325943..bc541b0 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2658,6 +2658,17 @@ static struct config_int ConfigureNamesInt[] = }, { + {"old_snapshot_threshold", PGC_POSTMASTER, RESOURCES_ASYNCHRONOUS, + gettext_noop("Time before a snapshot is too old to read pages changed after the snapshot was taken."), + gettext_noop("A value of -1 disables this feature."), + GUC_UNIT_MIN + }, + &old_snapshot_threshold, + -1, -1, MINS_PER_HOUR * HOURS_PER_DAY * 60, + NULL, NULL, NULL + }, + + { {"tcp_keepalives_idle", PGC_USERSET, CLIENT_CONN_OTHER, gettext_noop("Time between issuing TCP keepalives."), gettext_noop("A value of 0 uses the system default."), diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 
773b4e8..2c387d4 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -165,6 +165,8 @@ #effective_io_concurrency = 1 # 1-1000; 0 disables prefetching #max_worker_processes = 8 #max_parallel_degree = 0 # max number of worker processes per node +#old_snapshot_threshold = -1 # 1min-60d; -1 disables; 0 is immediate + # (change requires restart) #------------------------------------------------------------------------------ diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c index b88e012..19504c3 100644 --- a/src/backend/utils/time/snapmgr.c +++ b/src/backend/utils/time/snapmgr.c @@ -46,14 +46,18 @@ #include "access/transam.h" #include "access/xact.h" +#include "access/xlog.h" +#include "catalog/catalog.h" #include "lib/pairingheap.h" #include "miscadmin.h" #include "storage/predicate.h" #include "storage/proc.h" #include "storage/procarray.h" #include "storage/sinval.h" +#include "storage/spin.h" #include "utils/builtins.h" #include "utils/memutils.h" +#include "utils/rel.h" #include "utils/resowner_private.h" #include "utils/snapmgr.h" #include "utils/syscache.h" @@ -61,6 +65,64 @@ /* + * GUC parameters + */ +int old_snapshot_threshold; /* number of minutes, -1 disables */ + +/* + * Structure for dealing with old_snapshot_threshold implementation. + */ +typedef struct OldSnapshotControlData +{ + /* + * Variables for old snapshot handling are shared among processes and are + * only allowed to move forward. 
+ */ + slock_t mutex_current; /* protect current timestamp */ + int64 current_timestamp; /* latest snapshot timestamp */ + slock_t mutex_latest_xmin; /* protect latest snapshot xmin */ + TransactionId latest_xmin; /* latest snapshot xmin */ + slock_t mutex_threshold; /* protect threshold fields */ + int64 threshold_timestamp; /* earlier snapshot is old */ + TransactionId threshold_xid; /* earlier xid may be gone */ + + /* + * Keep one xid per minute for old snapshot error handling. + * + * Use a circular buffer with a head offset, a count of entries currently + * used, and a timestamp corresponding to the xid at the head offset. A + * count_used value of zero means that there are no times stored; a + * count_used value of old_snapshot_threshold means that the buffer is + * full and the head must be advanced to add new entries. Use timestamps + * aligned to minute boundaries, since that seems less surprising than + * aligning based on the first usage timestamp. + * + * It is OK if the xid for a given time slot is from earlier than + * calculated by adding the number of minutes corresponding to the + * (possibly wrapped) distance from the head offset to the time of the + * head entry, since that just results in the vacuuming of old tuples + * being slightly less aggressive. It would not be OK for it to be off in + * the other direction, since it might result in vacuuming tuples that are + * still expected to be there. + * + * Use of an SLRU was considered but not chosen because it is more + * heavyweight than is needed for this, and would probably not be any less + * code to implement. + * + * Persistence is not needed. 
+ */ + int head_offset; /* subscript of oldest tracked time */ + int64 head_timestamp; /* time corresponding to head xid */ + int count_used; /* how many slots are in use */ + TransactionId xid_by_minute[FLEXIBLE_ARRAY_MEMBER]; +} OldSnapshotControlData; + +typedef struct OldSnapshotControlData *OldSnapshotControl; + +static volatile OldSnapshotControl oldSnapshotControl; + + +/* * CurrentSnapshot points to the only snapshot taken in transaction-snapshot * mode, and to the latest one taken in a read-committed transaction. * SecondarySnapshot is a snapshot that's always up-to-date as of the current @@ -153,6 +215,7 @@ static Snapshot FirstXactSnapshot = NULL; static List *exportedSnapshots = NIL; /* Prototypes for local functions */ +static int64 AlignTimestampToMinuteBoundary(int64 ts); static Snapshot CopySnapshot(Snapshot snapshot); static void FreeSnapshot(Snapshot snapshot); static void SnapshotResetXmin(void); @@ -174,6 +237,49 @@ typedef struct SerializedSnapshotData CommandId curcid; } SerializedSnapshotData; +Size +SnapMgrShmemSize(void) +{ + Size size; + + size = offsetof(OldSnapshotControlData, xid_by_minute); + if (old_snapshot_threshold > 0) + size = add_size(size, mul_size(sizeof(TransactionId), + old_snapshot_threshold)); + + return size; +} + +/* + * Initialize for managing old snapshot detection. + */ +void +SnapMgrInit(void) +{ + bool found; + + /* + * Create or attach to the OldSnapshotControl structure. 
+ */ + oldSnapshotControl = (OldSnapshotControl) + ShmemInitStruct("OldSnapshotControlData", + SnapMgrShmemSize(), &found); + + if (!found) + { + SpinLockInit(&oldSnapshotControl->mutex_current); + oldSnapshotControl->current_timestamp = 0; + SpinLockInit(&oldSnapshotControl->mutex_latest_xmin); + oldSnapshotControl->latest_xmin = InvalidTransactionId; + SpinLockInit(&oldSnapshotControl->mutex_threshold); + oldSnapshotControl->threshold_timestamp = 0; + oldSnapshotControl->threshold_xid = InvalidTransactionId; + oldSnapshotControl->head_offset = 0; + oldSnapshotControl->head_timestamp = 0; + oldSnapshotControl->count_used = 0; + } +} + /* * GetTransactionSnapshot * Get the appropriate snapshot for a new query in a transaction. @@ -1405,6 +1511,304 @@ ThereAreNoPriorRegisteredSnapshots(void) return false; } + +/* + * Return an int64 timestamp which is exactly on a minute boundary. + * + * If the argument is already aligned, return that value, otherwise move to + * the next minute boundary following the given time. + */ +static int64 +AlignTimestampToMinuteBoundary(int64 ts) +{ + int64 retval = ts + (USECS_PER_MINUTE - 1); + + return retval - (retval % USECS_PER_MINUTE); +} + +/* + * Get current timestamp for snapshots as int64 that never moves backward. + */ +int64 +GetSnapshotCurrentTimestamp(void) +{ + int64 now = GetCurrentIntegerTimestamp(); + + /* + * Don't let time move backward; if it hasn't advanced, use the old value. + */ + SpinLockAcquire(&oldSnapshotControl->mutex_current); + if (now <= oldSnapshotControl->current_timestamp) + now = oldSnapshotControl->current_timestamp; + else + oldSnapshotControl->current_timestamp = now; + SpinLockRelease(&oldSnapshotControl->mutex_current); + + return now; +} + +/* + * Get timestamp through which vacuum may have processed based on last stored + * value for threshold_timestamp. 
+ * + * XXX: So far, we never trust that a 64-bit value can be read atomically; if + * that ever changes, we could get rid of the spinlock here. + */ +int64 +GetOldSnapshotThresholdTimestamp(void) +{ + int64 threshold_timestamp; + + SpinLockAcquire(&oldSnapshotControl->mutex_threshold); + threshold_timestamp = oldSnapshotControl->threshold_timestamp; + SpinLockRelease(&oldSnapshotControl->mutex_threshold); + + return threshold_timestamp; +} + +static void +SetOldSnapshotThresholdTimestamp(int64 ts, TransactionId xlimit) +{ + SpinLockAcquire(&oldSnapshotControl->mutex_threshold); + oldSnapshotControl->threshold_timestamp = ts; + oldSnapshotControl->threshold_xid = xlimit; + SpinLockRelease(&oldSnapshotControl->mutex_threshold); +} + +/* + * TransactionIdLimitedForOldSnapshots + * + * Apply old snapshot limit, if any. This is intended to be called for page + * pruning and table vacuuming, to allow old_snapshot_threshold to override + * the normal global xmin value. Actual testing for snapshot too old will be + * based on whether a snapshot timestamp is prior to the threshold timestamp + * set in this function. + */ +TransactionId +TransactionIdLimitedForOldSnapshots(TransactionId recentXmin, + Relation relation) +{ + if (TransactionIdIsNormal(recentXmin) + && old_snapshot_threshold >= 0 + && RelationNeedsWAL(relation) + && !IsCatalogRelation(relation) + && !RelationIsAccessibleInLogicalDecoding(relation)) + { + int64 ts = GetSnapshotCurrentTimestamp(); + TransactionId xlimit = recentXmin; + TransactionId latest_xmin = oldSnapshotControl->latest_xmin; + bool same_ts_as_threshold = false; + + /* + * Zero threshold always overrides to latest xmin, if valid. Without + * some heuristic it will find its own snapshot too old on, for + * example, a simple UPDATE -- which would make it useless for most + * testing, but there is no principled way to ensure that it doesn't + * fail in this way. 
Use a five-second delay to try to get useful + * testing behavior, but this may need adjustment. + */ + if (old_snapshot_threshold == 0) + { + if (TransactionIdPrecedes(latest_xmin, MyPgXact->xmin) + && TransactionIdFollows(latest_xmin, xlimit)) + xlimit = latest_xmin; + + ts -= 5 * USECS_PER_SEC; + SetOldSnapshotThresholdTimestamp(ts, xlimit); + + return xlimit; + } + + ts = AlignTimestampToMinuteBoundary(ts) + - (old_snapshot_threshold * USECS_PER_MINUTE); + + /* Check for fast exit without LW locking. */ + SpinLockAcquire(&oldSnapshotControl->mutex_threshold); + if (ts == oldSnapshotControl->threshold_timestamp) + { + xlimit = oldSnapshotControl->threshold_xid; + same_ts_as_threshold = true; + } + SpinLockRelease(&oldSnapshotControl->mutex_threshold); + + if (!same_ts_as_threshold) + { + LWLockAcquire(OldSnapshotTimeMapLock, LW_SHARED); + + if (oldSnapshotControl->count_used > 0 + && ts >= oldSnapshotControl->head_timestamp) + { + int offset; + + offset = ((ts - oldSnapshotControl->head_timestamp) + / USECS_PER_MINUTE); + if (offset > oldSnapshotControl->count_used - 1) + offset = oldSnapshotControl->count_used - 1; + offset = (oldSnapshotControl->head_offset + offset) + % old_snapshot_threshold; + xlimit = oldSnapshotControl->xid_by_minute[offset]; + + if (NormalTransactionIdFollows(xlimit, recentXmin)) + SetOldSnapshotThresholdTimestamp(ts, xlimit); + } + + LWLockRelease(OldSnapshotTimeMapLock); + } + + /* + * Failsafe protection against vacuuming work of active transaction. + * + * This is not an assertion because we avoid the spinlock for + * performance, leaving open the possibility that xlimit could advance + * and be more current; but it seems prudent to apply this limit. It + * might make pruning a tiny bit less aggressive than it could be, but + * protects against data loss bugs. 
+ */ + if (TransactionIdIsNormal(latest_xmin) + && TransactionIdPrecedes(latest_xmin, xlimit)) + xlimit = latest_xmin; + + if (NormalTransactionIdFollows(xlimit, recentXmin)) + return xlimit; + } + + return recentXmin; +} + +/* + * Take care of the circular buffer that maps time to xid. + */ +void +MaintainOldSnapshotTimeMapping(int64 whenTaken, TransactionId xmin) +{ + int64 ts; + + /* Fast exit when old_snapshot_threshold is not used. */ + if (old_snapshot_threshold < 0) + return; + + /* Keep track of the latest xmin seen by any process. */ + SpinLockAcquire(&oldSnapshotControl->mutex_latest_xmin); + if (TransactionIdFollows(xmin, oldSnapshotControl->latest_xmin)) + oldSnapshotControl->latest_xmin = xmin; + SpinLockRelease(&oldSnapshotControl->mutex_latest_xmin); + + /* No further tracking needed for 0 (used for testing). */ + if (old_snapshot_threshold == 0) + return; + + /* + * We don't want to do something stupid with unusual values, but we don't + * want to litter the log with warnings or break otherwise normal + * processing for this feature; so if something seems unreasonable, just + * log at DEBUG level and return without doing anything. 
+ */ + if (whenTaken < 0) + { + elog(DEBUG1, + "MaintainOldSnapshotTimeMapping called with negative whenTaken = %ld", + (long) whenTaken); + return; + } + if (!TransactionIdIsNormal(xmin)) + { + elog(DEBUG1, + "MaintainOldSnapshotTimeMapping called with xmin = %lu", + (unsigned long) xmin); + return; + } + + ts = AlignTimestampToMinuteBoundary(whenTaken); + + LWLockAcquire(OldSnapshotTimeMapLock, LW_EXCLUSIVE); + + Assert(oldSnapshotControl->head_offset >= 0); + Assert(oldSnapshotControl->head_offset < old_snapshot_threshold); + Assert((oldSnapshotControl->head_timestamp % USECS_PER_MINUTE) == 0); + Assert(oldSnapshotControl->count_used >= 0); + Assert(oldSnapshotControl->count_used <= old_snapshot_threshold); + + if (oldSnapshotControl->count_used == 0) + { + /* set up first entry for empty mapping */ + oldSnapshotControl->head_offset = 0; + oldSnapshotControl->head_timestamp = ts; + oldSnapshotControl->count_used = 1; + oldSnapshotControl->xid_by_minute[0] = xmin; + } + else if (ts < oldSnapshotControl->head_timestamp) + { + /* old ts; log it at DEBUG */ + LWLockRelease(OldSnapshotTimeMapLock); + elog(DEBUG1, + "MaintainOldSnapshotTimeMapping called with old whenTaken = %ld", + (long) whenTaken); + return; + } + else if (ts <= (oldSnapshotControl->head_timestamp + + ((oldSnapshotControl->count_used - 1) + * USECS_PER_MINUTE))) + { + /* existing mapping; advance xid if possible */ + int bucket = (oldSnapshotControl->head_offset + + ((ts - oldSnapshotControl->head_timestamp) + / USECS_PER_MINUTE)) + % old_snapshot_threshold; + + if (TransactionIdPrecedes(oldSnapshotControl->xid_by_minute[bucket], xmin)) + oldSnapshotControl->xid_by_minute[bucket] = xmin; + } + else + { + /* We need a new bucket, but it might not be the very next one. 
*/ + int advance = ((ts - oldSnapshotControl->head_timestamp) + / USECS_PER_MINUTE); + + oldSnapshotControl->head_timestamp = ts; + + if (advance >= old_snapshot_threshold) + { + /* Advance is so far that all old data is junk; start over. */ + oldSnapshotControl->head_offset = 0; + oldSnapshotControl->count_used = 1; + oldSnapshotControl->xid_by_minute[0] = xmin; + } + else + { + /* Store the new value in one or more buckets. */ + int i; + + for (i = 0; i < advance; i++) + { + if (oldSnapshotControl->count_used == old_snapshot_threshold) + { + /* Map full and new value replaces old head. */ + int old_head = oldSnapshotControl->head_offset; + + if (old_head == (old_snapshot_threshold - 1)) + oldSnapshotControl->head_offset = 0; + else + oldSnapshotControl->head_offset = old_head + 1; + oldSnapshotControl->xid_by_minute[old_head] = xmin; + } + else + { + /* Extend map to unused entry. */ + int new_tail = (oldSnapshotControl->head_offset + + oldSnapshotControl->count_used) + % old_snapshot_threshold; + + oldSnapshotControl->count_used++; + oldSnapshotControl->xid_by_minute[new_tail] = xmin; + } + } + } + } + + LWLockRelease(OldSnapshotTimeMapLock); +} + + /* * Setup a snapshot that replaces normal catalog snapshots that allows catalog * access to behave just like it did at a certain point in the past. 
diff --git a/src/include/access/brin_revmap.h b/src/include/access/brin_revmap.h index 19528bf..89054e0 100644 --- a/src/include/access/brin_revmap.h +++ b/src/include/access/brin_revmap.h @@ -18,12 +18,13 @@ #include "storage/itemptr.h" #include "storage/off.h" #include "utils/relcache.h" +#include "utils/snapshot.h" /* struct definition lives in brin_revmap.c */ typedef struct BrinRevmap BrinRevmap; extern BrinRevmap *brinRevmapInitialize(Relation idxrel, - BlockNumber *pagesPerRange); + BlockNumber *pagesPerRange, Snapshot snapshot); extern void brinRevmapTerminate(BrinRevmap *revmap); extern void brinRevmapExtend(BrinRevmap *revmap, @@ -34,6 +35,6 @@ extern void brinSetHeapBlockItemptr(Buffer rmbuf, BlockNumber pagesPerRange, BlockNumber heapBlk, ItemPointerData tid); extern BrinTuple *brinGetTupleForHeapBlock(BrinRevmap *revmap, BlockNumber heapBlk, Buffer *buf, OffsetNumber *off, - Size *size, int mode); + Size *size, int mode, Snapshot snapshot); #endif /* BRIN_REVMAP_H */ diff --git a/src/include/access/gin_private.h b/src/include/access/gin_private.h index d2ea588..66ce9ac 100644 --- a/src/include/access/gin_private.h +++ b/src/include/access/gin_private.h @@ -703,7 +703,7 @@ typedef struct * PostingItem */ -extern GinBtreeStack *ginFindLeafPage(GinBtree btree, bool searchMode); +extern GinBtreeStack *ginFindLeafPage(GinBtree btree, bool searchMode, Snapshot snapshot); extern Buffer ginStepRight(Buffer buffer, Relation index, int lockmode); extern void freeGinBtreeStack(GinBtreeStack *stack); extern void ginInsertValue(GinBtree btree, GinBtreeStack *stack, @@ -731,7 +731,7 @@ extern void GinPageDeletePostingItem(Page page, OffsetNumber offset); extern void ginInsertItemPointers(Relation index, BlockNumber rootBlkno, ItemPointerData *items, uint32 nitem, GinStatsData *buildStats); -extern GinBtreeStack *ginScanBeginPostingTree(GinBtree btree, Relation index, BlockNumber rootBlkno); +extern GinBtreeStack *ginScanBeginPostingTree(GinBtree btree, Relation 
index, BlockNumber rootBlkno, Snapshot snapshot); extern void ginDataFillRoot(GinBtree btree, Page root, BlockNumber lblkno, Page lpage, BlockNumber rblkno, Page rpage); extern void ginPrepareDataScan(GinBtree btree, Relation index, BlockNumber rootBlkno); diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 9046b16..ca50349 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -710,17 +710,18 @@ extern int _bt_pagedel(Relation rel, Buffer buf); */ extern BTStack _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey, - Buffer *bufP, int access); + Buffer *bufP, int access, Snapshot snapshot); extern Buffer _bt_moveright(Relation rel, Buffer buf, int keysz, ScanKey scankey, bool nextkey, bool forupdate, BTStack stack, - int access); + int access, Snapshot snapshot); extern OffsetNumber _bt_binsrch(Relation rel, Buffer buf, int keysz, ScanKey scankey, bool nextkey); extern int32 _bt_compare(Relation rel, int keysz, ScanKey scankey, Page page, OffsetNumber offnum); extern bool _bt_first(IndexScanDesc scan, ScanDirection dir); extern bool _bt_next(IndexScanDesc scan, ScanDirection dir); -extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost); +extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost, + Snapshot snapshot); /* * prototypes for functions in nbtutils.c diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index 7d57c04..a9a876a 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -14,11 +14,14 @@ #ifndef BUFMGR_H #define BUFMGR_H +#include "catalog/catalog.h" #include "storage/block.h" #include "storage/buf.h" #include "storage/bufpage.h" #include "storage/relfilenode.h" #include "utils/relcache.h" +#include "utils/snapmgr.h" +#include "utils/tqual.h" typedef void *Block; @@ -168,6 +171,37 @@ extern PGDLLIMPORT int32 *LocalRefCount; #define BufferGetPage(buffer) ((Page)BufferGetBlock(buffer)) /* + * Check whether 
the given snapshot is too old to have safely read the given + * page from the given table. If so, throw a "snapshot too old" error. + * + * This test generally needs to be performed after every BufferGetPage() call + * that is executed as part of a scan. It is not needed for calls made for + * modifying the page (for example, to position to the right place to insert a + * new index tuple or for vacuuming). + * + * Note that a NULL snapshot argument is allowed and causes a fast return + * without error; this is to support call sites which can be called from + * either scans or index modification areas. + * + * This is a macro for speed; keep the tests that are fastest and/or most + * likely to exclude a page from old snapshot testing near the front. + */ +#define TestForOldSnapshot(snapshot, relation, page) \ + do { \ + if (old_snapshot_threshold >= 0 \ + && (snapshot) != NULL \ + && (snapshot)->satisfies == HeapTupleSatisfiesMVCC \ + && !XLogRecPtrIsInvalid((snapshot)->lsn) \ + && PageGetLSN(page) > (snapshot)->lsn \ + && !IsCatalogRelation(relation) \ + && !RelationIsAccessibleInLogicalDecoding(relation) \ + && (snapshot)->whenTaken < GetOldSnapshotThresholdTimestamp()) \ + ereport(ERROR, \ + (errcode(ERRCODE_SNAPSHOT_TOO_OLD), \ + errmsg("snapshot too old"))); \ + } while (0) + +/* * prototypes for functions in bufmgr.c */ extern bool ComputeIoConcurrency(int io_concurrency, double *target); diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h index f2bebf2..d417031 100644 --- a/src/include/utils/rel.h +++ b/src/include/utils/rel.h @@ -15,6 +15,7 @@ #define REL_H #include "access/tupdesc.h" +#include "access/xlog.h" #include "catalog/pg_class.h" #include "catalog/pg_index.h" #include "fmgr.h" diff --git a/src/include/utils/snapmgr.h b/src/include/utils/snapmgr.h index a9e9066..371042a 100644 --- a/src/include/utils/snapmgr.h +++ b/src/include/utils/snapmgr.h @@ -14,10 +14,20 @@ #define SNAPMGR_H #include "fmgr.h" +#include "utils/relcache.h" #include 
"utils/resowner.h" #include "utils/snapshot.h" +/* GUC variables */ +extern int old_snapshot_threshold; + + +extern Size SnapMgrShmemSize(void); +extern void SnapMgrInit(void); +extern int64 GetSnapshotCurrentTimestamp(void); +extern int64 GetOldSnapshotThresholdTimestamp(void); + extern bool FirstSnapshotSet; extern TransactionId TransactionXmin; @@ -54,6 +64,9 @@ extern void ImportSnapshot(const char *idstr); extern bool XactHasExportedSnapshots(void); extern void DeleteAllExportedSnapshotFiles(void); extern bool ThereAreNoPriorRegisteredSnapshots(void); +extern TransactionId TransactionIdLimitedForOldSnapshots(TransactionId recentXmin, + Relation relation); +extern void MaintainOldSnapshotTimeMapping(int64 whenTaken, TransactionId xmin); extern char *ExportSnapshot(Snapshot snapshot); diff --git a/src/include/utils/snapshot.h b/src/include/utils/snapshot.h index 2a56363..998e2e5 100644 --- a/src/include/utils/snapshot.h +++ b/src/include/utils/snapshot.h @@ -14,6 +14,7 @@ #define SNAPSHOT_H #include "access/htup.h" +#include "access/xlogdefs.h" #include "lib/pairingheap.h" #include "storage/buf.h" @@ -105,6 +106,9 @@ typedef struct SnapshotData uint32 active_count; /* refcount on ActiveSnapshot stack */ uint32 regd_count; /* refcount on RegisteredSnapshots */ pairingheap_node ph_node; /* link in the RegisteredSnapshots heap */ + + int64 whenTaken; /* timestamp when snapshot was taken */ + XLogRecPtr lsn; /* position in the WAL stream when taken */ } SnapshotData; /* diff --git a/src/test/modules/Makefile b/src/test/modules/Makefile index 6167ec1..9b93552 100644 --- a/src/test/modules/Makefile +++ b/src/test/modules/Makefile @@ -8,6 +8,7 @@ SUBDIRS = \ brin \ commit_ts \ dummy_seclabel \ + snapshot_too_old \ test_ddl_deparse \ test_extensions \ test_parser \ diff --git a/src/test/modules/snapshot_too_old/Makefile b/src/test/modules/snapshot_too_old/Makefile new file mode 100644 index 0000000..7b9feca --- /dev/null +++ b/src/test/modules/snapshot_too_old/Makefile 
@@ -0,0 +1,47 @@ +# src/test/modules/snapshot_too_old/Makefile + +EXTRA_CLEAN = ./isolation_output + +ISOLATIONCHECKS=sto_using_cursor sto_using_select + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = src/test/modules/snapshot_too_old +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif + +# Disabled because these tests require "old_snapshot_threshold" >= 0, which +# typical installcheck users do not have (e.g. buildfarm clients). +installcheck:; + +# But it can nonetheless be very helpful to run tests on a preexisting +# installation, so allow doing so, but only if requested explicitly. +installcheck-force: isolationcheck-install-force + +check: isolationcheck + +submake-isolation: + $(MAKE) -C $(top_builddir)/src/test/isolation all + +submake-snapshot_too_old: + $(MAKE) -C $(top_builddir)/src/test/modules/snapshot_too_old + +isolationcheck: | submake-isolation temp-install + $(MKDIR_P) isolation_output + $(pg_isolation_regress_check) \ --temp-config $(top_srcdir)/src/test/modules/snapshot_too_old/sto.conf \ --outputdir=./isolation_output \ $(ISOLATIONCHECKS) + +isolationcheck-install-force: all | submake-isolation temp-install + $(pg_isolation_regress_installcheck) \ $(ISOLATIONCHECKS) + +.PHONY: check isolationcheck isolationcheck-install-force + +temp-install: EXTRA_INSTALL=src/test/modules/snapshot_too_old diff --git a/src/test/modules/snapshot_too_old/expected/sto_using_cursor.out b/src/test/modules/snapshot_too_old/expected/sto_using_cursor.out new file mode 100644 index 0000000..8cc29ec --- /dev/null +++ b/src/test/modules/snapshot_too_old/expected/sto_using_cursor.out @@ -0,0 +1,73 @@ +Parsed test spec with 2 sessions + +starting permutation: s1decl s1f1 s1sleep s1f2 s2u +step s1decl: DECLARE cursor1 CURSOR FOR SELECT c FROM sto1; +step s1f1: FETCH FIRST FROM cursor1; +c + +1 +step s1sleep: SELECT setting, pg_sleep(6)
FROM pg_settings WHERE name = 'old_snapshot_threshold'; +setting pg_sleep + +0 +step s1f2: FETCH FIRST FROM cursor1; +c + +1 +step s2u: UPDATE sto1 SET c = 1001 WHERE c = 1; + +starting permutation: s1decl s1f1 s1sleep s2u s1f2 +step s1decl: DECLARE cursor1 CURSOR FOR SELECT c FROM sto1; +step s1f1: FETCH FIRST FROM cursor1; +c + +1 +step s1sleep: SELECT setting, pg_sleep(6) FROM pg_settings WHERE name = 'old_snapshot_threshold'; +setting pg_sleep + +0 +step s2u: UPDATE sto1 SET c = 1001 WHERE c = 1; +step s1f2: FETCH FIRST FROM cursor1; +ERROR: snapshot too old + +starting permutation: s1decl s1f1 s2u s1sleep s1f2 +step s1decl: DECLARE cursor1 CURSOR FOR SELECT c FROM sto1; +step s1f1: FETCH FIRST FROM cursor1; +c + +1 +step s2u: UPDATE sto1 SET c = 1001 WHERE c = 1; +step s1sleep: SELECT setting, pg_sleep(6) FROM pg_settings WHERE name = 'old_snapshot_threshold'; +setting pg_sleep + +0 +step s1f2: FETCH FIRST FROM cursor1; +ERROR: snapshot too old + +starting permutation: s1decl s2u s1f1 s1sleep s1f2 +step s1decl: DECLARE cursor1 CURSOR FOR SELECT c FROM sto1; +step s2u: UPDATE sto1 SET c = 1001 WHERE c = 1; +step s1f1: FETCH FIRST FROM cursor1; +c + +1 +step s1sleep: SELECT setting, pg_sleep(6) FROM pg_settings WHERE name = 'old_snapshot_threshold'; +setting pg_sleep + +0 +step s1f2: FETCH FIRST FROM cursor1; +ERROR: snapshot too old + +starting permutation: s2u s1decl s1f1 s1sleep s1f2 +step s2u: UPDATE sto1 SET c = 1001 WHERE c = 1; +step s1decl: DECLARE cursor1 CURSOR FOR SELECT c FROM sto1; +step s1f1: FETCH FIRST FROM cursor1; +c + +2 +step s1sleep: SELECT setting, pg_sleep(6) FROM pg_settings WHERE name = 'old_snapshot_threshold'; +setting pg_sleep + +0 +step s1f2: FETCH FIRST FROM cursor1; +ERROR: snapshot too old diff --git a/src/test/modules/snapshot_too_old/expected/sto_using_select.out b/src/test/modules/snapshot_too_old/expected/sto_using_select.out new file mode 100644 index 0000000..eb15bc2 --- /dev/null +++ 
b/src/test/modules/snapshot_too_old/expected/sto_using_select.out @@ -0,0 +1,55 @@ +Parsed test spec with 2 sessions + +starting permutation: s1f1 s1sleep s1f2 s2u +step s1f1: SELECT c FROM sto1 ORDER BY c LIMIT 1; +c + +1 +step s1sleep: SELECT setting, pg_sleep(6) FROM pg_settings WHERE name = 'old_snapshot_threshold'; +setting pg_sleep + +0 +step s1f2: SELECT c FROM sto1 ORDER BY c LIMIT 1; +c + +1 +step s2u: UPDATE sto1 SET c = 1001 WHERE c = 1; + +starting permutation: s1f1 s1sleep s2u s1f2 +step s1f1: SELECT c FROM sto1 ORDER BY c LIMIT 1; +c + +1 +step s1sleep: SELECT setting, pg_sleep(6) FROM pg_settings WHERE name = 'old_snapshot_threshold'; +setting pg_sleep + +0 +step s2u: UPDATE sto1 SET c = 1001 WHERE c = 1; +step s1f2: SELECT c FROM sto1 ORDER BY c LIMIT 1; +ERROR: snapshot too old + +starting permutation: s1f1 s2u s1sleep s1f2 +step s1f1: SELECT c FROM sto1 ORDER BY c LIMIT 1; +c + +1 +step s2u: UPDATE sto1 SET c = 1001 WHERE c = 1; +step s1sleep: SELECT setting, pg_sleep(6) FROM pg_settings WHERE name = 'old_snapshot_threshold'; +setting pg_sleep + +0 +step s1f2: SELECT c FROM sto1 ORDER BY c LIMIT 1; +ERROR: snapshot too old + +starting permutation: s2u s1f1 s1sleep s1f2 +step s2u: UPDATE sto1 SET c = 1001 WHERE c = 1; +step s1f1: SELECT c FROM sto1 ORDER BY c LIMIT 1; +c + +2 +step s1sleep: SELECT setting, pg_sleep(6) FROM pg_settings WHERE name = 'old_snapshot_threshold'; +setting pg_sleep + +0 +step s1f2: SELECT c FROM sto1 ORDER BY c LIMIT 1; +ERROR: snapshot too old diff --git a/src/test/modules/snapshot_too_old/results/sto_using_cursor.out b/src/test/modules/snapshot_too_old/results/sto_using_cursor.out new file mode 100644 index 0000000..8cc29ec --- /dev/null +++ b/src/test/modules/snapshot_too_old/results/sto_using_cursor.out @@ -0,0 +1,73 @@ +Parsed test spec with 2 sessions + +starting permutation: s1decl s1f1 s1sleep s1f2 s2u +step s1decl: DECLARE cursor1 CURSOR FOR SELECT c FROM sto1; +step s1f1: FETCH FIRST FROM cursor1; +c + +1 +step 
s1sleep: SELECT setting, pg_sleep(6) FROM pg_settings WHERE name = 'old_snapshot_threshold'; +setting pg_sleep + +0 +step s1f2: FETCH FIRST FROM cursor1; +c + +1 +step s2u: UPDATE sto1 SET c = 1001 WHERE c = 1; + +starting permutation: s1decl s1f1 s1sleep s2u s1f2 +step s1decl: DECLARE cursor1 CURSOR FOR SELECT c FROM sto1; +step s1f1: FETCH FIRST FROM cursor1; +c + +1 +step s1sleep: SELECT setting, pg_sleep(6) FROM pg_settings WHERE name = 'old_snapshot_threshold'; +setting pg_sleep + +0 +step s2u: UPDATE sto1 SET c = 1001 WHERE c = 1; +step s1f2: FETCH FIRST FROM cursor1; +ERROR: snapshot too old + +starting permutation: s1decl s1f1 s2u s1sleep s1f2 +step s1decl: DECLARE cursor1 CURSOR FOR SELECT c FROM sto1; +step s1f1: FETCH FIRST FROM cursor1; +c + +1 +step s2u: UPDATE sto1 SET c = 1001 WHERE c = 1; +step s1sleep: SELECT setting, pg_sleep(6) FROM pg_settings WHERE name = 'old_snapshot_threshold'; +setting pg_sleep + +0 +step s1f2: FETCH FIRST FROM cursor1; +ERROR: snapshot too old + +starting permutation: s1decl s2u s1f1 s1sleep s1f2 +step s1decl: DECLARE cursor1 CURSOR FOR SELECT c FROM sto1; +step s2u: UPDATE sto1 SET c = 1001 WHERE c = 1; +step s1f1: FETCH FIRST FROM cursor1; +c + +1 +step s1sleep: SELECT setting, pg_sleep(6) FROM pg_settings WHERE name = 'old_snapshot_threshold'; +setting pg_sleep + +0 +step s1f2: FETCH FIRST FROM cursor1; +ERROR: snapshot too old + +starting permutation: s2u s1decl s1f1 s1sleep s1f2 +step s2u: UPDATE sto1 SET c = 1001 WHERE c = 1; +step s1decl: DECLARE cursor1 CURSOR FOR SELECT c FROM sto1; +step s1f1: FETCH FIRST FROM cursor1; +c + +2 +step s1sleep: SELECT setting, pg_sleep(6) FROM pg_settings WHERE name = 'old_snapshot_threshold'; +setting pg_sleep + +0 +step s1f2: FETCH FIRST FROM cursor1; +ERROR: snapshot too old diff --git a/src/test/modules/snapshot_too_old/results/sto_using_select.out b/src/test/modules/snapshot_too_old/results/sto_using_select.out new file mode 100644 index 0000000..eb15bc2 --- /dev/null +++ 
b/src/test/modules/snapshot_too_old/results/sto_using_select.out @@ -0,0 +1,55 @@ +Parsed test spec with 2 sessions + +starting permutation: s1f1 s1sleep s1f2 s2u +step s1f1: SELECT c FROM sto1 ORDER BY c LIMIT 1; +c + +1 +step s1sleep: SELECT setting, pg_sleep(6) FROM pg_settings WHERE name = 'old_snapshot_threshold'; +setting pg_sleep + +0 +step s1f2: SELECT c FROM sto1 ORDER BY c LIMIT 1; +c + +1 +step s2u: UPDATE sto1 SET c = 1001 WHERE c = 1; + +starting permutation: s1f1 s1sleep s2u s1f2 +step s1f1: SELECT c FROM sto1 ORDER BY c LIMIT 1; +c + +1 +step s1sleep: SELECT setting, pg_sleep(6) FROM pg_settings WHERE name = 'old_snapshot_threshold'; +setting pg_sleep + +0 +step s2u: UPDATE sto1 SET c = 1001 WHERE c = 1; +step s1f2: SELECT c FROM sto1 ORDER BY c LIMIT 1; +ERROR: snapshot too old + +starting permutation: s1f1 s2u s1sleep s1f2 +step s1f1: SELECT c FROM sto1 ORDER BY c LIMIT 1; +c + +1 +step s2u: UPDATE sto1 SET c = 1001 WHERE c = 1; +step s1sleep: SELECT setting, pg_sleep(6) FROM pg_settings WHERE name = 'old_snapshot_threshold'; +setting pg_sleep + +0 +step s1f2: SELECT c FROM sto1 ORDER BY c LIMIT 1; +ERROR: snapshot too old + +starting permutation: s2u s1f1 s1sleep s1f2 +step s2u: UPDATE sto1 SET c = 1001 WHERE c = 1; +step s1f1: SELECT c FROM sto1 ORDER BY c LIMIT 1; +c + +2 +step s1sleep: SELECT setting, pg_sleep(6) FROM pg_settings WHERE name = 'old_snapshot_threshold'; +setting pg_sleep + +0 +step s1f2: SELECT c FROM sto1 ORDER BY c LIMIT 1; +ERROR: snapshot too old diff --git a/src/test/modules/snapshot_too_old/specs/sto_using_cursor.spec b/src/test/modules/snapshot_too_old/specs/sto_using_cursor.spec new file mode 100644 index 0000000..eac18ca --- /dev/null +++ b/src/test/modules/snapshot_too_old/specs/sto_using_cursor.spec @@ -0,0 +1,37 @@ +# This test provokes a "snapshot too old" error using a cursor. +# +# The sleep is needed because with a threshold of zero a statement could error +# on changes it made. 
With more normal settings no external delay is needed, +# but we don't want these tests to run long enough to see that, since +# granularity is in minutes. +# +# Since results depend on the value of old_snapshot_threshold, sneak that into +# the line generated by the sleep, so that a surprising value isn't so hard +# to identify. + +setup +{ + CREATE TABLE sto1 (c int NOT NULL); + INSERT INTO sto1 SELECT generate_series(1, 1000); + CREATE TABLE sto2 (c int NOT NULL); +} +setup +{ + VACUUM ANALYZE sto1; +} + +teardown +{ + DROP TABLE sto1, sto2; +} + +session "s1" +setup { BEGIN ISOLATION LEVEL REPEATABLE READ; } +step "s1decl" { DECLARE cursor1 CURSOR FOR SELECT c FROM sto1; } +step "s1f1" { FETCH FIRST FROM cursor1; } +step "s1sleep" { SELECT setting, pg_sleep(6) FROM pg_settings WHERE name = 'old_snapshot_threshold'; } +step "s1f2" { FETCH FIRST FROM cursor1; } +teardown { COMMIT; } + +session "s2" +step "s2u" { UPDATE sto1 SET c = 1001 WHERE c = 1; } diff --git a/src/test/modules/snapshot_too_old/specs/sto_using_select.spec b/src/test/modules/snapshot_too_old/specs/sto_using_select.spec new file mode 100644 index 0000000..d7c34f3 --- /dev/null +++ b/src/test/modules/snapshot_too_old/specs/sto_using_select.spec @@ -0,0 +1,36 @@ +# This test provokes a "snapshot too old" error using SELECT statements. +# +# The sleep is needed because with a threshold of zero a statement could error +# on changes it made. With more normal settings no external delay is needed, +# but we don't want these tests to run long enough to see that, since +# granularity is in minutes. +# +# Since results depend on the value of old_snapshot_threshold, sneak that into +# the line generated by the sleep, so that a surprising value isn't so hard +# to identify.
+ +setup +{ + CREATE TABLE sto1 (c int NOT NULL); + INSERT INTO sto1 SELECT generate_series(1, 1000); + CREATE TABLE sto2 (c int NOT NULL); +} +setup +{ + VACUUM ANALYZE sto1; +} + +teardown +{ + DROP TABLE sto1, sto2; +} + +session "s1" +setup { BEGIN ISOLATION LEVEL REPEATABLE READ; } +step "s1f1" { SELECT c FROM sto1 ORDER BY c LIMIT 1; } +step "s1sleep" { SELECT setting, pg_sleep(6) FROM pg_settings WHERE name = 'old_snapshot_threshold'; } +step "s1f2" { SELECT c FROM sto1 ORDER BY c LIMIT 1; } +teardown { COMMIT; } + +session "s2" +step "s2u" { UPDATE sto1 SET c = 1001 WHERE c = 1; } diff --git a/src/test/modules/snapshot_too_old/sto.conf b/src/test/modules/snapshot_too_old/sto.conf new file mode 100644 index 0000000..ce8048f --- /dev/null +++ b/src/test/modules/snapshot_too_old/sto.conf @@ -0,0 +1,3 @@ +autovacuum = off +old_snapshot_threshold = 0 +