diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c
index 5696abe..f36dd29 100644
--- a/src/backend/access/heap/visibilitymap.c
+++ b/src/backend/access/heap/visibilitymap.c
@@ -293,6 +293,10 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, XLogRecPtr recptr,
  * relation. On return, *buf is a valid buffer with the map page containing
  * the bit for heapBlk, or InvalidBuffer. The caller is responsible for
  * releasing *buf after it's done testing and setting bits.
+ *
+ * NOTE: This function does not lock the visibility map page, so it's
+ * possible we might get a slightly stale answer due to memory-ordering
+ * effects.  It is the caller's responsibility to make sure this is safe!
  */
 bool
 visibilitymap_test(Relation rel, BlockNumber heapBlk, Buffer *buf)
@@ -327,7 +331,9 @@ visibilitymap_test(Relation rel, BlockNumber heapBlk, Buffer *buf)
 	map = PageGetContents(BufferGetPage(*buf));
 
 	/*
-	 * We don't need to lock the page, as we're only looking at a single bit.
+	 * A single-bit read is atomic.  There could be memory-ordering effects
+	 * here, but for performance reasons we make it the caller's job to worry
+	 * about that.
 	 */
 	result = (map[mapByte] & (1 << mapBit)) ? true : false;
diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c
index 0e0193d..4188441 100644
--- a/src/backend/commands/vacuumlazy.c
+++ b/src/backend/commands/vacuumlazy.c
@@ -419,6 +419,14 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
 	 * Note: if scan_all is true, we won't actually skip any pages; but we
 	 * maintain next_not_all_visible_block anyway, so as to set up the
 	 * all_visible_according_to_vm flag correctly for each page.
+	 *
+	 * Note: Due to memory-ordering effects, the value returned by
+	 * visibilitymap_test could be slightly stale.  This is OK.  If we see
+	 * the page as all-visible when the flag's just been cleared, we might
+	 * fail to vacuum the page.  But it's OK to skip pages when scan_all is
+	 * not set, so no great harm done; the next scan_all vacuum will find
+	 * them.  If we make the reverse mistake and vacuum a page unnecessarily,
+	 * it'll just be a no-op.
 	 */
 	for (next_not_all_visible_block = 0;
 		 next_not_all_visible_block < nblocks;
@@ -852,22 +860,23 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
 		freespace = PageGetHeapFreeSpace(page);
 
 		/* mark page all-visible, if appropriate */
-		if (all_visible && !all_visible_according_to_vm)
+		if (all_visible && !PageIsAllVisible(page))
 		{
-			if (!PageIsAllVisible(page))
-			{
-				PageSetAllVisible(page);
-				MarkBufferDirty(buf);
-			}
+			PageSetAllVisible(page);
+			MarkBufferDirty(buf);
 			visibilitymap_set(onerel, blkno, InvalidXLogRecPtr, vmbuffer,
 							  visibility_cutoff_xid);
 		}
 
 		/*
 		 * As of PostgreSQL 9.2, the visibility map bit should never be set if
-		 * the page-level bit is clear.
+		 * the page-level bit is clear.  However, it's possible that the bit
+		 * got cleared after we checked it and before we took the buffer
+		 * content lock, so we must recheck before jumping to the conclusion
+		 * that something bad has happened.
 		 */
-		else if (all_visible_according_to_vm && !PageIsAllVisible(page))
+		else if (all_visible_according_to_vm && !PageIsAllVisible(page)
+				 && visibilitymap_test(onerel, blkno, &vmbuffer))
 		{
 			elog(WARNING, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u",
 				 relname, blkno);
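The recheck added in the vacuumlazy.c hunk above follows a general pattern: an unlocked read of a summary bit can be stale, so before treating a mismatch with the locked state as corruption, re-read the bit once the lock is held. Below is a minimal, self-contained C sketch of that pattern; it is an illustration only, not PostgreSQL code, and every identifier in it (vm_bit, page_bit, page_lock, mismatch_is_real) is a hypothetical stand-in: vm_bit plays the visibility map bit that is read lock-free, page_bit the page-level all-visible flag changed under the lock.

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    static pthread_mutex_t page_lock = PTHREAD_MUTEX_INITIALIZER;
    static bool page_bit = true;        /* authoritative; protected by page_lock */
    static atomic_bool vm_bit = true;   /* summary copy; read without the lock   */

    /* Return true only if the apparent inconsistency survives a recheck
     * taken while holding the lock. */
    static bool
    mismatch_is_real(void)
    {
        bool vm_says_visible;
        bool mismatch;

        /* Step 1: unlocked sample, like the visibilitymap_test call made
         * while scanning.  This value may already be stale. */
        vm_says_visible = atomic_load_explicit(&vm_bit, memory_order_relaxed);

        pthread_mutex_lock(&page_lock);
        mismatch = vm_says_visible && !page_bit;
        if (mismatch)
        {
            /* Step 2: the bit may have been cleared between the sample and
             * lock acquisition; recheck before concluding anything is wrong. */
            mismatch = atomic_load_explicit(&vm_bit, memory_order_relaxed);
        }
        pthread_mutex_unlock(&page_lock);
        return mismatch;
    }

    int
    main(void)
    {
        printf("mismatch: %s\n", mismatch_is_real() ? "real" : "stale or none");
        return 0;
    }

The second read costs nothing on the common path: it happens only when the unlocked sample and the locked state disagree, which is exactly why the hot path can stay lock-free.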
diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c
index 4abd805..2c11e6a 100644
--- a/src/backend/executor/nodeIndexonlyscan.c
+++ b/src/backend/executor/nodeIndexonlyscan.c
@@ -82,6 +82,19 @@ IndexOnlyNext(IndexOnlyScanState *node)
 		 * We can skip the heap fetch if the TID references a heap page on
 		 * which all tuples are known visible to everybody.  In any case,
 		 * we'll use the index tuple not the heap tuple as the data source.
+		 *
+		 * Note on Memory Ordering Effects: visibilitymap_test does not lock
+		 * the visibility map buffer, and therefore the result we read here
+		 * could be slightly stale.  However, it can't be stale enough to
+		 * matter.  It suffices to show that (1) there is a read barrier
+		 * between the time we read the index TID and the time we test the
+		 * visibility map; and (2) there is a write barrier between the time
+		 * some other concurrent process clears the visibility map bit and
+		 * the time it inserts the index TID.  Since acquiring or releasing a
+		 * LWLock interposes a full barrier, this is easy to show: (1) is
+		 * satisfied by the release of the index buffer content lock after
+		 * reading the TID; and (2) is satisfied by the acquisition of the
+		 * buffer content lock in order to insert the TID.
 		 */
 		if (!visibilitymap_test(scandesc->heapRelation,
 								ItemPointerGetBlockNumber(tid),
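The two-barrier argument in the comment above can be modeled directly with C11 atomics. The sketch below is a toy model, not PostgreSQL code: release/acquire ordering stands in for the full barriers that LWLock acquire and release provide, and all names (insert_tuple, may_skip_heap_fetch, all_visible, published_tid) are hypothetical. The property it demonstrates is the one the comment relies on: a reader that observes a newly inserted TID is guaranteed to also observe the cleared all-visible bit.

    #include <stdatomic.h>
    #include <stdbool.h>

    static atomic_bool all_visible = true;
    static atomic_int  published_tid = 0;   /* 0 means "no TID published" */

    /* Writer: stands in for a backend that clears the visibility map bit
     * and then inserts an index TID.  The release store orders the flag
     * clear before the TID becomes visible to other threads. */
    void
    insert_tuple(int tid)
    {
        atomic_store_explicit(&all_visible, false, memory_order_relaxed);
        atomic_store_explicit(&published_tid, tid, memory_order_release);
    }

    /* Reader: stands in for an index-only scan that reads a TID and then
     * tests the bit.  If the acquire load observes the TID, it
     * synchronizes with the writer's release store, so the subsequent
     * flag read must see the cleared value: we never skip a heap fetch
     * we actually need. */
    bool
    may_skip_heap_fetch(void)
    {
        int tid = atomic_load_explicit(&published_tid, memory_order_acquire);

        if (tid == 0)
            return false;   /* nothing published; nothing to decide */
        return atomic_load_explicit(&all_visible, memory_order_relaxed);
    }

    int
    main(void)
    {
        insert_tuple(42);
        /* Having observed TID 42, we must also observe all_visible == false. */
        return may_skip_heap_fetch() ? 1 : 0;
    }

If the reader instead loaded published_tid with memory_order_relaxed, the guarantee would vanish: the flag read could be reordered before the TID read, which is exactly the staleness hazard the comment rules out.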