Re: Visibility map, partial vacuums - Mailing list pgsql-hackers
From: Heikki Linnakangas
Subject: Re: Visibility map, partial vacuums
Msg-id: 492F051C.40403@enterprisedb.com
In response to: Re: Visibility map, partial vacuums (Heikki Linnakangas <heikki.linnakangas@enterprisedb.com>)
List: pgsql-hackers
Heikki Linnakangas wrote: > Here's an updated version, ... And here it is, for real... -- Heikki Linnakangas EnterpriseDB http://www.enterprisedb.com *** src/backend/access/heap/Makefile --- src/backend/access/heap/Makefile *************** *** 12,17 **** subdir = src/backend/access/heap top_builddir = ../../../.. include $(top_builddir)/src/Makefile.global ! OBJS = heapam.o hio.o pruneheap.o rewriteheap.o syncscan.o tuptoaster.o include $(top_srcdir)/src/backend/common.mk --- 12,17 ---- top_builddir = ../../../.. include $(top_builddir)/src/Makefile.global ! OBJS = heapam.o hio.o pruneheap.o rewriteheap.o syncscan.o tuptoaster.o visibilitymap.o include $(top_srcdir)/src/backend/common.mk *** src/backend/access/heap/heapam.c --- src/backend/access/heap/heapam.c *************** *** 47,52 **** --- 47,53 ---- #include "access/transam.h" #include "access/tuptoaster.h" #include "access/valid.h" + #include "access/visibilitymap.h" #include "access/xact.h" #include "access/xlogutils.h" #include "catalog/catalog.h" *************** *** 195,200 **** heapgetpage(HeapScanDesc scan, BlockNumber page) --- 196,202 ---- int ntup; OffsetNumber lineoff; ItemId lpp; + bool all_visible; Assert(page < scan->rs_nblocks); *************** *** 233,252 **** heapgetpage(HeapScanDesc scan, BlockNumber page) lines = PageGetMaxOffsetNumber(dp); ntup = 0; for (lineoff = FirstOffsetNumber, lpp = PageGetItemId(dp, lineoff); lineoff <= lines; lineoff++, lpp++) { if (ItemIdIsNormal(lpp)) { - HeapTupleData loctup; bool valid; ! loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp); ! loctup.t_len = ItemIdGetLength(lpp); ! ItemPointerSet(&(loctup.t_self), page, lineoff); ! valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer); if (valid) scan->rs_vistuples[ntup++] = lineoff; } --- 235,266 ---- lines = PageGetMaxOffsetNumber(dp); ntup = 0; + /* + * If the all-visible flag indicates that all tuples on the page are + * visible to everyone, we can skip the per-tuple visibility tests. + */ + all_visible = PageIsAllVisible(dp); + for (lineoff = FirstOffsetNumber, lpp = PageGetItemId(dp, lineoff); lineoff <= lines; lineoff++, lpp++) { if (ItemIdIsNormal(lpp)) { bool valid; ! if (all_visible) ! valid = true; ! else ! { ! HeapTupleData loctup; ! ! loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp); ! loctup.t_len = ItemIdGetLength(lpp); ! ItemPointerSet(&(loctup.t_self), page, lineoff); ! valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer); ! } if (valid) scan->rs_vistuples[ntup++] = lineoff; } *************** *** 1860,1865 **** heap_insert(Relation relation, HeapTuple tup, CommandId cid, --- 1874,1880 ---- TransactionId xid = GetCurrentTransactionId(); HeapTuple heaptup; Buffer buffer; + bool all_visible_cleared = false; if (relation->rd_rel->relhasoids) { *************** *** 1920,1925 **** heap_insert(Relation relation, HeapTuple tup, CommandId cid, --- 1935,1946 ---- RelationPutHeapTuple(relation, buffer, heaptup); + if (PageIsAllVisible(BufferGetPage(buffer))) + { + all_visible_cleared = true; + PageClearAllVisible(BufferGetPage(buffer)); + } + /* * XXX Should we set PageSetPrunable on this page ? 
* *************** *** 1943,1948 **** heap_insert(Relation relation, HeapTuple tup, CommandId cid, --- 1964,1970 ---- Page page = BufferGetPage(buffer); uint8 info = XLOG_HEAP_INSERT; + xlrec.all_visible_cleared = all_visible_cleared; xlrec.target.node = relation->rd_node; xlrec.target.tid = heaptup->t_self; rdata[0].data = (char *) &xlrec; *************** *** 1994,1999 **** heap_insert(Relation relation, HeapTuple tup, CommandId cid, --- 2016,2026 ---- UnlockReleaseBuffer(buffer); + /* Clear the bit in the visibility map if necessary */ + if (all_visible_cleared) + visibilitymap_clear(relation, + ItemPointerGetBlockNumber(&(heaptup->t_self))); + /* * If tuple is cachable, mark it for invalidation from the caches in case * we abort. Note it is OK to do this after releasing the buffer, because *************** *** 2070,2075 **** heap_delete(Relation relation, ItemPointer tid, --- 2097,2103 ---- Buffer buffer; bool have_tuple_lock = false; bool iscombo; + bool all_visible_cleared = false; Assert(ItemPointerIsValid(tid)); *************** *** 2216,2221 **** l1: --- 2244,2255 ---- */ PageSetPrunable(page, xid); + if (PageIsAllVisible(page)) + { + all_visible_cleared = true; + PageClearAllVisible(page); + } + /* store transaction information of xact deleting the tuple */ tp.t_data->t_infomask &= ~(HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID | *************** *** 2237,2242 **** l1: --- 2271,2277 ---- XLogRecPtr recptr; XLogRecData rdata[2]; + xlrec.all_visible_cleared = all_visible_cleared; xlrec.target.node = relation->rd_node; xlrec.target.tid = tp.t_self; rdata[0].data = (char *) &xlrec; *************** *** 2281,2286 **** l1: --- 2316,2325 ---- */ CacheInvalidateHeapTuple(relation, &tp); + /* Clear the bit in the visibility map if necessary */ + if (all_visible_cleared) + visibilitymap_clear(relation, BufferGetBlockNumber(buffer)); + /* Now we can release the buffer */ ReleaseBuffer(buffer); *************** *** 2388,2393 **** heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, --- 2427,2434 ---- bool have_tuple_lock = false; bool iscombo; bool use_hot_update = false; + bool all_visible_cleared = false; + bool all_visible_cleared_new = false; Assert(ItemPointerIsValid(otid)); *************** *** 2763,2768 **** l2: --- 2804,2815 ---- MarkBufferDirty(newbuf); MarkBufferDirty(buffer); + /* + * Note: we mustn't clear PD_ALL_VISIBLE flags before writing the WAL + * record, because log_heap_update looks at those flags to set the + * corresponding flags in the WAL record. 
+ */ + /* XLOG stuff */ if (!relation->rd_istemp) { *************** *** 2778,2783 **** l2: --- 2825,2842 ---- PageSetTLI(BufferGetPage(buffer), ThisTimeLineID); } + /* Clear PD_ALL_VISIBLE flags */ + if (PageIsAllVisible(BufferGetPage(buffer))) + { + all_visible_cleared = true; + PageClearAllVisible(BufferGetPage(buffer)); + } + if (newbuf != buffer && PageIsAllVisible(BufferGetPage(newbuf))) + { + all_visible_cleared_new = true; + PageClearAllVisible(BufferGetPage(newbuf)); + } + END_CRIT_SECTION(); if (newbuf != buffer) *************** *** 2791,2796 **** l2: --- 2850,2861 ---- */ CacheInvalidateHeapTuple(relation, &oldtup); + /* Clear bits in visibility map */ + if (all_visible_cleared) + visibilitymap_clear(relation, BufferGetBlockNumber(buffer)); + if (all_visible_cleared_new) + visibilitymap_clear(relation, BufferGetBlockNumber(newbuf)); + /* Now we can release the buffer(s) */ if (newbuf != buffer) ReleaseBuffer(newbuf); *************** *** 3412,3417 **** l3: --- 3477,3487 ---- LockBuffer(*buffer, BUFFER_LOCK_UNLOCK); /* + * Don't update the visibility map here. Locking a tuple doesn't + * change visibility info. + */ + + /* * Now that we have successfully marked the tuple as locked, we can * release the lmgr tuple lock, if we had it. */ *************** *** 3916,3922 **** log_heap_update(Relation reln, Buffer oldbuf, ItemPointerData from, --- 3986,3994 ---- xlrec.target.node = reln->rd_node; xlrec.target.tid = from; + xlrec.all_visible_cleared = PageIsAllVisible(BufferGetPage(oldbuf)); xlrec.newtid = newtup->t_self; + xlrec.new_all_visible_cleared = PageIsAllVisible(BufferGetPage(newbuf)); rdata[0].data = (char *) &xlrec; rdata[0].len = SizeOfHeapUpdate; *************** *** 4185,4197 **** heap_xlog_delete(XLogRecPtr lsn, XLogRecord *record) OffsetNumber offnum; ItemId lp = NULL; HeapTupleHeader htup; if (record->xl_info & XLR_BKP_BLOCK_1) return; ! buffer = XLogReadBuffer(xlrec->target.node, ! ItemPointerGetBlockNumber(&(xlrec->target.tid)), ! false); if (!BufferIsValid(buffer)) return; page = (Page) BufferGetPage(buffer); --- 4257,4281 ---- OffsetNumber offnum; ItemId lp = NULL; HeapTupleHeader htup; + BlockNumber blkno; + + blkno = ItemPointerGetBlockNumber(&(xlrec->target.tid)); + + /* + * The visibility map always needs to be updated, even if the heap page + * is already up-to-date. + */ + if (xlrec->all_visible_cleared) + { + Relation reln = CreateFakeRelcacheEntry(xlrec->target.node); + visibilitymap_clear(reln, blkno); + FreeFakeRelcacheEntry(reln); + } if (record->xl_info & XLR_BKP_BLOCK_1) return; ! 
buffer = XLogReadBuffer(xlrec->target.node, blkno, false); if (!BufferIsValid(buffer)) return; page = (Page) BufferGetPage(buffer); *************** *** 4223,4228 **** heap_xlog_delete(XLogRecPtr lsn, XLogRecord *record) --- 4307,4315 ---- /* Mark the page as a candidate for pruning */ PageSetPrunable(page, record->xl_xid); + if (xlrec->all_visible_cleared) + PageClearAllVisible(page); + /* Make sure there is no forward chain link in t_ctid */ htup->t_ctid = xlrec->target.tid; PageSetLSN(page, lsn); *************** *** 4249,4259 **** heap_xlog_insert(XLogRecPtr lsn, XLogRecord *record) Size freespace; BlockNumber blkno; if (record->xl_info & XLR_BKP_BLOCK_1) return; - blkno = ItemPointerGetBlockNumber(&(xlrec->target.tid)); - if (record->xl_info & XLOG_HEAP_INIT_PAGE) { buffer = XLogReadBuffer(xlrec->target.node, blkno, true); --- 4336,4357 ---- Size freespace; BlockNumber blkno; + blkno = ItemPointerGetBlockNumber(&(xlrec->target.tid)); + + /* + * The visibility map always needs to be updated, even if the heap page + * is already up-to-date. + */ + if (xlrec->all_visible_cleared) + { + Relation reln = CreateFakeRelcacheEntry(xlrec->target.node); + visibilitymap_clear(reln, blkno); + FreeFakeRelcacheEntry(reln); + } + if (record->xl_info & XLR_BKP_BLOCK_1) return; if (record->xl_info & XLOG_HEAP_INIT_PAGE) { buffer = XLogReadBuffer(xlrec->target.node, blkno, true); *************** *** 4307,4312 **** heap_xlog_insert(XLogRecPtr lsn, XLogRecord *record) --- 4405,4414 ---- PageSetLSN(page, lsn); PageSetTLI(page, ThisTimeLineID); + + if (xlrec->all_visible_cleared) + PageClearAllVisible(page); + MarkBufferDirty(buffer); UnlockReleaseBuffer(buffer); *************** *** 4347,4352 **** heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool move, bool hot_update) --- 4449,4466 ---- uint32 newlen; Size freespace; + /* + * The visibility map always needs to be updated, even if the heap page + * is already up-to-date. + */ + if (xlrec->all_visible_cleared) + { + Relation reln = CreateFakeRelcacheEntry(xlrec->target.node); + visibilitymap_clear(reln, + ItemPointerGetBlockNumber(&xlrec->target.tid)); + FreeFakeRelcacheEntry(reln); + } + if (record->xl_info & XLR_BKP_BLOCK_1) { if (samepage) *************** *** 4411,4416 **** heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool move, bool hot_update) --- 4525,4533 ---- /* Mark the page as a candidate for pruning */ PageSetPrunable(page, record->xl_xid); + if (xlrec->all_visible_cleared) + PageClearAllVisible(page); + /* * this test is ugly, but necessary to avoid thinking that insert change * is already applied *************** *** 4426,4431 **** heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool move, bool hot_update) --- 4543,4559 ---- newt:; + /* + * The visibility map always needs to be updated, even if the heap page + * is already up-to-date. 
+ */ + if (xlrec->new_all_visible_cleared) + { + Relation reln = CreateFakeRelcacheEntry(xlrec->target.node); + visibilitymap_clear(reln, ItemPointerGetBlockNumber(&xlrec->newtid)); + FreeFakeRelcacheEntry(reln); + } + if (record->xl_info & XLR_BKP_BLOCK_2) return; *************** *** 4504,4509 **** newsame:; --- 4632,4640 ---- if (offnum == InvalidOffsetNumber) elog(PANIC, "heap_update_redo: failed to add tuple"); + if (xlrec->new_all_visible_cleared) + PageClearAllVisible(page); + freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */ PageSetLSN(page, lsn); *** /dev/null --- src/backend/access/heap/visibilitymap.c *************** *** 0 **** --- 1,478 ---- + /*------------------------------------------------------------------------- + * + * visibilitymap.c + * bitmap for tracking visibility of heap tuples + * + * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $PostgreSQL$ + * + * INTERFACE ROUTINES + * visibilitymap_clear - clear a bit in the visibility map + * visibilitymap_pin - pin a map page for setting a bit + * visibilitymap_set - set a bit in a previously pinned page + * visibilitymap_test - test if a bit is set + * + * NOTES + * + * The visibility map is a bitmap with one bit per heap page. A set bit means + * that all tuples on the page are visible to all transactions, and doesn't + * therefore need to be vacuumed. The map is conservative in the sense that we + * make sure that whenever a bit is set, we know the condition is true, but if + * a bit is not set, it might or might not be. + * + * There's no explicit WAL logging in the functions in this file. The callers + * must make sure that whenever a bit is cleared, the bit is cleared on WAL + * replay of the updating operation as well. Setting bits during recovery + * isn't necessary for correctness. + * + * Currently, the visibility map is only used as a hint, to speed up VACUUM. + * A corrupted visibility map won't cause data corruption, although it can + * make VACUUM skip pages that need vacuuming, until the next anti-wraparound + * vacuum. The visibility map is not used for anti-wraparound vacuums, because + * an anti-wraparound vacuum needs to freeze tuples and observe the latest xid + * present in the table, also on pages that don't have any dead tuples. + * + * Although the visibility map is just a hint at the moment, the PD_ALL_VISIBLE + * flag on heap pages *must* be correct. + * + * LOCKING + * + * In heapam.c, whenever a page is modified so that not all tuples on the + * page are visible to everyone anymore, the corresponding bit in the + * visibility map is cleared. The bit in the visibility map is cleared + * after releasing the lock on the heap page, to avoid holding the lock + * over possible I/O to read in the visibility map page. + * + * To set a bit, you need to hold a lock on the heap page. That prevents + * the race condition where VACUUM sees that all tuples on the page are + * visible to everyone, but another backend modifies the page before VACUUM + * sets the bit in the visibility map. + * + * When a bit is set, the LSN of the visibility map page is updated to make + * sure that the visibility map update doesn't get written to disk before the + * WAL record of the changes that made it possible to set the bit is flushed. + * But when a bit is cleared, we don't have to do that because it's always OK + * to clear a bit in the map from correctness point of view. 
+ * + * TODO + * + * It would be nice to use the visibility map to skip visibility checkes in + * index scans. + * + * Currently, the visibility map is not 100% correct all the time. + * During updates, the bit in the visibility map is cleared after releasing + * the lock on the heap page. During the window after releasing the lock + * and clearing the bit in the visibility map, the bit in the visibility map + * is set, but the new insertion or deletion is not yet visible to other + * backends. + * + * That might actually be OK for the index scans, though. The newly inserted + * tuple wouldn't have an index pointer yet, so all tuples reachable from an + * index would still be visible to all other backends, and deletions wouldn't + * be visible to other backends yet. + * + * There's another hole in the way the PD_ALL_VISIBLE flag is set. When + * vacuum observes that all tuples are visible to all, it sets the flag on + * the heap page, and also sets the bit in the visibility map. If we then + * crash, and only the visibility map page was flushed to disk, we'll have + * a bit set in the visibility map, but the corresponding flag on the heap + * page is not set. If the heap page is then updated, the updater won't + * know to clear the bit in the visibility map. + * + *------------------------------------------------------------------------- + */ + #include "postgres.h" + + #include "access/visibilitymap.h" + #include "storage/bufmgr.h" + #include "storage/bufpage.h" + #include "storage/lmgr.h" + #include "storage/smgr.h" + #include "utils/inval.h" + + /*#define TRACE_VISIBILITYMAP */ + + /* + * Size of the bitmap on each visibility map page, in bytes. There's no + * extra headers, so the whole page minus except for the standard page header + * is used for the bitmap. + */ + #define MAPSIZE (BLCKSZ - SizeOfPageHeaderData) + + /* Number of bits allocated for each heap block. */ + #define BITS_PER_HEAPBLOCK 1 + + /* Number of heap blocks we can represent in one byte. */ + #define HEAPBLOCKS_PER_BYTE 8 + + /* Number of heap blocks we can represent in one visibility map page. */ + #define HEAPBLOCKS_PER_PAGE (MAPSIZE * HEAPBLOCKS_PER_BYTE) + + /* Mapping from heap block number to the right bit in the visibility map */ + #define HEAPBLK_TO_MAPBLOCK(x) ((x) / HEAPBLOCKS_PER_PAGE) + #define HEAPBLK_TO_MAPBYTE(x) (((x) % HEAPBLOCKS_PER_PAGE) / HEAPBLOCKS_PER_BYTE) + #define HEAPBLK_TO_MAPBIT(x) ((x) % HEAPBLOCKS_PER_BYTE) + + /* prototypes for internal routines */ + static Buffer vm_readbuf(Relation rel, BlockNumber blkno, bool extend); + static void vm_extend(Relation rel, BlockNumber nvmblocks); + + + /* + * visibilitymap_clear - clear a bit in visibility map + * + * Clear a bit in the visibility map, marking that not all tuples are + * visible to all transactions anymore. 
+ */ + void + visibilitymap_clear(Relation rel, BlockNumber heapBlk) + { + BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk); + int mapByte = HEAPBLK_TO_MAPBYTE(heapBlk); + int mapBit = HEAPBLK_TO_MAPBIT(heapBlk); + uint8 mask = 1 << mapBit; + Buffer mapBuffer; + char *map; + + #ifdef TRACE_VISIBILITYMAP + elog(DEBUG1, "vm_clear %s %d", RelationGetRelationName(rel), heapBlk); + #endif + + mapBuffer = vm_readbuf(rel, mapBlock, false); + if (!BufferIsValid(mapBuffer)) + return; /* nothing to do */ + + LockBuffer(mapBuffer, BUFFER_LOCK_EXCLUSIVE); + map = PageGetContents(BufferGetPage(mapBuffer)); + + if (map[mapByte] & mask) + { + map[mapByte] &= ~mask; + + MarkBufferDirty(mapBuffer); + } + + UnlockReleaseBuffer(mapBuffer); + } + + /* + * visibilitymap_pin - pin a map page for setting a bit + * + * Setting a bit in the visibility map is a two-phase operation. First, call + * visibilitymap_pin, to pin the visibility map page containing the bit for + * the heap page. Because that can require I/O to read the map page, you + * shouldn't hold a lock on the heap page while doing that. Then, call + * visibilitymap_set to actually set the bit. + * + * On entry, *buf should be InvalidBuffer or a valid buffer returned by + * an earlier call to visibilitymap_pin or visibilitymap_test on the same + * relation. On return, *buf is a valid buffer with the map page containing + * the the bit for heapBlk. + * + * If the page doesn't exist in the map file yet, it is extended. + */ + void + visibilitymap_pin(Relation rel, BlockNumber heapBlk, Buffer *buf) + { + BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk); + + /* Reuse the old pinned buffer if possible */ + if (BufferIsValid(*buf)) + { + if (BufferGetBlockNumber(*buf) == mapBlock) + return; + + ReleaseBuffer(*buf); + } + *buf = vm_readbuf(rel, mapBlock, true); + } + + /* + * visibilitymap_set - set a bit on a previously pinned page + * + * recptr is the LSN of the heap page. The LSN of the visibility map page is + * advanced to that, to make sure that the visibility map doesn't get flushed + * to disk before the update to the heap page that made all tuples visible. + * + * This is an opportunistic function. It does nothing, unless *buf + * contains the bit for heapBlk. Call visibilitymap_pin first to pin + * the right map page. This function doesn't do any I/O. + */ + void + visibilitymap_set(Relation rel, BlockNumber heapBlk, XLogRecPtr recptr, + Buffer *buf) + { + BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk); + uint32 mapByte = HEAPBLK_TO_MAPBYTE(heapBlk); + uint8 mapBit = HEAPBLK_TO_MAPBIT(heapBlk); + Page page; + char *map; + + #ifdef TRACE_VISIBILITYMAP + elog(DEBUG1, "vm_set %s %d", RelationGetRelationName(rel), heapBlk); + #endif + + /* Check that we have the right page pinned */ + if (!BufferIsValid(*buf) || BufferGetBlockNumber(*buf) != mapBlock) + return; + + page = BufferGetPage(*buf); + map = PageGetContents(page); + LockBuffer(*buf, BUFFER_LOCK_EXCLUSIVE); + + if (!(map[mapByte] & (1 << mapBit))) + { + map[mapByte] |= (1 << mapBit); + + if (XLByteLT(PageGetLSN(page), recptr)) + PageSetLSN(page, recptr); + PageSetTLI(page, ThisTimeLineID); + MarkBufferDirty(*buf); + } + + LockBuffer(*buf, BUFFER_LOCK_UNLOCK); + } + + /* + * visibilitymap_test - test if a bit is set + * + * Are all tuples on heapBlk visible to all, according to the visibility map? + * + * On entry, *buf should be InvalidBuffer or a valid buffer returned by an + * earlier call to visibilitymap_pin or visibilitymap_test on the same + * relation. 
On return, *buf is a valid buffer with the map page containing + * the the bit for heapBlk, or InvalidBuffer. The caller is responsible for + * releasing *buf after it's done testing and setting bits. + */ + bool + visibilitymap_test(Relation rel, BlockNumber heapBlk, Buffer *buf) + { + BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk); + uint32 mapByte = HEAPBLK_TO_MAPBYTE(heapBlk); + uint8 mapBit = HEAPBLK_TO_MAPBIT(heapBlk); + bool result; + char *map; + + #ifdef TRACE_VISIBILITYMAP + elog(DEBUG1, "vm_test %s %d", RelationGetRelationName(rel), heapBlk); + #endif + + /* Reuse the old pinned buffer if possible */ + if (BufferIsValid(*buf)) + { + if (BufferGetBlockNumber(*buf) != mapBlock) + { + ReleaseBuffer(*buf); + *buf = InvalidBuffer; + } + } + + if (!BufferIsValid(*buf)) + { + *buf = vm_readbuf(rel, mapBlock, false); + if (!BufferIsValid(*buf)) + return false; + } + + map = PageGetContents(BufferGetPage(*buf)); + + /* + * We don't need to lock the page, as we're only looking at a single bit. + */ + result = (map[mapByte] & (1 << mapBit)) ? true : false; + + return result; + } + + /* + * visibilitymap_test - truncate the visibility map + */ + void + visibilitymap_truncate(Relation rel, BlockNumber nheapblocks) + { + BlockNumber newnblocks; + /* last remaining block, byte, and bit */ + BlockNumber truncBlock = HEAPBLK_TO_MAPBLOCK(nheapblocks); + uint32 truncByte = HEAPBLK_TO_MAPBYTE(nheapblocks); + uint8 truncBit = HEAPBLK_TO_MAPBIT(nheapblocks); + + #ifdef TRACE_VISIBILITYMAP + elog(DEBUG1, "vm_truncate %s %d", RelationGetRelationName(rel), nheapblocks); + #endif + + /* + * If no visibility map has been created yet for this relation, there's + * nothing to truncate. + */ + if (!smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM)) + return; + + /* + * Unless the new size is exactly at a visibility map page boundary, the + * tail bits in the last remaining map page, representing truncated heap + * blocks, need to be cleared. This is not only tidy, but also necessary + * because we don't get a chance to clear the bits if the heap is + * extended again. + */ + if (truncByte != 0 || truncBit != 0) + { + Buffer mapBuffer; + Page page; + char *map; + + newnblocks = truncBlock + 1; + + mapBuffer = vm_readbuf(rel, truncBlock, false); + if (!BufferIsValid(mapBuffer)) + { + /* nothing to do, the file was already smaller */ + return; + } + + page = BufferGetPage(mapBuffer); + map = PageGetContents(page); + + LockBuffer(mapBuffer, BUFFER_LOCK_EXCLUSIVE); + + /* Clear out the unwanted bytes. */ + MemSet(&map[truncByte + 1], 0, MAPSIZE - (truncByte + 1)); + + /* + * Mask out the unwanted bits of the last remaining byte. + * + * ((1 << 0) - 1) = 00000000 + * ((1 << 1) - 1) = 00000001 + * ... + * ((1 << 6) - 1) = 00111111 + * ((1 << 7) - 1) = 01111111 + */ + map[truncByte] &= (1 << truncBit) - 1; + + MarkBufferDirty(mapBuffer); + UnlockReleaseBuffer(mapBuffer); + } + else + newnblocks = truncBlock; + + if (smgrnblocks(rel->rd_smgr, VISIBILITYMAP_FORKNUM) < newnblocks) + { + /* nothing to do, the file was already smaller than requested size */ + return; + } + + smgrtruncate(rel->rd_smgr, VISIBILITYMAP_FORKNUM, newnblocks, + rel->rd_istemp); + + /* + * Need to invalidate the relcache entry, because rd_vm_nblocks + * seen by other backends is no longer valid. + */ + if (!InRecovery) + CacheInvalidateRelcache(rel); + + rel->rd_vm_nblocks = newnblocks; + } + + /* + * Read a visibility map page. 
+ * + * If the page doesn't exist, InvalidBuffer is returned, or if 'extend' is + * true, the visibility map file is extended. + */ + static Buffer + vm_readbuf(Relation rel, BlockNumber blkno, bool extend) + { + Buffer buf; + + RelationOpenSmgr(rel); + + /* + * The current size of the visibility map fork is kept in relcache, to + * avoid reading beyond EOF. If we haven't cached the size of the map yet, + * do that first. + */ + if (rel->rd_vm_nblocks == InvalidBlockNumber) + { + if (smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM)) + rel->rd_vm_nblocks = smgrnblocks(rel->rd_smgr, + VISIBILITYMAP_FORKNUM); + else + rel->rd_vm_nblocks = 0; + } + + /* Handle requests beyond EOF */ + if (blkno >= rel->rd_vm_nblocks) + { + if (extend) + vm_extend(rel, blkno + 1); + else + return InvalidBuffer; + } + + /* + * Use ZERO_ON_ERROR mode, and initialize the page if necessary. It's + * always safe to clear bits, so it's better to clear corrupt pages than + * error out. + */ + buf = ReadBufferExtended(rel, VISIBILITYMAP_FORKNUM, blkno, + RBM_ZERO_ON_ERROR, NULL); + if (PageIsNew(BufferGetPage(buf))) + PageInit(BufferGetPage(buf), BLCKSZ, 0); + return buf; + } + + /* + * Ensure that the visibility map fork is at least vm_nblocks long, extending + * it if necessary with zeroed pages. + */ + static void + vm_extend(Relation rel, BlockNumber vm_nblocks) + { + BlockNumber vm_nblocks_now; + Page pg; + + pg = (Page) palloc(BLCKSZ); + PageInit(pg, BLCKSZ, 0); + + /* + * We use the relation extension lock to lock out other backends trying + * to extend the visibility map at the same time. It also locks out + * extension of the main fork, unnecessarily, but extending the + * visibility map happens seldom enough that it doesn't seem worthwhile to + * have a separate lock tag type for it. + * + * Note that another backend might have extended or created the + * relation before we get the lock. + */ + LockRelationForExtension(rel, ExclusiveLock); + + /* Create the file first if it doesn't exist */ + if ((rel->rd_vm_nblocks == 0 || rel->rd_vm_nblocks == InvalidBlockNumber) + && !smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM)) + { + smgrcreate(rel->rd_smgr, VISIBILITYMAP_FORKNUM, false); + vm_nblocks_now = 0; + } + else + vm_nblocks_now = smgrnblocks(rel->rd_smgr, VISIBILITYMAP_FORKNUM); + + while (vm_nblocks_now < vm_nblocks) + { + smgrextend(rel->rd_smgr, VISIBILITYMAP_FORKNUM, vm_nblocks_now, + (char *) pg, rel->rd_istemp); + vm_nblocks_now++; + } + + UnlockRelationForExtension(rel, ExclusiveLock); + + pfree(pg); + + /* Update the relcache with the up-to-date size */ + if (!InRecovery) + CacheInvalidateRelcache(rel); + rel->rd_vm_nblocks = vm_nblocks_now; + } *** src/backend/access/transam/xlogutils.c --- src/backend/access/transam/xlogutils.c *************** *** 377,382 **** CreateFakeRelcacheEntry(RelFileNode rnode) --- 377,383 ---- rel->rd_targblock = InvalidBlockNumber; rel->rd_fsm_nblocks = InvalidBlockNumber; + rel->rd_vm_nblocks = InvalidBlockNumber; rel->rd_smgr = NULL; return rel; *** src/backend/catalog/catalog.c --- src/backend/catalog/catalog.c *************** *** 54,60 **** */ const char *forkNames[] = { "main", /* MAIN_FORKNUM */ ! "fsm" /* FSM_FORKNUM */ }; /* --- 54,61 ---- */ const char *forkNames[] = { "main", /* MAIN_FORKNUM */ ! "fsm", /* FSM_FORKNUM */ ! 
"vm" /* VISIBILITYMAP_FORKNUM */ }; /* *** src/backend/catalog/storage.c --- src/backend/catalog/storage.c *************** *** 19,24 **** --- 19,25 ---- #include "postgres.h" + #include "access/visibilitymap.h" #include "access/xact.h" #include "access/xlogutils.h" #include "catalog/catalog.h" *************** *** 175,180 **** void --- 176,182 ---- RelationTruncate(Relation rel, BlockNumber nblocks) { bool fsm; + bool vm; /* Open it at the smgr level if not already done */ RelationOpenSmgr(rel); *************** *** 187,192 **** RelationTruncate(Relation rel, BlockNumber nblocks) --- 189,199 ---- if (fsm) FreeSpaceMapTruncateRel(rel, nblocks); + /* Truncate the visibility map too if it exists. */ + vm = smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM); + if (vm) + visibilitymap_truncate(rel, nblocks); + /* * We WAL-log the truncation before actually truncating, which * means trouble if the truncation fails. If we then crash, the WAL *************** *** 217,228 **** RelationTruncate(Relation rel, BlockNumber nblocks) /* * Flush, because otherwise the truncation of the main relation ! * might hit the disk before the WAL record of truncating the ! * FSM is flushed. If we crashed during that window, we'd be ! * left with a truncated heap, but the FSM would still contain ! * entries for the non-existent heap pages. */ ! if (fsm) XLogFlush(lsn); } --- 224,235 ---- /* * Flush, because otherwise the truncation of the main relation ! * might hit the disk before the WAL record, and the truncation of ! * the FSM or visibility map. If we crashed during that window, we'd ! * be left with a truncated heap, but the FSM or visibility map would ! * still contain entries for the non-existent heap pages. */ ! if (fsm || vm) XLogFlush(lsn); } *** src/backend/commands/vacuum.c --- src/backend/commands/vacuum.c *************** *** 26,31 **** --- 26,32 ---- #include "access/genam.h" #include "access/heapam.h" #include "access/transam.h" + #include "access/visibilitymap.h" #include "access/xact.h" #include "access/xlog.h" #include "catalog/namespace.h" *************** *** 2902,2907 **** move_chain_tuple(Relation rel, --- 2903,2914 ---- Size tuple_len = old_tup->t_len; /* + * Clear the bits in the visibility map. + */ + visibilitymap_clear(rel, BufferGetBlockNumber(old_buf)); + visibilitymap_clear(rel, BufferGetBlockNumber(dst_buf)); + + /* * make a modifiable copy of the source tuple. */ heap_copytuple_with_tuple(old_tup, &newtup); *************** *** 3005,3010 **** move_chain_tuple(Relation rel, --- 3012,3021 ---- END_CRIT_SECTION(); + PageClearAllVisible(BufferGetPage(old_buf)); + if (dst_buf != old_buf) + PageClearAllVisible(BufferGetPage(dst_buf)); + LockBuffer(dst_buf, BUFFER_LOCK_UNLOCK); if (dst_buf != old_buf) LockBuffer(old_buf, BUFFER_LOCK_UNLOCK); *************** *** 3107,3112 **** move_plain_tuple(Relation rel, --- 3118,3140 ---- END_CRIT_SECTION(); + /* + * Clear the visible-to-all hint bits on the page, and bits in the + * visibility map. Normally we'd release the locks on the heap pages + * before updating the visibility map, but doesn't really matter here + * because we're holding an AccessExclusiveLock on the relation anyway. 
+ */ + if (PageIsAllVisible(dst_page)) + { + PageClearAllVisible(dst_page); + visibilitymap_clear(rel, BufferGetBlockNumber(dst_buf)); + } + if (PageIsAllVisible(old_page)) + { + PageClearAllVisible(old_page); + visibilitymap_clear(rel, BufferGetBlockNumber(old_buf)); + } + dst_vacpage->free = PageGetFreeSpaceWithFillFactor(rel, dst_page); LockBuffer(dst_buf, BUFFER_LOCK_UNLOCK); LockBuffer(old_buf, BUFFER_LOCK_UNLOCK); *** src/backend/commands/vacuumlazy.c --- src/backend/commands/vacuumlazy.c *************** *** 40,45 **** --- 40,46 ---- #include "access/genam.h" #include "access/heapam.h" #include "access/transam.h" + #include "access/visibilitymap.h" #include "catalog/storage.h" #include "commands/dbcommands.h" #include "commands/vacuum.h" *************** *** 88,93 **** typedef struct LVRelStats --- 89,95 ---- int max_dead_tuples; /* # slots allocated in array */ ItemPointer dead_tuples; /* array of ItemPointerData */ int num_index_scans; + bool scanned_all; /* have we scanned all pages (this far)? */ } LVRelStats; *************** *** 102,108 **** static BufferAccessStrategy vac_strategy; /* non-export function prototypes */ static void lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, ! Relation *Irel, int nindexes); static void lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats); static void lazy_vacuum_index(Relation indrel, IndexBulkDeleteResult **stats, --- 104,110 ---- /* non-export function prototypes */ static void lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, ! Relation *Irel, int nindexes, bool scan_all); static void lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats); static void lazy_vacuum_index(Relation indrel, IndexBulkDeleteResult **stats, *************** *** 141,146 **** lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt, --- 143,149 ---- BlockNumber possibly_freeable; PGRUsage ru0; TimestampTz starttime = 0; + bool scan_all; pg_rusage_init(&ru0); *************** *** 161,173 **** lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt, vacrelstats = (LVRelStats *) palloc0(sizeof(LVRelStats)); vacrelstats->num_index_scans = 0; /* Open all indexes of the relation */ vac_open_indexes(onerel, RowExclusiveLock, &nindexes, &Irel); vacrelstats->hasindex = (nindexes > 0); /* Do the vacuuming */ ! lazy_scan_heap(onerel, vacrelstats, Irel, nindexes); /* Done with indexes */ vac_close_indexes(nindexes, Irel, NoLock); --- 164,183 ---- vacrelstats = (LVRelStats *) palloc0(sizeof(LVRelStats)); vacrelstats->num_index_scans = 0; + vacrelstats->scanned_all = true; /* Open all indexes of the relation */ vac_open_indexes(onerel, RowExclusiveLock, &nindexes, &Irel); vacrelstats->hasindex = (nindexes > 0); + /* Should we use the visibility map or scan all pages? */ + if (vacstmt->freeze_min_age != -1) + scan_all = true; + else + scan_all = false; + /* Do the vacuuming */ ! lazy_scan_heap(onerel, vacrelstats, Irel, nindexes, scan_all); /* Done with indexes */ vac_close_indexes(nindexes, Irel, NoLock); *************** *** 186,195 **** lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt, /* Vacuum the Free Space Map */ FreeSpaceMapVacuum(onerel); ! /* Update statistics in pg_class */ vac_update_relstats(onerel, vacrelstats->rel_pages, vacrelstats->rel_tuples, ! vacrelstats->hasindex, FreezeLimit); /* report results to the stats collector, too */ pgstat_report_vacuum(RelationGetRelid(onerel), onerel->rd_rel->relisshared, --- 196,209 ---- /* Vacuum the Free Space Map */ FreeSpaceMapVacuum(onerel); ! /* ! * Update statistics in pg_class. 
We can only advance relfrozenxid if we ! * didn't skip any pages. ! */ vac_update_relstats(onerel, vacrelstats->rel_pages, vacrelstats->rel_tuples, ! vacrelstats->hasindex, ! vacrelstats->scanned_all ? FreezeLimit : InvalidOid); /* report results to the stats collector, too */ pgstat_report_vacuum(RelationGetRelid(onerel), onerel->rd_rel->relisshared, *************** *** 230,242 **** lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt, */ static void lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, ! Relation *Irel, int nindexes) { BlockNumber nblocks, blkno; HeapTupleData tuple; char *relname; BlockNumber empty_pages, vacuumed_pages; double num_tuples, tups_vacuumed, --- 244,257 ---- */ static void lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, ! Relation *Irel, int nindexes, bool scan_all) { BlockNumber nblocks, blkno; HeapTupleData tuple; char *relname; BlockNumber empty_pages, + scanned_pages, vacuumed_pages; double num_tuples, tups_vacuumed, *************** *** 245,250 **** lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, --- 260,266 ---- IndexBulkDeleteResult **indstats; int i; PGRUsage ru0; + Buffer vmbuffer = InvalidBuffer; pg_rusage_init(&ru0); *************** *** 254,260 **** lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, get_namespace_name(RelationGetNamespace(onerel)), relname))); ! empty_pages = vacuumed_pages = 0; num_tuples = tups_vacuumed = nkeep = nunused = 0; indstats = (IndexBulkDeleteResult **) --- 270,276 ---- get_namespace_name(RelationGetNamespace(onerel)), relname))); ! empty_pages = vacuumed_pages = scanned_pages = 0; num_tuples = tups_vacuumed = nkeep = nunused = 0; indstats = (IndexBulkDeleteResult **) *************** *** 278,286 **** lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, --- 294,321 ---- OffsetNumber frozen[MaxOffsetNumber]; int nfrozen; Size freespace; + bool all_visible_according_to_vm = false; + bool all_visible; + + /* + * Skip pages that don't require vacuuming according to the + * visibility map. + */ + if (!scan_all) + { + all_visible_according_to_vm = + visibilitymap_test(onerel, blkno, &vmbuffer); + if (all_visible_according_to_vm) + { + vacrelstats->scanned_all = false; + continue; + } + } vacuum_delay_point(); + scanned_pages++; + /* * If we are close to overrunning the available space for dead-tuple * TIDs, pause and do a cycle of vacuuming before we tackle this page. *************** *** 354,360 **** lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, { empty_pages++; freespace = PageGetHeapFreeSpace(page); ! UnlockReleaseBuffer(buf); RecordPageWithFreeSpace(onerel, blkno, freespace); continue; } --- 389,414 ---- { empty_pages++; freespace = PageGetHeapFreeSpace(page); ! ! if (!PageIsAllVisible(page)) ! { ! SetBufferCommitInfoNeedsSave(buf); ! PageSetAllVisible(page); ! } ! ! LockBuffer(buf, BUFFER_LOCK_UNLOCK); ! ! /* Update the visibility map */ ! if (!all_visible_according_to_vm) ! { ! visibilitymap_pin(onerel, blkno, &vmbuffer); ! LockBuffer(buf, BUFFER_LOCK_SHARE); ! if (PageIsAllVisible(page)) ! visibilitymap_set(onerel, blkno, PageGetLSN(page), &vmbuffer); ! LockBuffer(buf, BUFFER_LOCK_UNLOCK); ! } ! ! ReleaseBuffer(buf); RecordPageWithFreeSpace(onerel, blkno, freespace); continue; } *************** *** 371,376 **** lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, --- 425,431 ---- * Now scan the page to collect vacuumable items and check for tuples * requiring freezing. 
*/ + all_visible = true; nfrozen = 0; hastup = false; prev_dead_count = vacrelstats->num_dead_tuples; *************** *** 408,413 **** lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, --- 463,469 ---- if (ItemIdIsDead(itemid)) { lazy_record_dead_tuple(vacrelstats, &(tuple.t_self)); + all_visible = false; continue; } *************** *** 442,447 **** lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, --- 498,504 ---- nkeep += 1; else tupgone = true; /* we can delete the tuple */ + all_visible = false; break; case HEAPTUPLE_LIVE: /* Tuple is good --- but let's do some validity checks */ *************** *** 449,454 **** lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, --- 506,540 ---- !OidIsValid(HeapTupleGetOid(&tuple))) elog(WARNING, "relation \"%s\" TID %u/%u: OID is invalid", relname, blkno, offnum); + + /* + * Is the tuple definitely visible to all transactions? + * + * NB: Like with per-tuple hint bits, we can't set the + * flag if the inserter committed asynchronously. See + * SetHintBits for more info. Check that the + * HEAP_XMIN_COMMITTED hint bit is set because of that. + */ + if (all_visible) + { + TransactionId xmin; + + if (!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED)) + { + all_visible = false; + break; + } + /* + * The inserter definitely committed. But is it + * old enough that everyone sees it as committed? + */ + xmin = HeapTupleHeaderGetXmin(tuple.t_data); + if (!TransactionIdPrecedes(xmin, OldestXmin)) + { + all_visible = false; + break; + } + } break; case HEAPTUPLE_RECENTLY_DEAD: *************** *** 457,468 **** lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, --- 543,557 ---- * from relation. */ nkeep += 1; + all_visible = false; break; case HEAPTUPLE_INSERT_IN_PROGRESS: /* This is an expected case during concurrent vacuum */ + all_visible = false; break; case HEAPTUPLE_DELETE_IN_PROGRESS: /* This is an expected case during concurrent vacuum */ + all_visible = false; break; default: elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); *************** *** 525,536 **** lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, freespace = PageGetHeapFreeSpace(page); /* Remember the location of the last page with nonremovable tuples */ if (hastup) vacrelstats->nonempty_pages = blkno + 1; - UnlockReleaseBuffer(buf); - /* * If we remembered any tuples for deletion, then the page will be * visited again by lazy_vacuum_heap, which will compute and record --- 614,656 ---- freespace = PageGetHeapFreeSpace(page); + /* Update the all-visible flag on the page */ + if (!PageIsAllVisible(page) && all_visible) + { + SetBufferCommitInfoNeedsSave(buf); + PageSetAllVisible(page); + } + else if (PageIsAllVisible(page) && !all_visible) + { + elog(WARNING, "PD_ALL_VISIBLE flag was incorrectly set"); + SetBufferCommitInfoNeedsSave(buf); + PageClearAllVisible(page); + + /* + * XXX: Normally, we would drop the lock on the heap page before + * updating the visibility map. 
+ */ + visibilitymap_clear(onerel, blkno); + } + + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + + /* Update the visibility map */ + if (!all_visible_according_to_vm && all_visible) + { + visibilitymap_pin(onerel, blkno, &vmbuffer); + LockBuffer(buf, BUFFER_LOCK_SHARE); + if (PageIsAllVisible(page)) + visibilitymap_set(onerel, blkno, PageGetLSN(page), &vmbuffer); + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + } + + ReleaseBuffer(buf); + /* Remember the location of the last page with nonremovable tuples */ if (hastup) vacrelstats->nonempty_pages = blkno + 1; /* * If we remembered any tuples for deletion, then the page will be * visited again by lazy_vacuum_heap, which will compute and record *************** *** 560,565 **** lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, --- 680,692 ---- vacrelstats->num_index_scans++; } + /* Release the pin on the visibility map page */ + if (BufferIsValid(vmbuffer)) + { + ReleaseBuffer(vmbuffer); + vmbuffer = InvalidBuffer; + } + /* Do post-vacuum cleanup and statistics update for each index */ for (i = 0; i < nindexes; i++) lazy_cleanup_index(Irel[i], indstats[i], vacrelstats); *************** *** 572,580 **** lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, tups_vacuumed, vacuumed_pages))); ereport(elevel, ! (errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u pages", RelationGetRelationName(onerel), ! tups_vacuumed, num_tuples, nblocks), errdetail("%.0f dead row versions cannot be removed yet.\n" "There were %.0f unused item pointers.\n" "%u pages are entirely empty.\n" --- 699,707 ---- tups_vacuumed, vacuumed_pages))); ereport(elevel, ! (errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u out of %u pages", RelationGetRelationName(onerel), ! tups_vacuumed, num_tuples, scanned_pages, nblocks), errdetail("%.0f dead row versions cannot be removed yet.\n" "There were %.0f unused item pointers.\n" "%u pages are entirely empty.\n" *************** *** 623,628 **** lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats) --- 750,764 ---- LockBufferForCleanup(buf); tupindex = lazy_vacuum_page(onerel, tblk, buf, tupindex, vacrelstats); + /* + * Before we let the page go, prune it. The primary reason is to + * update the visibility map in the common special case that we just + * vacuumed away the last tuple on the page that wasn't visible to + * everyone. 
+ */ + vacrelstats->tuples_deleted += + heap_page_prune(onerel, buf, OldestXmin, false, false); + /* Now that we've compacted the page, record its available space */ page = BufferGetPage(buf); freespace = PageGetHeapFreeSpace(page); *** src/backend/utils/cache/relcache.c --- src/backend/utils/cache/relcache.c *************** *** 305,310 **** AllocateRelationDesc(Relation relation, Form_pg_class relp) --- 305,311 ---- MemSet(relation, 0, sizeof(RelationData)); relation->rd_targblock = InvalidBlockNumber; relation->rd_fsm_nblocks = InvalidBlockNumber; + relation->rd_vm_nblocks = InvalidBlockNumber; /* make sure relation is marked as having no open file yet */ relation->rd_smgr = NULL; *************** *** 1377,1382 **** formrdesc(const char *relationName, Oid relationReltype, --- 1378,1384 ---- relation = (Relation) palloc0(sizeof(RelationData)); relation->rd_targblock = InvalidBlockNumber; relation->rd_fsm_nblocks = InvalidBlockNumber; + relation->rd_vm_nblocks = InvalidBlockNumber; /* make sure relation is marked as having no open file yet */ relation->rd_smgr = NULL; *************** *** 1665,1673 **** RelationReloadIndexInfo(Relation relation) heap_freetuple(pg_class_tuple); /* We must recalculate physical address in case it changed */ RelationInitPhysicalAddr(relation); ! /* Must reset targblock and fsm_nblocks in case rel was truncated */ relation->rd_targblock = InvalidBlockNumber; relation->rd_fsm_nblocks = InvalidBlockNumber; /* Must free any AM cached data, too */ if (relation->rd_amcache) pfree(relation->rd_amcache); --- 1667,1679 ---- heap_freetuple(pg_class_tuple); /* We must recalculate physical address in case it changed */ RelationInitPhysicalAddr(relation); ! /* ! * Must reset targblock, fsm_nblocks and vm_nblocks in case rel was ! * truncated ! 
*/ relation->rd_targblock = InvalidBlockNumber; relation->rd_fsm_nblocks = InvalidBlockNumber; + relation->rd_vm_nblocks = InvalidBlockNumber; /* Must free any AM cached data, too */ if (relation->rd_amcache) pfree(relation->rd_amcache); *************** *** 1751,1756 **** RelationClearRelation(Relation relation, bool rebuild) --- 1757,1763 ---- { relation->rd_targblock = InvalidBlockNumber; relation->rd_fsm_nblocks = InvalidBlockNumber; + relation->rd_vm_nblocks = InvalidBlockNumber; if (relation->rd_rel->relkind == RELKIND_INDEX) { relation->rd_isvalid = false; /* needs to be revalidated */ *************** *** 2346,2351 **** RelationBuildLocalRelation(const char *relname, --- 2353,2359 ---- rel->rd_targblock = InvalidBlockNumber; rel->rd_fsm_nblocks = InvalidBlockNumber; + rel->rd_vm_nblocks = InvalidBlockNumber; /* make sure relation is marked as having no open file yet */ rel->rd_smgr = NULL; *************** *** 3603,3608 **** load_relcache_init_file(void) --- 3611,3617 ---- rel->rd_smgr = NULL; rel->rd_targblock = InvalidBlockNumber; rel->rd_fsm_nblocks = InvalidBlockNumber; + rel->rd_vm_nblocks = InvalidBlockNumber; if (rel->rd_isnailed) rel->rd_refcnt = 1; else *** src/include/access/heapam.h --- src/include/access/heapam.h *************** *** 153,158 **** extern void heap_page_prune_execute(Buffer buffer, --- 153,159 ---- OffsetNumber *nowunused, int nunused, bool redirect_move); extern void heap_get_root_tuples(Page page, OffsetNumber *root_offsets); + extern void heap_page_update_all_visible(Buffer buffer); /* in heap/syncscan.c */ extern void ss_report_location(Relation rel, BlockNumber location); *** src/include/access/htup.h --- src/include/access/htup.h *************** *** 601,609 **** typedef struct xl_heaptid typedef struct xl_heap_delete { xl_heaptid target; /* deleted tuple id */ } xl_heap_delete; ! #define SizeOfHeapDelete (offsetof(xl_heap_delete, target) + SizeOfHeapTid) /* * We don't store the whole fixed part (HeapTupleHeaderData) of an inserted --- 601,610 ---- typedef struct xl_heap_delete { xl_heaptid target; /* deleted tuple id */ + bool all_visible_cleared; /* PD_ALL_VISIBLE was cleared */ } xl_heap_delete; ! #define SizeOfHeapDelete (offsetof(xl_heap_delete, all_visible_cleared) + sizeof(bool)) /* * We don't store the whole fixed part (HeapTupleHeaderData) of an inserted *************** *** 626,646 **** typedef struct xl_heap_header typedef struct xl_heap_insert { xl_heaptid target; /* inserted tuple id */ /* xl_heap_header & TUPLE DATA FOLLOWS AT END OF STRUCT */ } xl_heap_insert; ! #define SizeOfHeapInsert (offsetof(xl_heap_insert, target) + SizeOfHeapTid) /* This is what we need to know about update|move|hot_update */ typedef struct xl_heap_update { xl_heaptid target; /* deleted tuple id */ ItemPointerData newtid; /* new inserted tuple id */ /* NEW TUPLE xl_heap_header (PLUS xmax & xmin IF MOVE OP) */ /* and TUPLE DATA FOLLOWS AT END OF STRUCT */ } xl_heap_update; ! #define SizeOfHeapUpdate (offsetof(xl_heap_update, newtid) + SizeOfIptrData) /* * This is what we need to know about vacuum page cleanup/redirect --- 627,650 ---- typedef struct xl_heap_insert { xl_heaptid target; /* inserted tuple id */ + bool all_visible_cleared; /* PD_ALL_VISIBLE was cleared */ /* xl_heap_header & TUPLE DATA FOLLOWS AT END OF STRUCT */ } xl_heap_insert; ! 
#define SizeOfHeapInsert (offsetof(xl_heap_insert, all_visible_cleared) + sizeof(bool)) /* This is what we need to know about update|move|hot_update */ typedef struct xl_heap_update { xl_heaptid target; /* deleted tuple id */ ItemPointerData newtid; /* new inserted tuple id */ + bool all_visible_cleared; /* PD_ALL_VISIBLE was cleared */ + bool new_all_visible_cleared; /* same for the page of newtid */ /* NEW TUPLE xl_heap_header (PLUS xmax & xmin IF MOVE OP) */ /* and TUPLE DATA FOLLOWS AT END OF STRUCT */ } xl_heap_update; ! #define SizeOfHeapUpdate (offsetof(xl_heap_update, new_all_visible_cleared) + sizeof(bool)) /* * This is what we need to know about vacuum page cleanup/redirect *** /dev/null --- src/include/access/visibilitymap.h *************** *** 0 **** --- 1,30 ---- + /*------------------------------------------------------------------------- + * + * visibilitymap.h + * visibility map interface + * + * + * Portions Copyright (c) 2007, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * $PostgreSQL$ + * + *------------------------------------------------------------------------- + */ + #ifndef VISIBILITYMAP_H + #define VISIBILITYMAP_H + + #include "utils/rel.h" + #include "storage/buf.h" + #include "storage/itemptr.h" + #include "access/xlogdefs.h" + + extern void visibilitymap_clear(Relation rel, BlockNumber heapBlk); + extern void visibilitymap_pin(Relation rel, BlockNumber heapBlk, + Buffer *vmbuf); + extern void visibilitymap_set(Relation rel, BlockNumber heapBlk, + XLogRecPtr recptr, Buffer *vmbuf); + extern bool visibilitymap_test(Relation rel, BlockNumber heapBlk, Buffer *vmbuf); + extern void visibilitymap_truncate(Relation rel, BlockNumber heapblk); + + #endif /* VISIBILITYMAP_H */ *** src/include/storage/bufpage.h --- src/include/storage/bufpage.h *************** *** 152,159 **** typedef PageHeaderData *PageHeader; #define PD_HAS_FREE_LINES 0x0001 /* are there any unused line pointers? */ #define PD_PAGE_FULL 0x0002 /* not enough free space for new * tuple? */ ! #define PD_VALID_FLAG_BITS 0x0003 /* OR of all valid pd_flags bits */ /* * Page layout version number 0 is for pre-7.3 Postgres releases. --- 152,161 ---- #define PD_HAS_FREE_LINES 0x0001 /* are there any unused line pointers? */ #define PD_PAGE_FULL 0x0002 /* not enough free space for new * tuple? */ + #define PD_ALL_VISIBLE 0x0004 /* all tuples on page are visible to + * everyone */ ! #define PD_VALID_FLAG_BITS 0x0007 /* OR of all valid pd_flags bits */ /* * Page layout version number 0 is for pre-7.3 Postgres releases. *************** *** 336,341 **** typedef PageHeaderData *PageHeader; --- 338,350 ---- #define PageClearFull(page) \ (((PageHeader) (page))->pd_flags &= ~PD_PAGE_FULL) + #define PageIsAllVisible(page) \ + (((PageHeader) (page))->pd_flags & PD_ALL_VISIBLE) + #define PageSetAllVisible(page) \ + (((PageHeader) (page))->pd_flags |= PD_ALL_VISIBLE) + #define PageClearAllVisible(page) \ + (((PageHeader) (page))->pd_flags &= ~PD_ALL_VISIBLE) + #define PageIsPrunable(page, oldestxmin) \ ( \ AssertMacro(TransactionIdIsNormal(oldestxmin)), \ *** src/include/storage/relfilenode.h --- src/include/storage/relfilenode.h *************** *** 24,37 **** typedef enum ForkNumber { InvalidForkNumber = -1, MAIN_FORKNUM = 0, ! FSM_FORKNUM /* * NOTE: if you add a new fork, change MAX_FORKNUM below and update the * forkNames array in catalog.c */ } ForkNumber; ! 
#define MAX_FORKNUM FSM_FORKNUM /* * RelFileNode must provide all that we need to know to physically access --- 24,38 ---- { InvalidForkNumber = -1, MAIN_FORKNUM = 0, ! FSM_FORKNUM, ! VISIBILITYMAP_FORKNUM /* * NOTE: if you add a new fork, change MAX_FORKNUM below and update the * forkNames array in catalog.c */ } ForkNumber; ! #define MAX_FORKNUM VISIBILITYMAP_FORKNUM /* * RelFileNode must provide all that we need to know to physically access *** src/include/utils/rel.h --- src/include/utils/rel.h *************** *** 195,202 **** typedef struct RelationData List *rd_indpred; /* index predicate tree, if any */ void *rd_amcache; /* available for use by index AM */ ! /* size of the FSM, or InvalidBlockNumber if not known yet */ BlockNumber rd_fsm_nblocks; /* use "struct" here to avoid needing to include pgstat.h: */ struct PgStat_TableStatus *pgstat_info; /* statistics collection area */ --- 195,206 ---- List *rd_indpred; /* index predicate tree, if any */ void *rd_amcache; /* available for use by index AM */ ! /* ! * sizes of the free space and visibility map forks, or InvalidBlockNumber ! * if not known yet ! */ BlockNumber rd_fsm_nblocks; + BlockNumber rd_vm_nblocks; /* use "struct" here to avoid needing to include pgstat.h: */ struct PgStat_TableStatus *pgstat_info; /* statistics collection area */
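[Editor's note: the block/byte/bit addressing used by visibilitymap.c in the patch above is compact, so here is a standalone sketch (not part of the patch) that exercises the same HEAPBLK_TO_MAPBLOCK / HEAPBLK_TO_MAPBYTE / HEAPBLK_TO_MAPBIT arithmetic. It assumes the default 8 kB BLCKSZ and a 24-byte page header where the real code uses SizeOfPageHeaderData, and the example heap block number is arbitrary.]

/*
 * Standalone illustration of the visibility map addressing macros.
 * Assumes BLCKSZ = 8192 and a 24-byte page header; the patch derives
 * these from BLCKSZ and SizeOfPageHeaderData instead.
 */
#include <stdio.h>
#include <stdint.h>

#define BLCKSZ                 8192   /* assumed default block size */
#define SIZE_OF_PAGE_HEADER    24     /* assumed SizeOfPageHeaderData */

/* Bitmap area of each map page: everything after the page header. */
#define MAPSIZE                (BLCKSZ - SIZE_OF_PAGE_HEADER)
#define HEAPBLOCKS_PER_BYTE    8
#define HEAPBLOCKS_PER_PAGE    (MAPSIZE * HEAPBLOCKS_PER_BYTE)

/* Same mapping as in the patch's visibilitymap.c */
#define HEAPBLK_TO_MAPBLOCK(x) ((x) / HEAPBLOCKS_PER_PAGE)
#define HEAPBLK_TO_MAPBYTE(x)  (((x) % HEAPBLOCKS_PER_PAGE) / HEAPBLOCKS_PER_BYTE)
#define HEAPBLK_TO_MAPBIT(x)   ((x) % HEAPBLOCKS_PER_BYTE)

int main(void)
{
    uint32_t heapblk = 100000;   /* arbitrary example heap block */

    printf("heap block %u -> map block %u, byte %u, bit %u\n",
           heapblk,
           HEAPBLK_TO_MAPBLOCK(heapblk),
           HEAPBLK_TO_MAPBYTE(heapblk),
           HEAPBLK_TO_MAPBIT(heapblk));
    return 0;
}

With those assumed defaults, one visibility map page covers 65,344 heap pages, i.e. roughly half a gigabyte of heap per 8 kB map page.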