diff --git a/contrib/pg_visibility/pg_visibility.c b/contrib/pg_visibility/pg_visibility.c index 9985e3e..84f6a11 100644 --- a/contrib/pg_visibility/pg_visibility.c +++ b/contrib/pg_visibility/pg_visibility.c @@ -366,6 +366,7 @@ pg_truncate_visibility_map(PG_FUNCTION_ARGS) { Oid relid = PG_GETARG_OID(0); Relation rel; + XLogRecPtr lsn = InvalidXLogRecPtr; rel = relation_open(relid, AccessExclusiveLock); @@ -380,8 +381,6 @@ pg_truncate_visibility_map(PG_FUNCTION_ARGS) RelationOpenSmgr(rel); rel->rd_smgr->smgr_vm_nblocks = InvalidBlockNumber; - visibilitymap_truncate(rel, 0); - if (RelationNeedsWAL(rel)) { xl_smgr_truncate xlrec; @@ -393,9 +392,11 @@ pg_truncate_visibility_map(PG_FUNCTION_ARGS) XLogBeginInsert(); XLogRegisterData((char *) &xlrec, sizeof(xlrec)); - XLogInsert(RM_SMGR_ID, XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE); + lsn = XLogInsert(RM_SMGR_ID, XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE); } + visibilitymap_truncate(rel, 0, lsn); + /* * Release the lock right away, not at commit time. * diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c index 3ad4a9f..e93806d 100644 --- a/src/backend/access/heap/visibilitymap.c +++ b/src/backend/access/heap/visibilitymap.c @@ -457,9 +457,14 @@ visibilitymap_count(Relation rel, BlockNumber *all_visible, BlockNumber *all_fro * before they access the VM again. * * nheapblocks is the new size of the heap. + * + * The caller must have WAL-logged the truncation already (if the relation + * needs WAL-logging at all). 'lsn' is the LSN of the XLOG record. It is + * used to stamp the last remaining VM page, so that it doesn't get flushed + * to disk before the WAL record. */ void -visibilitymap_truncate(Relation rel, BlockNumber nheapblocks) +visibilitymap_truncate(Relation rel, BlockNumber nheapblocks, XLogRecPtr lsn) { BlockNumber newnblocks; @@ -524,6 +529,8 @@ visibilitymap_truncate(Relation rel, BlockNumber nheapblocks) map[truncByte] &= (1 << truncOffset) - 1; MarkBufferDirty(mapBuffer); + if (!XLogRecPtrIsInvalid(lsn)) + PageSetLSN(page, lsn); UnlockReleaseBuffer(mapBuffer); } else diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c index 0d8311c..6c6e21f 100644 --- a/src/backend/catalog/storage.c +++ b/src/backend/catalog/storage.c @@ -28,6 +28,7 @@ #include "catalog/storage.h" #include "catalog/storage_xlog.h" #include "storage/freespace.h" +#include "storage/proc.h" #include "storage/smgr.h" #include "utils/memutils.h" #include "utils/rel.h" @@ -226,9 +227,6 @@ RelationPreserveStorage(RelFileNode rnode, bool atCommit) void RelationTruncate(Relation rel, BlockNumber nblocks) { - bool fsm; - bool vm; - /* Open it at the smgr level if not already done */ RelationOpenSmgr(rel); @@ -239,56 +237,85 @@ RelationTruncate(Relation rel, BlockNumber nblocks) rel->rd_smgr->smgr_fsm_nblocks = InvalidBlockNumber; rel->rd_smgr->smgr_vm_nblocks = InvalidBlockNumber; - /* Truncate the FSM first if it exists */ - fsm = smgrexists(rel->rd_smgr, FSM_FORKNUM); - if (fsm) - FreeSpaceMapTruncateRel(rel, nblocks); - - /* Truncate the visibility map too if it exists. */ - vm = smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM); - if (vm) - visibilitymap_truncate(rel, nblocks); - - /* - * We WAL-log the truncation before actually truncating, which means - * trouble if the truncation fails. If we then crash, the WAL replay - * likely isn't going to succeed in the truncation either, and cause a - * PANIC. It's tempting to put a critical section here, but that cure - * would be worse than the disease. It would turn a usually harmless - * failure to truncate, that might spell trouble at WAL replay, into a - * certain PANIC. - */ - if (RelationNeedsWAL(rel)) + PG_TRY(); { + bool fsm; + bool vm; + XLogRecPtr lsn = InvalidXLogRecPtr; + /* - * Make an XLOG entry reporting the file truncation. + * Look first for the existence of the VM and FSM to control + * properly the flush of WAL record marking relation truncation. */ - XLogRecPtr lsn; - xl_smgr_truncate xlrec; + fsm = smgrexists(rel->rd_smgr, FSM_FORKNUM); + vm = smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM); - xlrec.blkno = nblocks; - xlrec.rnode = rel->rd_node; - xlrec.flags = SMGR_TRUNCATE_ALL; + /* + * We WAL-log the truncation before actually truncating, which means + * trouble if the truncation fails. If we then crash, the WAL replay + * likely isn't going to succeed in the truncation either, and cause a + * PANIC. It's tempting to put a critical section here, but that cure + * would be worse than the disease. It would turn a usually harmless + * failure to truncate, that might spell trouble at WAL replay, into a + * certain PANIC. + */ + if (RelationNeedsWAL(rel)) + { + /* + * Make an XLOG entry reporting the file truncation. + */ + xl_smgr_truncate xlrec; + + /* + * Since we're not holding a buffer lock, use delayChkpt to + * prevent a checkpoint from happening between WAL-logging, and + * performing the truncation. + */ + MyPgXact->delayChkpt = true; + + xlrec.blkno = nblocks; + xlrec.rnode = rel->rd_node; + xlrec.flags = SMGR_TRUNCATE_ALL; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(xlrec)); + + lsn = XLogInsert(RM_SMGR_ID, + XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE); + + /* + * Flush, because otherwise the truncation of the main relation + * might hit the disk before the WAL record, and the truncation of + * the FSM or visibility map. If we crashed during that window, + * we'd be left with a truncated heap, but the FSM or visibility + * map would still contain entries for the non-existent heap + * pages. + */ + if (fsm || vm) + XLogFlush(lsn); + } - XLogBeginInsert(); - XLogRegisterData((char *) &xlrec, sizeof(xlrec)); + /* Truncate the FSM first if it exists */ + if (fsm) + FreeSpaceMapTruncateRel(rel, nblocks, lsn); - lsn = XLogInsert(RM_SMGR_ID, - XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE); + /* Truncate the visibility map too if it exists. */ + if (vm) + visibilitymap_truncate(rel, nblocks, lsn); - /* - * Flush, because otherwise the truncation of the main relation might - * hit the disk before the WAL record, and the truncation of the FSM - * or visibility map. If we crashed during that window, we'd be left - * with a truncated heap, but the FSM or visibility map would still - * contain entries for the non-existent heap pages. - */ - if (fsm || vm) - XLogFlush(lsn); + /* Do the real work */ + smgrtruncate(rel->rd_smgr, MAIN_FORKNUM, nblocks); + } + PG_CATCH(); + { + if (RelationNeedsWAL(rel)) + MyPgXact->delayChkpt = false; + PG_RE_THROW(); } + PG_END_TRY(); - /* Do the real work */ - smgrtruncate(rel->rd_smgr, MAIN_FORKNUM, nblocks); + if (RelationNeedsWAL(rel)) + MyPgXact->delayChkpt = false; } /* @@ -536,10 +563,10 @@ smgr_redo(XLogReaderState *record) if ((xlrec->flags & SMGR_TRUNCATE_FSM) != 0 && smgrexists(reln, FSM_FORKNUM)) - FreeSpaceMapTruncateRel(rel, xlrec->blkno); + FreeSpaceMapTruncateRel(rel, xlrec->blkno, lsn); if ((xlrec->flags & SMGR_TRUNCATE_VM) != 0 && smgrexists(reln, VISIBILITYMAP_FORKNUM)) - visibilitymap_truncate(rel, xlrec->blkno); + visibilitymap_truncate(rel, xlrec->blkno, lsn); FreeFakeRelcacheEntry(rel); } diff --git a/src/backend/storage/freespace/freespace.c b/src/backend/storage/freespace/freespace.c index bbd90c9..0e675c8 100644 --- a/src/backend/storage/freespace/freespace.c +++ b/src/backend/storage/freespace/freespace.c @@ -294,9 +294,14 @@ GetRecordedFreeSpace(Relation rel, BlockNumber heapBlk) * before they access the FSM again. * * nblocks is the new size of the heap. + * + * The caller must have WAL-logged the truncation already (if the relation + * needs WAL-logging at all). 'lsn' is the LSN of the XLOG record. It is + * used to stamp the last remaining VM page, so that it doesn't get flushed + * to disk before the WAL record. */ void -FreeSpaceMapTruncateRel(Relation rel, BlockNumber nblocks) +FreeSpaceMapTruncateRel(Relation rel, BlockNumber nblocks, XLogRecPtr lsn) { BlockNumber new_nfsmblocks; FSMAddress first_removed_address; @@ -327,8 +332,13 @@ FreeSpaceMapTruncateRel(Relation rel, BlockNumber nblocks) if (!BufferIsValid(buf)) return; /* nothing to do; the FSM was already smaller */ LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + fsm_truncate_avail(BufferGetPage(buf), first_removed_slot); - MarkBufferDirtyHint(buf, false); + + MarkBufferDirty(buf); + if (!XLogRecPtrIsInvalid(lsn)) + PageSetLSN(BufferGetPage(buf), lsn); + UnlockReleaseBuffer(buf); new_nfsmblocks = fsm_logical_to_physical(first_removed_address) + 1; diff --git a/src/include/access/visibilitymap.h b/src/include/access/visibilitymap.h index 00bbd4c..8fe8906 100644 --- a/src/include/access/visibilitymap.h +++ b/src/include/access/visibilitymap.h @@ -44,6 +44,6 @@ extern void visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf, uint8 flags); extern uint8 visibilitymap_get_status(Relation rel, BlockNumber heapBlk, Buffer *vmbuf); extern void visibilitymap_count(Relation rel, BlockNumber *all_visible, BlockNumber *all_frozen); -extern void visibilitymap_truncate(Relation rel, BlockNumber nheapblocks); +extern void visibilitymap_truncate(Relation rel, BlockNumber nheapblocks, XLogRecPtr lsn); #endif /* VISIBILITYMAP_H */ diff --git a/src/include/storage/freespace.h b/src/include/storage/freespace.h index 77b3bc3..ac014a4 100644 --- a/src/include/storage/freespace.h +++ b/src/include/storage/freespace.h @@ -14,6 +14,7 @@ #ifndef FREESPACE_H_ #define FREESPACE_H_ +#include "access/xlogdefs.h" #include "storage/block.h" #include "storage/relfilenode.h" #include "utils/relcache.h" @@ -30,7 +31,7 @@ extern void RecordPageWithFreeSpace(Relation rel, BlockNumber heapBlk, extern void XLogRecordPageWithFreeSpace(RelFileNode rnode, BlockNumber heapBlk, Size spaceAvail); -extern void FreeSpaceMapTruncateRel(Relation rel, BlockNumber nblocks); +extern void FreeSpaceMapTruncateRel(Relation rel, BlockNumber nblocks, XLogRecPtr lsn); extern void FreeSpaceMapVacuum(Relation rel); extern void UpdateFreeSpaceMap(Relation rel, BlockNumber startBlkNum, diff --git a/src/test/recovery/t/008_fsm_check.pl b/src/test/recovery/t/008_fsm_check.pl new file mode 100644 index 0000000..2c58e31 --- /dev/null +++ b/src/test/recovery/t/008_fsm_check.pl @@ -0,0 +1,90 @@ +# Set of tests to check file-space map consistency on standbys +use strict; +use warnings; + +use PostgresNode; +use TestLib; +use Test::More tests => 1; + +my $node_master = get_new_node('master'); +$node_master->init(allows_streaming => 1); + +$node_master->append_conf('postgresql.conf', qq{ +fsync = on +wal_level = replica +wal_log_hints = on +max_prepared_transactions = 5 +autovacuum = off +}); + +# Create a master node and its standby, initializing both with +# some data at the same time. +$node_master->start; + +$node_master->backup('master_backup'); +my $node_standby = get_new_node('standby'); +$node_standby->init_from_backup($node_master, 'master_backup', + has_streaming => 1); +$node_standby->start; + +$node_master->psql('postgres', qq{ +create table testtab (a int, b char(100)); +insert into testtab select generate_series(1,1000), 'foo'; +insert into testtab select generate_series(1,1000), 'foo'; +delete from testtab where ctid > '(8,0)'; +}); + +# Take a lock on the table to prevent following vacuum from truncating it +$node_master->psql('postgres', qq{ +begin; +lock table testtab in row share mode; +prepare transaction 'p1'; +}); + +# Vacuum, update FSM without truncation +$node_master->psql('postgres', 'vacuum verbose testtab'); + +# Force a checkpoint +$node_master->psql('postgres', 'checkpoint'); + +# Now do some more insert/deletes, another vacuum to ensure FPW writes are +# done +$node_master->psql('postgres', qq{ +insert into testtab select generate_series(1,1000), 'foo'; +delete from testtab where ctid > '(8,0)'; +vacuum verbose testtab; +}); + +# Ensure all buffers are now clean on the standby +$node_standby->psql('postgres', 'checkpoint'); + +# Release the lock, vacuum again which should lead to truncation +$node_master->psql('postgres', qq{ +rollback prepared 'p1'; +vacuum verbose testtab; +}); + +$node_master->psql('postgres', 'checkpoint'); +my $until_lsn = + $node_master->safe_psql('postgres', "SELECT pg_current_xlog_location();"); + +# Wait long enough for standby to receive and apply all WAL +my $caughtup_query = + "SELECT '$until_lsn'::pg_lsn <= pg_last_xlog_replay_location()"; +$node_standby->poll_query_until('postgres', $caughtup_query) + or die "Timed out while waiting for standby to catch up"; + +# now promote the standby +$node_standby->promote; +$node_standby->poll_query_until('postgres', + "SELECT NOT pg_is_in_recovery()") + or die "Timed out while waiting for promotion of standby"; +$node_standby->psql('postgres', 'checkpoint'); + +# restart to discard in-memory copy of FSM +$node_standby->restart; + +# Insert should work on standby +is($node_standby->psql('postgres', + qq{insert into testtab select generate_series(1,1000), 'foo';}), + 0, 'INSERT succeeds with truncated relation FSM');