From 7d0ab70fff64fa38209932a05d8d4e2e2193d8ec Mon Sep 17 00:00:00 2001 From: Kyotaro Horiguchi Date: Mon, 4 Sep 2023 17:23:05 +0900 Subject: [PATCH v31 3/3] In-place table persistence change Previously, ALTER TABLE SET LOGGED/UNLOGGED caused a large amount of file I/O due to heap rewrites, even though ALTER TABLE SET UNLOGGED does not require any data rewrites. This patch eliminates the need for rewrites. Additionally, ALTER TABLE SET LOGGED is updated to emit XLOG_FPI records instead of numerous HEAP_INSERTs when wal_level > minimal, reducing resource consumption. --- src/backend/access/rmgrdesc/smgrdesc.c | 12 + src/backend/catalog/storage.c | 338 ++++++++++++++++++++++++- src/backend/commands/tablecmds.c | 269 +++++++++++++++++--- src/backend/storage/buffer/bufmgr.c | 84 ++++++ src/bin/pg_rewind/parsexlog.c | 6 + src/include/catalog/storage_xlog.h | 10 + src/include/storage/bufmgr.h | 3 + src/include/storage/reinit.h | 2 +- src/tools/pgindent/typedefs.list | 1 + 9 files changed, 684 insertions(+), 41 deletions(-) diff --git a/src/backend/access/rmgrdesc/smgrdesc.c b/src/backend/access/rmgrdesc/smgrdesc.c index 71410e0a2d..77a8fdb045 100644 --- a/src/backend/access/rmgrdesc/smgrdesc.c +++ b/src/backend/access/rmgrdesc/smgrdesc.c @@ -40,6 +40,15 @@ smgr_desc(StringInfo buf, XLogReaderState *record) xlrec->blkno, xlrec->flags); pfree(path); } + else if (info == XLOG_SMGR_BUFPERSISTENCE) + { + xl_smgr_bufpersistence *xlrec = (xl_smgr_bufpersistence *) rec; + char *path = relpathperm(xlrec->rlocator, MAIN_FORKNUM); + + appendStringInfoString(buf, path); + appendStringInfo(buf, " persistence %d", xlrec->persistence); + pfree(path); + } } const char * @@ -55,6 +64,9 @@ smgr_identify(uint8 info) case XLOG_SMGR_TRUNCATE: id = "TRUNCATE"; break; + case XLOG_SMGR_BUFPERSISTENCE: + id = "BUFPERSISTENCE"; + break; } return id; diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c index 03553c4980..6616466f61 100644 --- a/src/backend/catalog/storage.c +++ 
b/src/backend/catalog/storage.c @@ -71,11 +71,13 @@ typedef struct PendingRelDelete } PendingRelDelete; #define PCOP_UNLINK_FORK (1 << 0) +#define PCOP_SET_PERSISTENCE (1 << 1) typedef struct PendingCleanup { RelFileLocator rlocator; /* relation that need a cleanup */ int op; /* operation mask */ + bool bufpersistence; /* buffer persistence to set */ ForkNumber unlink_forknum; /* forknum to unlink */ BackendId backend; /* InvalidBackendId if not a temp rel */ bool atCommit; /* T=delete at commit; F=delete at abort */ @@ -209,6 +211,208 @@ RelationCreateStorage(RelFileLocator rlocator, char relpersistence, return srel; } +/* + * RelationCreateInitFork + * Create physical storage for the init fork of a relation. + * + * Create the init fork for the relation. + * + * This function is transactional. The creation is WAL-logged, and if the + * transaction aborts later on, the init fork will be removed. + */ +void +RelationCreateInitFork(Relation rel) +{ + RelFileLocator rlocator = rel->rd_locator; + PendingCleanup *pending; + PendingCleanup *prev; + PendingCleanup *next; + SMgrRelation srel; + ul_uncommitted_storage ul_storage; + bool create = true; + + /* switch buffer persistence */ + SetRelationBuffersPersistence(RelationGetSmgr(rel), false, false); + + /* + * If a pending-unlink exists for this relation's init-fork, it indicates + * the init-fork existed before the current transaction; this function + * reverts the pending-unlink by removing the entry. See + * RelationDropInitFork. 
+ */ + prev = NULL; + for (pending = pendingCleanups; pending != NULL; pending = next) + { + next = pending->next; + + if (RelFileLocatorEquals(rlocator, pending->rlocator) && + pending->unlink_forknum == INIT_FORKNUM) + { + /* write cancel log for preceding undo log entry */ + ul_storage.rlocator = rlocator; + ul_storage.forknum = INIT_FORKNUM; + ul_storage.remove = false; + SimpleUndoLogWrite(RM_SMGR_ID, ULOG_SMGR_UNCOMMITED_STORAGE, + GetCurrentTransactionId(), + &ul_storage, sizeof(ul_storage)); + + if (prev) + prev->next = next; + else + pendingCleanups = next; + + pfree(pending); + /* prev does not change */ + + create = false; + } + else + prev = pending; + } + + if (!create) + return; + + /* create undo log entry, then the init fork */ + srel = smgropen(rlocator, InvalidBackendId); + + /* write undo log */ + ul_storage.rlocator = rlocator; + ul_storage.forknum = INIT_FORKNUM; + ul_storage.remove = true; + SimpleUndoLogWrite(RM_SMGR_ID, ULOG_SMGR_UNCOMMITED_STORAGE, + GetCurrentTransactionId(), + &ul_storage, sizeof(ul_storage)); + + /* We don't have existing init fork, create it. */ + smgrcreate(srel, INIT_FORKNUM, false); + + /* + * For index relations, WAL-logging and file sync are handled by + * ambuildempty. In contrast, for heap relations, these tasks are performed + * directly. 
+ */ + if (rel->rd_rel->relkind == RELKIND_INDEX) + rel->rd_indam->ambuildempty(rel); + else + { + log_smgrcreate(&rlocator, INIT_FORKNUM); + smgrimmedsync(srel, INIT_FORKNUM); + } + + /* drop the init fork, mark file then revert persistence at abort */ + pending = (PendingCleanup *) + MemoryContextAlloc(TopMemoryContext, sizeof(PendingCleanup)); + pending->rlocator = rlocator; + pending->op = PCOP_UNLINK_FORK | PCOP_SET_PERSISTENCE; + pending->unlink_forknum = INIT_FORKNUM; + pending->bufpersistence = true; + pending->backend = InvalidBackendId; + pending->atCommit = false; + pending->nestLevel = GetCurrentTransactionNestLevel(); + pending->next = pendingCleanups; + pendingCleanups = pending; +} + +/* + * RelationDropInitFork + * Delete physical storage for the init fork of a relation. + */ +void +RelationDropInitFork(Relation rel) +{ + RelFileLocator rlocator = rel->rd_locator; + PendingCleanup *pending; + PendingCleanup *prev; + PendingCleanup *next; + bool inxact_created = false; + + /* switch buffer persistence */ + SetRelationBuffersPersistence(RelationGetSmgr(rel), true, false); + + /* + * Search for a pending-unlink associated with the init-fork of the + * relation. Its presence indicates that the init-fork was created within + * the current transaction. 
+ */ + prev = NULL; + for (pending = pendingCleanups; pending != NULL; pending = next) + { + next = pending->next; + + if (RelFileLocatorEquals(rlocator, pending->rlocator) && + pending->unlink_forknum == INIT_FORKNUM) + { + ul_uncommitted_storage ul_storage; + + /* write cancel log for preceding undo log entry */ + ul_storage.rlocator = rlocator; + ul_storage.forknum = INIT_FORKNUM; + ul_storage.remove = false; + SimpleUndoLogWrite(RM_SMGR_ID, ULOG_SMGR_UNCOMMITED_STORAGE, + GetCurrentTransactionId(), + &ul_storage, sizeof(ul_storage)); + + /* unlink list entry */ + if (prev) + prev->next = next; + else + pendingCleanups = next; + + pfree(pending); + + /* prev does not change */ + + inxact_created = true; + } + else + prev = pending; + } + + /* + * If the init-fork was created in this transaction, remove the init-fork + * and cancel preceding undo log. Otherwise, register an at-commit + * pending-unlink for the existing init-fork. See RelationCreateInitFork. + */ + if (inxact_created) + { + SMgrRelation srel = smgropen(rlocator, InvalidBackendId); + ForkNumber forknum = INIT_FORKNUM; + BlockNumber firstblock = 0; + ul_uncommitted_storage ul_storage; + + /* + * Some AMs initialize init-fork via the buffer manager. To properly + * drop the init-fork, first drop all buffers for the init-fork, then + * unlink the init-fork and cancel preceding undo log. 
+ */ + DropRelationBuffers(srel, &forknum, 1, &firstblock); + + /* cancel existing undo log */ + ul_storage.rlocator = rlocator; + ul_storage.forknum = INIT_FORKNUM; + ul_storage.remove = false; + SimpleUndoLogWrite(RM_SMGR_ID, ULOG_SMGR_UNCOMMITED_STORAGE, + GetCurrentTransactionId(), + &ul_storage, sizeof(ul_storage)); + log_smgrunlink(&rlocator, INIT_FORKNUM); + smgrunlink(srel, INIT_FORKNUM, false); + return; + } + + /* register drop of this init fork file at commit */ + pending = (PendingCleanup *) + MemoryContextAlloc(TopMemoryContext, sizeof(PendingCleanup)); + pending->rlocator = rlocator; + pending->op = PCOP_UNLINK_FORK; + pending->unlink_forknum = INIT_FORKNUM; + pending->backend = InvalidBackendId; + pending->atCommit = true; + pending->nestLevel = GetCurrentTransactionNestLevel(); + pending->next = pendingCleanups; + pendingCleanups = pending; +} + /* * Perform XLogInsert of an XLOG_SMGR_CREATE record to WAL. */ @@ -248,6 +452,25 @@ log_smgrunlink(const RelFileLocator *rlocator, ForkNumber forkNum) XLogInsert(RM_SMGR_ID, XLOG_SMGR_UNLINK | XLR_SPECIAL_REL_UPDATE); } +/* + * Perform XLogInsert of an XLOG_SMGR_BUFPERSISTENCE record to WAL. + */ +void +log_smgrbufpersistence(const RelFileLocator rlocator, bool persistence) +{ + xl_smgr_bufpersistence xlrec; + + /* + * Make an XLOG entry reporting the change of buffer persistence. + */ + xlrec.rlocator = rlocator; + xlrec.persistence = persistence; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(xlrec)); + XLogInsert(RM_SMGR_ID, XLOG_SMGR_BUFPERSISTENCE | XLR_SPECIAL_REL_UPDATE); +} + /* * RelationDropStorage * Schedule unlinking of physical storage at transaction commit. 
@@ -800,7 +1023,14 @@ smgrDoPendingCleanups(bool isCommit) srel = smgropen(pending->rlocator, pending->backend); - Assert((pending->op & ~(PCOP_UNLINK_FORK)) == 0); + Assert((pending->op & + ~(PCOP_UNLINK_FORK | PCOP_SET_PERSISTENCE)) == 0); + + if (pending->op & PCOP_SET_PERSISTENCE) + { + SetRelationBuffersPersistence(srel, pending->bufpersistence, + InRecovery); + } if (pending->op & PCOP_UNLINK_FORK) { @@ -1200,6 +1430,59 @@ smgr_redo(XLogReaderState *record) FreeFakeRelcacheEntry(rel); } + else if (info == XLOG_SMGR_BUFPERSISTENCE) + { + xl_smgr_bufpersistence *xlrec = + (xl_smgr_bufpersistence *) XLogRecGetData(record); + SMgrRelation reln; + PendingCleanup *pending; + PendingCleanup *prev = NULL; + + reln = smgropen(xlrec->rlocator, InvalidBackendId); + SetRelationBuffersPersistence(reln, xlrec->persistence, true); + + /* + * Delete any pending action for persistence change, if present. There + * should be at most one entry for this action. + */ + for (pending = pendingCleanups; pending != NULL; + pending = pending->next) + { + if (RelFileLocatorEquals(xlrec->rlocator, pending->rlocator) && + (pending->op & PCOP_SET_PERSISTENCE) != 0) + { + Assert(pending->bufpersistence == xlrec->persistence); + + if (prev) + prev->next = pending->next; + else + pendingCleanups = pending->next; + + pfree(pending); + break; + } + + prev = pending; + } + + /* + * During abort, revert any changes to buffer persistence made in + * this transaction. 
+ */ + if (!pending) + { + pending = (PendingCleanup *) + MemoryContextAlloc(TopMemoryContext, sizeof(PendingCleanup)); + pending->rlocator = xlrec->rlocator; + pending->op = PCOP_SET_PERSISTENCE; + pending->bufpersistence = !xlrec->persistence; + pending->backend = InvalidBackendId; + pending->atCommit = false; + pending->nestLevel = GetCurrentTransactionNestLevel(); + pending->next = pendingCleanups; + pendingCleanups = pending; + } + } else elog(PANIC, "smgr_redo: unknown op code %u", info); } diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index fdcd09bc5e..ea750812cc 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -55,6 +55,7 @@ #include "commands/defrem.h" #include "commands/event_trigger.h" #include "commands/policy.h" +#include "commands/progress.h" #include "commands/sequence.h" #include "commands/tablecmds.h" #include "commands/tablespace.h" @@ -5661,6 +5662,189 @@ ATParseTransformCmd(List **wqueue, AlteredTableInfo *tab, Relation rel, return newcmd; } +/* + * RelationChangePersistence: perform in-place persistence change of a relation + */ +static void +RelationChangePersistence(AlteredTableInfo *tab, char persistence, + LOCKMODE lockmode) +{ + Relation rel; + Relation classRel; + HeapTuple tuple, + newtuple; + Datum new_val[Natts_pg_class]; + bool new_null[Natts_pg_class], + new_repl[Natts_pg_class]; + int i; + List *relids; + ListCell *lc_oid; + + Assert(tab->rewrite == AT_REWRITE_ALTER_PERSISTENCE); + Assert(lockmode == AccessExclusiveLock); + + /* + * Use ATRewriteTable instead of this function under the following + * condition. 
+ */ + Assert(tab->constraints == NULL && tab->partition_constraint == NULL && + tab->newvals == NULL && !tab->verify_new_notnull); + + rel = table_open(tab->relid, lockmode); + + Assert(rel->rd_rel->relpersistence != persistence); + + elog(DEBUG1, "perform in-place persistence change"); + + /* + * Initially, gather all relations that require a persistence change. + */ + + /* Collect OIDs of indexes and toast relations */ + relids = RelationGetIndexList(rel); + relids = lcons_oid(rel->rd_id, relids); + + /* Add toast relation if any */ + if (OidIsValid(rel->rd_rel->reltoastrelid)) + { + List *toastidx; + Relation toastrel = table_open(rel->rd_rel->reltoastrelid, lockmode); + + relids = lappend_oid(relids, rel->rd_rel->reltoastrelid); + toastidx = RelationGetIndexList(toastrel); + relids = list_concat(relids, toastidx); + pfree(toastidx); + table_close(toastrel, NoLock); + } + + table_close(rel, NoLock); + + /* Make changes in storage */ + classRel = table_open(RelationRelationId, RowExclusiveLock); + + foreach(lc_oid, relids) + { + Oid reloid = lfirst_oid(lc_oid); + Relation r = relation_open(reloid, lockmode); + + /* + * XXXX: Some access methods don't support in-place persistence + * changes. GiST uses page LSNs to figure out whether a block has been + * modified. However, UNLOGGED GiST indexes use fake LSNs, which are + * incompatible with the real LSNs used for LOGGED indexes. + * + * Potentially, if gistGetFakeLSN behaved similarly for both permanent + * and unlogged indexes, we could avoid index rebuilds by emitting + * extra WAL records while the index is unlogged. + * + * Compare relam against a positive list to ensure the hard way is + * taken for unknown AMs. 
+ */ + if (r->rd_rel->relkind == RELKIND_INDEX && + /* GiST is excluded */ + r->rd_rel->relam != BTREE_AM_OID && + r->rd_rel->relam != HASH_AM_OID && + r->rd_rel->relam != GIN_AM_OID && + r->rd_rel->relam != SPGIST_AM_OID && + r->rd_rel->relam != BRIN_AM_OID) + { + int reindex_flags; + ReindexParams params = {0}; + + /* reindex doesn't allow concurrent use of the index */ + table_close(r, NoLock); + + reindex_flags = + REINDEX_REL_SUPPRESS_INDEX_USE | + REINDEX_REL_CHECK_CONSTRAINTS; + + /* Set the same persistence as the parent relation. */ + if (persistence == RELPERSISTENCE_UNLOGGED) + reindex_flags |= REINDEX_REL_FORCE_INDEXES_UNLOGGED; + else + reindex_flags |= REINDEX_REL_FORCE_INDEXES_PERMANENT; + + /* this doesn't fire REINDEX event trigger */ + reindex_index(NULL, reloid, reindex_flags, persistence, &params); + + continue; + } + + /* Create or drop init fork */ + if (persistence == RELPERSISTENCE_UNLOGGED) + RelationCreateInitFork(r); + else + RelationDropInitFork(r); + + /* + * If this relation becomes WAL-logged, immediately sync all files + * except the init-fork to establish the initial state on storage. The + * buffers should have already been flushed out by + * RelationCreate(Drop)InitFork called just above. The init-fork should + * already be synchronized as required. 
+ */ + if (persistence == RELPERSISTENCE_PERMANENT) + { + for (i = 0; i < INIT_FORKNUM; i++) + { + if (smgrexists(RelationGetSmgr(r), i)) + smgrimmedsync(RelationGetSmgr(r), i); + } + } + + /* Update catalog */ + tuple = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(reloid)); + if (!HeapTupleIsValid(tuple)) + elog(ERROR, "cache lookup failed for relation %u", reloid); + + memset(new_val, 0, sizeof(new_val)); + memset(new_null, false, sizeof(new_null)); + memset(new_repl, false, sizeof(new_repl)); + + new_val[Anum_pg_class_relpersistence - 1] = CharGetDatum(persistence); + new_null[Anum_pg_class_relpersistence - 1] = false; + new_repl[Anum_pg_class_relpersistence - 1] = true; + + newtuple = heap_modify_tuple(tuple, RelationGetDescr(classRel), + new_val, new_null, new_repl); + + CatalogTupleUpdate(classRel, &newtuple->t_self, newtuple); + heap_freetuple(newtuple); + + /* + * If wal_level >= replica, switching to LOGGED necessitates WAL-logging + * the relation content for later recovery. This is not emitted when + * wal_level = minimal. + */ + if (persistence == RELPERSISTENCE_PERMANENT && XLogIsNeeded()) + { + ForkNumber fork; + xl_smgr_truncate xlrec; + + xlrec.blkno = 0; + xlrec.rlocator = r->rd_locator; + xlrec.flags = SMGR_TRUNCATE_ALL; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(xlrec)); + + XLogInsert(RM_SMGR_ID, XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE); + + for (fork = 0; fork < INIT_FORKNUM; fork++) + { + if (smgrexists(RelationGetSmgr(r), fork)) + log_newpage_range(r, fork, 0, + smgrnblocks(RelationGetSmgr(r), fork), + false); + } + } + + table_close(r, NoLock); + } + + table_close(classRel, NoLock); +} + /* * ATRewriteTables: ALTER TABLE phase 3 */ @@ -5791,48 +5975,55 @@ ATRewriteTables(AlterTableStmt *parsetree, List **wqueue, LOCKMODE lockmode, tab->relid, tab->rewrite); - /* - * Create transient table that will receive the modified data. - * - * Ensure it is marked correctly as logged or unlogged. 
We have - * to do this here so that buffers for the new relfilenumber will - * have the right persistence set, and at the same time ensure - * that the original filenumbers's buffers will get read in with - * the correct setting (i.e. the original one). Otherwise a - * rollback after the rewrite would possibly result with buffers - * for the original filenumbers having the wrong persistence - * setting. - * - * NB: This relies on swap_relation_files() also swapping the - * persistence. That wouldn't work for pg_class, but that can't be - * unlogged anyway. - */ - OIDNewHeap = make_new_heap(tab->relid, NewTableSpace, NewAccessMethod, - persistence, lockmode); + if (tab->rewrite == AT_REWRITE_ALTER_PERSISTENCE) + RelationChangePersistence(tab, persistence, lockmode); + else + { + /* + * Create transient table that will receive the modified data. + * + * Ensure it is marked correctly as logged or unlogged. We + * have to do this here so that buffers for the new + * relfilenumber will have the right persistence set, and at + * the same time ensure that the original filenumbers's buffers + * will get read in with the correct setting (i.e. the original + * one). Otherwise a rollback after the rewrite would possibly + * result with buffers for the original filenumbers having the + * wrong persistence setting. + * + * NB: This relies on swap_relation_files() also swapping the + * persistence. That wouldn't work for pg_class, but that + * can't be unlogged anyway. + */ + OIDNewHeap = make_new_heap(tab->relid, NewTableSpace, + NewAccessMethod, + persistence, lockmode); - /* - * Copy the heap data into the new table with the desired - * modifications, and test the current data within the table - * against new constraints generated by ALTER TABLE commands. 
- */ - ATRewriteTable(tab, OIDNewHeap, lockmode); + /* + * Copy the heap data into the new table with the desired + * modifications, and test the current data within the table + * against new constraints generated by ALTER TABLE commands. + */ + ATRewriteTable(tab, OIDNewHeap, lockmode); - /* - * Swap the physical files of the old and new heaps, then rebuild - * indexes and discard the old heap. We can use RecentXmin for - * the table's new relfrozenxid because we rewrote all the tuples - * in ATRewriteTable, so no older Xid remains in the table. Also, - * we never try to swap toast tables by content, since we have no - * interest in letting this code work on system catalogs. - */ - finish_heap_swap(tab->relid, OIDNewHeap, - false, false, true, - !OidIsValid(tab->newTableSpace), - RecentXmin, - ReadNextMultiXactId(), - persistence); + /* + * Swap the physical files of the old and new heaps, then + * rebuild indexes and discard the old heap. We can use + * RecentXmin for the table's new relfrozenxid because we + * rewrote all the tuples in ATRewriteTable, so no older Xid + * remains in the table. Also, we never try to swap toast + * tables by content, since we have no interest in letting + * this code work on system catalogs. 
+ */ + finish_heap_swap(tab->relid, OIDNewHeap, + false, false, true, + !OidIsValid(tab->newTableSpace), + RecentXmin, + ReadNextMultiXactId(), + persistence); - InvokeObjectPostAlterHook(RelationRelationId, tab->relid, 0); + InvokeObjectPostAlterHook(RelationRelationId, tab->relid, 0); + } } else if (tab->rewrite > 0 && tab->relkind == RELKIND_SEQUENCE) { diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 7d601bef6d..4de1db412c 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -3804,6 +3804,90 @@ DropRelationBuffers(SMgrRelation smgr_reln, ForkNumber *forkNum, } } +/* --------------------------------------------------------------------- + * SetRelationBuffersPersistence + * + * This function changes the persistence of all buffer pages of a relation + * then writes all dirty pages to disk (or kernel disk buffers) when + * switching to PERMANENT, ensuring the kernel has an up-to-date view of + * the relation. + * + * The caller must be holding AccessExclusiveLock on the target relation + * to ensure no other backend is busy dirtying more blocks. + * + * XXX currently it sequentially searches the buffer pool; consider + * implementing more efficient search methods. This routine isn't used in + * performance-critical code paths, so it's not worth additional overhead + * to make it go faster; see also DropRelationBuffers. 
+ * -------------------------------------------------------------------- + */ +void +SetRelationBuffersPersistence(SMgrRelation srel, bool permanent, bool isRedo) +{ + int i; + RelFileLocatorBackend rlocator = srel->smgr_rlocator; + + Assert(!RelFileLocatorBackendIsTemp(rlocator)); + + if (!isRedo) + log_smgrbufpersistence(srel->smgr_rlocator.locator, permanent); + + ResourceOwnerEnlarge(CurrentResourceOwner); + + for (i = 0; i < NBuffers; i++) + { + BufferDesc *bufHdr = GetBufferDescriptor(i); + uint32 buf_state; + + if (!RelFileLocatorEquals(BufTagGetRelFileLocator(&bufHdr->tag), + rlocator.locator)) + continue; + + ReservePrivateRefCountEntry(); + + buf_state = LockBufHdr(bufHdr); + + if (!RelFileLocatorEquals(BufTagGetRelFileLocator(&bufHdr->tag), + rlocator.locator)) + { + UnlockBufHdr(bufHdr, buf_state); + continue; + } + + if (permanent) + { + /* The init fork is being dropped, drop buffers for it. */ + if (BufTagGetForkNum(&bufHdr->tag) == INIT_FORKNUM) + { + InvalidateBuffer(bufHdr); + continue; + } + + buf_state |= BM_PERMANENT; + pg_atomic_write_u32(&bufHdr->state, buf_state); + + /* flush this buffer when switching to PERMANENT */ + if ((buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY)) + { + PinBuffer_Locked(bufHdr); + LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), + LW_SHARED); + FlushBuffer(bufHdr, srel, IOOBJECT_RELATION, IOCONTEXT_NORMAL); + LWLockRelease(BufferDescriptorGetContentLock(bufHdr)); + UnpinBuffer(bufHdr); + } + else + UnlockBufHdr(bufHdr, buf_state); + } + else + { + /* There shouldn't be an init fork */ + Assert(BufTagGetForkNum(&bufHdr->tag) != INIT_FORKNUM); + UnlockBufHdr(bufHdr, buf_state); + } + } +} + /* --------------------------------------------------------------------- * DropRelationsAllBuffers * diff --git a/src/bin/pg_rewind/parsexlog.c b/src/bin/pg_rewind/parsexlog.c index 525b98899f..c8c9cc361f 100644 --- a/src/bin/pg_rewind/parsexlog.c +++ b/src/bin/pg_rewind/parsexlog.c @@ -418,6 +418,12 @@ 
extractPageInfo(XLogReaderState *record) * source system. */ } + else if (rmid == RM_SMGR_ID && rminfo == XLOG_SMGR_BUFPERSISTENCE) + { + /* + * We can safely ignore these. These don't make any on-disk changes. + */ + } else if (rmid == RM_XACT_ID && ((rminfo & XLOG_XACT_OPMASK) == XLOG_XACT_COMMIT || (rminfo & XLOG_XACT_OPMASK) == XLOG_XACT_COMMIT_PREPARED || diff --git a/src/include/catalog/storage_xlog.h b/src/include/catalog/storage_xlog.h index 807c0f8235..b38909ceb3 100644 --- a/src/include/catalog/storage_xlog.h +++ b/src/include/catalog/storage_xlog.h @@ -14,6 +14,7 @@ #ifndef STORAGE_XLOG_H #define STORAGE_XLOG_H +#include "access/simpleundolog.h" #include "access/xlogreader.h" #include "lib/stringinfo.h" #include "storage/block.h" @@ -30,6 +31,7 @@ #define XLOG_SMGR_CREATE 0x10 #define XLOG_SMGR_TRUNCATE 0x20 #define XLOG_SMGR_UNLINK 0x30 +#define XLOG_SMGR_BUFPERSISTENCE 0x40 typedef struct xl_smgr_create { @@ -44,6 +46,12 @@ typedef struct xl_smgr_unlink ForkNumber forkNum; } xl_smgr_unlink; +typedef struct xl_smgr_bufpersistence +{ + RelFileLocator rlocator; + bool persistence; +} xl_smgr_bufpersistence; + /* flags for xl_smgr_truncate */ #define SMGR_TRUNCATE_HEAP 0x0001 #define SMGR_TRUNCATE_VM 0x0002 @@ -60,6 +68,8 @@ typedef struct xl_smgr_truncate extern void log_smgrcreate(const RelFileLocator *rlocator, ForkNumber forkNum); extern void log_smgrunlink(const RelFileLocator *rlocator, ForkNumber forkNum); +extern void log_smgrbufpersistence(const RelFileLocator rlocator, + bool persistence); extern void smgr_redo(XLogReaderState *record); extern void smgr_desc(StringInfo buf, XLogReaderState *record); diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index d51d46d335..62f4fe430b 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -224,6 +224,9 @@ extern void DropRelationBuffers(struct SMgrRelationData *smgr_reln, int nforks, BlockNumber *firstDelBlock); extern void DropRelationsAllBuffers(struct 
SMgrRelationData **smgr_reln, int nlocators); +extern void SetRelationBuffersPersistence(struct SMgrRelationData *srel, + bool permanent, bool isRedo); + extern void DropDatabaseBuffers(Oid dbid); #define RelationGetNumberOfBlocks(reln) \ diff --git a/src/include/storage/reinit.h b/src/include/storage/reinit.h index c57ae26b4c..746d3a910a 100644 --- a/src/include/storage/reinit.h +++ b/src/include/storage/reinit.h @@ -20,11 +20,11 @@ extern void ResetUnloggedRelations(int op); -extern void ResetUnloggedRelationIgnore(RelFileLocator rloc); extern bool parse_filename_for_nontemp_relation(const char *name, RelFileNumber *relnumber, ForkNumber *fork, unsigned *segno); +extern void ResetUnloggedRelationIgnore(RelFileLocator rloc); #define UNLOGGED_RELATION_CLEANUP 0x0001 #define UNLOGGED_RELATION_INIT 0x0002 diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index cf11358d8d..cf0b0dd51b 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -3968,6 +3968,7 @@ xl_replorigin_set xl_restore_point xl_running_xacts xl_seq_rec +xl_smgr_bufpersistence xl_smgr_create xl_smgr_truncate xl_smgr_unlink -- 2.39.3