Re: In-placre persistance change of a relation - Mailing list pgsql-hackers
From | Kyotaro Horiguchi |
---|---|
Subject | Re: In-placre persistance change of a relation |
Date | |
Msg-id | 20220119.093907.300059856366141656.horikyota.ntt@gmail.com Whole thread Raw |
In response to | Re: In-placre persistance change of a relation (Tom Lane <tgl@sss.pgh.pa.us>) |
Responses |
Re: In-placre persistance change of a relation
|
List | pgsql-hackers |
At Tue, 18 Jan 2022 10:37:53 -0500, Tom Lane <tgl@sss.pgh.pa.us> wrote in > Julien Rouhaud <rjuju123@gmail.com> writes: > > The cfbot is failing on all OS with this version of the patch. Apparently > > v16-0002 introduces some usage of "testtablespace" client-side variable that's > > never defined, e.g. > > That test infrastructure got rearranged very recently, see d6d317dbf. Thanks to both. It seems that even though I know about the change, I forgot to make my repo up to date before checking. The v17 attached changes only the following point (as well as corresponding "expected" file). -+CREATE TABLESPACE regress_tablespace LOCATION :'testtablespace'; ++CREATE TABLESPACE regress_tablespace LOCATION ''; regards. -- Kyotaro Horiguchi NTT Open Source Software Center From c227842521de00d5da9dffb2f2dd86e8c1c171a8 Mon Sep 17 00:00:00 2001 From: Kyotaro Horiguchi <horikyoga.ntt@gmail.com> Date: Wed, 11 Nov 2020 21:51:11 +0900 Subject: [PATCH v17 1/2] In-place table persistence change Even though ALTER TABLE SET LOGGED/UNLOGGED does not require data rewriting, currently it runs heap rewrite which causes large amount of file I/O. This patch makes the command run without heap rewrite. Addition to that, SET LOGGED while wal_level > minimal emits WAL using XLOG_FPI instead of massive number of HEAP_INSERT's, which should be smaller. Also this allows for the cleanup of files left behind in the crash of the transaction that created it. --- src/backend/access/rmgrdesc/smgrdesc.c | 52 ++ src/backend/access/transam/README | 8 + src/backend/access/transam/xact.c | 7 + src/backend/access/transam/xlog.c | 17 + src/backend/catalog/storage.c | 545 +++++++++++++++++- src/backend/commands/tablecmds.c | 266 +++++++-- src/backend/replication/basebackup.c | 3 +- src/backend/storage/buffer/bufmgr.c | 88 +++ src/backend/storage/file/fd.c | 4 +- src/backend/storage/file/reinit.c | 344 +++++++---- src/backend/storage/smgr/md.c | 94 ++- src/backend/storage/smgr/smgr.c | 32 + src/backend/storage/sync/sync.c | 20 +- src/bin/pg_rewind/parsexlog.c | 24 + src/common/relpath.c | 47 +- src/include/catalog/storage.h | 3 + src/include/catalog/storage_xlog.h | 42 +- src/include/common/relpath.h | 9 +- src/include/storage/bufmgr.h | 2 + src/include/storage/fd.h | 1 + src/include/storage/md.h | 8 +- src/include/storage/reinit.h | 10 +- src/include/storage/smgr.h | 17 + src/test/recovery/t/027_persistence_change.pl | 263 +++++++++ 24 files changed, 1724 insertions(+), 182 deletions(-) create mode 100644 src/test/recovery/t/027_persistence_change.pl diff --git a/src/backend/access/rmgrdesc/smgrdesc.c b/src/backend/access/rmgrdesc/smgrdesc.c index 7547813254..2c674e5de0 100644 --- a/src/backend/access/rmgrdesc/smgrdesc.c +++ b/src/backend/access/rmgrdesc/smgrdesc.c @@ -40,6 +40,49 @@ smgr_desc(StringInfo buf, XLogReaderState *record) xlrec->blkno, xlrec->flags); pfree(path); } + else if (info == XLOG_SMGR_UNLINK) + { + xl_smgr_unlink *xlrec = (xl_smgr_unlink *) rec; + char *path = relpathperm(xlrec->rnode, xlrec->forkNum); + + appendStringInfoString(buf, path); + pfree(path); + } + else if (info == XLOG_SMGR_MARK) + { + xl_smgr_mark *xlrec = (xl_smgr_mark *) rec; + char *path = GetRelationPath(xlrec->rnode.dbNode, + xlrec->rnode.spcNode, + xlrec->rnode.relNode, + InvalidBackendId, + xlrec->forkNum, xlrec->mark); + char *action; + + switch (xlrec->action) + { + case XLOG_SMGR_MARK_CREATE: + action = "CREATE"; + break; + case XLOG_SMGR_MARK_UNLINK: + action = "DELETE"; + break; + default: + action = "<unknown action>"; + break; + } + + appendStringInfo(buf, "%s %s", action, path); + pfree(path); + } + else if (info == XLOG_SMGR_BUFPERSISTENCE) + { + xl_smgr_bufpersistence *xlrec = (xl_smgr_bufpersistence *) rec; + char *path = relpathperm(xlrec->rnode, MAIN_FORKNUM); + + appendStringInfoString(buf, path); + appendStringInfo(buf, " persistence %d", xlrec->persistence); + pfree(path); + } } const char * @@ -55,6 +98,15 @@ smgr_identify(uint8 info) case XLOG_SMGR_TRUNCATE: id = "TRUNCATE"; break; + case XLOG_SMGR_UNLINK: + id = "UNLINK"; + break; + case XLOG_SMGR_MARK: + id = "MARK"; + break; + case XLOG_SMGR_BUFPERSISTENCE: + id = "BUFPERSISTENCE"; + break; } return id; diff --git a/src/backend/access/transam/README b/src/backend/access/transam/README index 1edc8180c1..b344bbe511 100644 --- a/src/backend/access/transam/README +++ b/src/backend/access/transam/README @@ -724,6 +724,14 @@ we must panic and abort recovery. The DBA will have to manually clean up then restart recovery. This is part of the reason for not writing a WAL entry until we've successfully done the original action. +The Smgr MARK files +-------------------------------- + +An smgr mark file is created when a new relation file is created to +mark the relfilenode needs to be cleaned up at recovery time. In +contrast to the four actions above, failure to remove smgr mark files +will lead to data loss, in which case the server will shut down. + Skipping WAL for New RelFileNode -------------------------------- diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index c9516e03fa..3c7010eb0f 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -2197,6 +2197,9 @@ CommitTransaction(void) */ smgrDoPendingSyncs(true, is_parallel_worker); + /* Likewise delete mark files for files created during this transaction. */ + smgrDoPendingCleanups(true); + /* close large objects before lower-level cleanup */ AtEOXact_LargeObject(true); @@ -2447,6 +2450,9 @@ PrepareTransaction(void) */ smgrDoPendingSyncs(true, false); + /* Likewise delete mark files for files created during this transaction. */ + smgrDoPendingCleanups(true); + /* close large objects before lower-level cleanup */ AtEOXact_LargeObject(true); @@ -2772,6 +2778,7 @@ AbortTransaction(void) AfterTriggerEndXact(false); /* 'false' means it's abort */ AtAbort_Portals(); smgrDoPendingSyncs(false, is_parallel_worker); + smgrDoPendingCleanups(false); AtEOXact_LargeObject(false); AtAbort_Notify(); AtEOXact_RelationMap(false, is_parallel_worker); diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index c9d4cbf3ff..7cab6a0170 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -40,6 +40,7 @@ #include "catalog/catversion.h" #include "catalog/pg_control.h" #include "catalog/pg_database.h" +#include "catalog/storage.h" #include "commands/progress.h" #include "commands/tablespace.h" #include "common/controldata_utils.h" @@ -4564,6 +4565,14 @@ ReadRecord(XLogReaderState *xlogreader, int emode, { ereport(DEBUG1, (errmsg_internal("reached end of WAL in pg_wal, entering archive recovery"))); + + /* cleanup garbage files left during crash recovery */ + ResetUnloggedRelations(UNLOGGED_RELATION_DROP_BUFFER | + UNLOGGED_RELATION_CLEANUP); + + /* run rollback cleanup if any */ + smgrDoPendingDeletes(false); + InArchiveRecovery = true; if (StandbyModeRequested) StandbyMode = true; @@ -7824,6 +7833,14 @@ StartupXLOG(void) } } + /* cleanup garbage files left during crash recovery */ + if (!InArchiveRecovery) + ResetUnloggedRelations(UNLOGGED_RELATION_DROP_BUFFER | + UNLOGGED_RELATION_CLEANUP); + + /* run rollback cleanup if any */ + smgrDoPendingDeletes(false); + /* Allow resource managers to do any required cleanup. */ for (rmid = 0; rmid <= RM_MAX_ID; rmid++) { diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c index 9b8075536a..92a9451e90 100644 --- a/src/backend/catalog/storage.c +++ b/src/backend/catalog/storage.c @@ -19,6 +19,7 @@ #include "postgres.h" +#include "access/amapi.h" #include "access/parallel.h" #include "access/visibilitymap.h" #include "access/xact.h" @@ -66,6 +67,23 @@ typedef struct PendingRelDelete struct PendingRelDelete *next; /* linked-list link */ } PendingRelDelete; +#define PCOP_UNLINK_FORK (1 << 0) +#define PCOP_UNLINK_MARK (1 << 1) +#define PCOP_SET_PERSISTENCE (1 << 2) + +typedef struct PendingCleanup +{ + RelFileNode relnode; /* relation that may need to be deleted */ + int op; /* operation mask */ + bool bufpersistence; /* buffer persistence to set */ + int unlink_forknum; /* forknum to unlink */ + StorageMarks unlink_mark; /* mark to unlink */ + BackendId backend; /* InvalidBackendId if not a temp rel */ + bool atCommit; /* T=delete at commit; F=delete at abort */ + int nestLevel; /* xact nesting level of request */ + struct PendingCleanup *next; /* linked-list link */ +} PendingCleanup; + typedef struct PendingRelSync { RelFileNode rnode; @@ -73,6 +91,7 @@ typedef struct PendingRelSync } PendingRelSync; static PendingRelDelete *pendingDeletes = NULL; /* head of linked list */ +static PendingCleanup *pendingCleanups = NULL; /* head of linked list */ HTAB *pendingSyncHash = NULL; @@ -117,7 +136,8 @@ AddPendingSync(const RelFileNode *rnode) SMgrRelation RelationCreateStorage(RelFileNode rnode, char relpersistence) { - PendingRelDelete *pending; + PendingRelDelete *pendingdel; + PendingCleanup *pendingclean; SMgrRelation srel; BackendId backend; bool needs_wal; @@ -143,21 +163,41 @@ RelationCreateStorage(RelFileNode rnode, char relpersistence) return NULL; /* placate compiler */ } + /* + * We are going to create a new storage file. If server crashes before the + * current transaction ends the file needs to be cleaned up. The + * SMGR_MARK_UNCOMMITED mark file prompts that work at the next startup. + */ srel = smgropen(rnode, backend); + log_smgrcreatemark(&rnode, MAIN_FORKNUM, SMGR_MARK_UNCOMMITTED); + smgrcreatemark(srel, MAIN_FORKNUM, SMGR_MARK_UNCOMMITTED, false); smgrcreate(srel, MAIN_FORKNUM, false); if (needs_wal) log_smgrcreate(&srel->smgr_rnode.node, MAIN_FORKNUM); /* Add the relation to the list of stuff to delete at abort */ - pending = (PendingRelDelete *) + pendingdel = (PendingRelDelete *) MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete)); - pending->relnode = rnode; - pending->backend = backend; - pending->atCommit = false; /* delete if abort */ - pending->nestLevel = GetCurrentTransactionNestLevel(); - pending->next = pendingDeletes; - pendingDeletes = pending; + pendingdel->relnode = rnode; + pendingdel->backend = backend; + pendingdel->atCommit = false; /* delete if abort */ + pendingdel->nestLevel = GetCurrentTransactionNestLevel(); + pendingdel->next = pendingDeletes; + pendingDeletes = pendingdel; + + /* drop mark files at commit */ + pendingclean = (PendingCleanup *) + MemoryContextAlloc(TopMemoryContext, sizeof(PendingCleanup)); + pendingclean->relnode = rnode; + pendingclean->op = PCOP_UNLINK_MARK; + pendingclean->unlink_forknum = MAIN_FORKNUM; + pendingclean->unlink_mark = SMGR_MARK_UNCOMMITTED; + pendingclean->backend = backend; + pendingclean->atCommit = true; + pendingclean->nestLevel = GetCurrentTransactionNestLevel(); + pendingclean->next = pendingCleanups; + pendingCleanups = pendingclean; if (relpersistence == RELPERSISTENCE_PERMANENT && !XLogIsNeeded()) { @@ -168,6 +208,203 @@ RelationCreateStorage(RelFileNode rnode, char relpersistence) return srel; } +/* + * RelationCreateInitFork + * Create physical storage for the init fork of a relation. + * + * Create the init fork for the relation. + * + * This function is transactional. The creation is WAL-logged, and if the + * transaction aborts later on, the init fork will be removed. + */ +void +RelationCreateInitFork(Relation rel) +{ + RelFileNode rnode = rel->rd_node; + PendingCleanup *pending; + PendingCleanup *prev; + PendingCleanup *next; + SMgrRelation srel; + bool create = true; + + /* switch buffer persistence */ + SetRelationBuffersPersistence(RelationGetSmgr(rel), false, false); + + /* + * If we have entries for init-fork operations on this relation, that means + * that we have already registered pending delete entries to drop an + * init-fork preexisting since before the current transaction started. This + * function reverts that change just by removing the entries. + */ + prev = NULL; + for (pending = pendingCleanups; pending != NULL; pending = next) + { + next = pending->next; + + if (RelFileNodeEquals(rnode, pending->relnode) && + pending->unlink_forknum == INIT_FORKNUM) + { + if (prev) + prev->next = next; + else + pendingCleanups = next; + + pfree(pending); + /* prev does not change */ + + create = false; + } + else + prev = pending; + } + + if (!create) + return; + + /* + * We are going to create an init fork. If server crashes before the + * current transaction ends the init fork left alone corrupts data while + * recovery. The mark file works as the sentinel to identify that + * situation. + */ + srel = smgropen(rnode, InvalidBackendId); + log_smgrcreatemark(&rnode, INIT_FORKNUM, SMGR_MARK_UNCOMMITTED); + smgrcreatemark(srel, INIT_FORKNUM, SMGR_MARK_UNCOMMITTED, false); + + /* We don't have existing init fork, create it. */ + smgrcreate(srel, INIT_FORKNUM, false); + + /* + * index-init fork needs further initialization. ambuildempty shoud do + * WAL-log and file sync by itself but otherwise we do that by ourselves. + */ + if (rel->rd_rel->relkind == RELKIND_INDEX) + rel->rd_indam->ambuildempty(rel); + else + { + log_smgrcreate(&rnode, INIT_FORKNUM); + smgrimmedsync(srel, INIT_FORKNUM); + } + + /* drop the init fork, mark file and revert persistence at abort */ + pending = (PendingCleanup *) + MemoryContextAlloc(TopMemoryContext, sizeof(PendingCleanup)); + pending->relnode = rnode; + pending->op = PCOP_UNLINK_FORK | PCOP_UNLINK_MARK | PCOP_SET_PERSISTENCE; + pending->unlink_forknum = INIT_FORKNUM; + pending->unlink_mark = SMGR_MARK_UNCOMMITTED; + pending->bufpersistence = true; + pending->backend = InvalidBackendId; + pending->atCommit = false; + pending->nestLevel = GetCurrentTransactionNestLevel(); + pending->next = pendingCleanups; + pendingCleanups = pending; + + /* drop mark file at commit */ + pending = (PendingCleanup *) + MemoryContextAlloc(TopMemoryContext, sizeof(PendingCleanup)); + pending->relnode = rnode; + pending->op = PCOP_UNLINK_MARK; + pending->unlink_forknum = INIT_FORKNUM; + pending->unlink_mark = SMGR_MARK_UNCOMMITTED; + pending->backend = InvalidBackendId; + pending->atCommit = true; + pending->nestLevel = GetCurrentTransactionNestLevel(); + pending->next = pendingCleanups; + pendingCleanups = pending; +} + +/* + * RelationDropInitFork + * Delete physical storage for the init fork of a relation. + * + * Register pending-delete of the init fork. The real deletion is performed by + * smgrDoPendingDeletes at commit. + * + * This function is transactional. If the transaction aborts later on, the + * deletion doesn't happen. + */ +void +RelationDropInitFork(Relation rel) +{ + RelFileNode rnode = rel->rd_node; + PendingCleanup *pending; + PendingCleanup *prev; + PendingCleanup *next; + bool inxact_created = false; + + /* switch buffer persistence */ + SetRelationBuffersPersistence(RelationGetSmgr(rel), true, false); + + /* + * If we have entries for init-fork operations of this relation, that means + * that we have created the init fork in the current transaction. We + * remove the init fork and mark file immediately in that case. Otherwise + * just register pending-delete for the existing init fork. + */ + prev = NULL; + for (pending = pendingCleanups; pending != NULL; pending = next) + { + next = pending->next; + + if (RelFileNodeEquals(rnode, pending->relnode) && + pending->unlink_forknum != INIT_FORKNUM) + { + /* unlink list entry */ + if (prev) + prev->next = next; + else + pendingCleanups = next; + + pfree(pending); + /* prev does not change */ + + inxact_created = true; + } + else + prev = pending; + } + + if (inxact_created) + { + SMgrRelation srel = smgropen(rnode, InvalidBackendId); + + /* + * INIT forks never be loaded to shared buffer so no point in dropping + * buffers for such files. + */ + log_smgrunlinkmark(&rnode, INIT_FORKNUM, SMGR_MARK_UNCOMMITTED); + smgrunlinkmark(srel, INIT_FORKNUM, SMGR_MARK_UNCOMMITTED, false); + log_smgrunlink(&rnode, INIT_FORKNUM); + smgrunlink(srel, INIT_FORKNUM, false); + return; + } + + /* register drop of this init fork file at commit */ + pending = (PendingCleanup *) + MemoryContextAlloc(TopMemoryContext, sizeof(PendingCleanup)); + pending->relnode = rnode; + pending->op = PCOP_UNLINK_FORK; + pending->unlink_forknum = INIT_FORKNUM; + pending->backend = InvalidBackendId; + pending->atCommit = true; + pending->nestLevel = GetCurrentTransactionNestLevel(); + pending->next = pendingCleanups; + pendingCleanups = pending; + + /* revert buffer-persistence changes at abort */ + pending = (PendingCleanup *) + MemoryContextAlloc(TopMemoryContext, sizeof(PendingCleanup)); + pending->relnode = rnode; + pending->op = PCOP_SET_PERSISTENCE; + pending->bufpersistence = false; + pending->backend = InvalidBackendId; + pending->atCommit = false; + pending->nestLevel = GetCurrentTransactionNestLevel(); + pending->next = pendingCleanups; + pendingCleanups = pending; +} + /* * Perform XLogInsert of an XLOG_SMGR_CREATE record to WAL. */ @@ -187,6 +424,88 @@ log_smgrcreate(const RelFileNode *rnode, ForkNumber forkNum) XLogInsert(RM_SMGR_ID, XLOG_SMGR_CREATE | XLR_SPECIAL_REL_UPDATE); } +/* + * Perform XLogInsert of an XLOG_SMGR_UNLINK record to WAL. + */ +void +log_smgrunlink(const RelFileNode *rnode, ForkNumber forkNum) +{ + xl_smgr_unlink xlrec; + + /* + * Make an XLOG entry reporting the file unlink. + */ + xlrec.rnode = *rnode; + xlrec.forkNum = forkNum; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(xlrec)); + XLogInsert(RM_SMGR_ID, XLOG_SMGR_UNLINK | XLR_SPECIAL_REL_UPDATE); +} + +/* + * Perform XLogInsert of an XLOG_SMGR_CREATEMARK record to WAL. + */ +void +log_smgrcreatemark(const RelFileNode *rnode, ForkNumber forkNum, + StorageMarks mark) +{ + xl_smgr_mark xlrec; + + /* + * Make an XLOG entry reporting the file creation. + */ + xlrec.rnode = *rnode; + xlrec.forkNum = forkNum; + xlrec.mark = mark; + xlrec.action = XLOG_SMGR_MARK_CREATE; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(xlrec)); + XLogInsert(RM_SMGR_ID, XLOG_SMGR_MARK | XLR_SPECIAL_REL_UPDATE); +} + +/* + * Perform XLogInsert of an XLOG_SMGR_UNLINKMARK record to WAL. + */ +void +log_smgrunlinkmark(const RelFileNode *rnode, ForkNumber forkNum, + StorageMarks mark) +{ + xl_smgr_mark xlrec; + + /* + * Make an XLOG entry reporting the file creation. + */ + xlrec.rnode = *rnode; + xlrec.forkNum = forkNum; + xlrec.mark = mark; + xlrec.action = XLOG_SMGR_MARK_UNLINK; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(xlrec)); + XLogInsert(RM_SMGR_ID, XLOG_SMGR_MARK | XLR_SPECIAL_REL_UPDATE); +} + +/* + * Perform XLogInsert of an XLOG_SMGR_BUFPERSISTENCE record to WAL. + */ +void +log_smgrbufpersistence(const RelFileNode *rnode, bool persistence) +{ + xl_smgr_bufpersistence xlrec; + + /* + * Make an XLOG entry reporting the change of buffer persistence. + */ + xlrec.rnode = *rnode; + xlrec.persistence = persistence; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(xlrec)); + XLogInsert(RM_SMGR_ID, XLOG_SMGR_BUFPERSISTENCE | XLR_SPECIAL_REL_UPDATE); +} + /* * RelationDropStorage * Schedule unlinking of physical storage at transaction commit. @@ -255,6 +574,7 @@ RelationPreserveStorage(RelFileNode rnode, bool atCommit) prev->next = next; else pendingDeletes = next; + pfree(pending); /* prev does not change */ } @@ -673,6 +993,88 @@ smgrDoPendingDeletes(bool isCommit) } } +/* + * smgrDoPendingUnmark() -- Clean up work that emits WAL records + * + * The operations handled in the function emits WAL records, which must be + * emitted before the commit record for the current transaction. + */ +void +smgrDoPendingCleanups(bool isCommit) +{ + int nestLevel = GetCurrentTransactionNestLevel(); + PendingCleanup *pending; + PendingCleanup *prev; + PendingCleanup *next; + + prev = NULL; + for (pending = pendingCleanups; pending != NULL; pending = next) + { + next = pending->next; + if (pending->nestLevel < nestLevel) + { + /* outer-level entries should not be processed yet */ + prev = pending; + } + else + { + /* unlink list entry first, so we don't retry on failure */ + if (prev) + prev->next = next; + else + pendingCleanups = next; + + /* do cleanup if called for */ + if (pending->atCommit == isCommit) + { + SMgrRelation srel; + + srel = smgropen(pending->relnode, pending->backend); + + Assert ((pending->op & + ~(PCOP_UNLINK_FORK | PCOP_UNLINK_MARK | + PCOP_SET_PERSISTENCE)) == 0); + + if (pending->op & PCOP_UNLINK_FORK) + { + /* other forks needs to drop buffers */ + Assert(pending->unlink_forknum == INIT_FORKNUM); + + /* Don't emit wal while recovery. */ + if (!InRecovery) + log_smgrunlink(&pending->relnode, + pending->unlink_forknum); + smgrunlink(srel, pending->unlink_forknum, false); + } + + if (pending->op & PCOP_UNLINK_MARK) + { + SMgrRelation srel; + + if (!InRecovery) + log_smgrunlinkmark(&pending->relnode, + pending->unlink_forknum, + pending->unlink_mark); + srel = smgropen(pending->relnode, pending->backend); + smgrunlinkmark(srel, pending->unlink_forknum, + pending->unlink_mark, InRecovery); + smgrclose(srel); + } + + if (pending->op & PCOP_SET_PERSISTENCE) + { + SetRelationBuffersPersistence(srel, pending->bufpersistence, + InRecovery); + } + } + + /* must explicitly free the list entry */ + pfree(pending); + /* prev does not change */ + } + } +} + /* * smgrDoPendingSyncs() -- Take care of relation syncs at end of xact. */ @@ -933,6 +1335,15 @@ smgr_redo(XLogReaderState *record) reln = smgropen(xlrec->rnode, InvalidBackendId); smgrcreate(reln, xlrec->forkNum, true); } + else if (info == XLOG_SMGR_UNLINK) + { + xl_smgr_unlink *xlrec = (xl_smgr_unlink *) XLogRecGetData(record); + SMgrRelation reln; + + reln = smgropen(xlrec->rnode, InvalidBackendId); + smgrunlink(reln, xlrec->forkNum, true); + smgrclose(reln); + } else if (info == XLOG_SMGR_TRUNCATE) { xl_smgr_truncate *xlrec = (xl_smgr_truncate *) XLogRecGetData(record); @@ -1021,6 +1432,124 @@ smgr_redo(XLogReaderState *record) FreeFakeRelcacheEntry(rel); } + else if (info == XLOG_SMGR_MARK) + { + xl_smgr_mark *xlrec = (xl_smgr_mark *) XLogRecGetData(record); + SMgrRelation reln; + PendingCleanup *pending; + bool created = false; + + reln = smgropen(xlrec->rnode, InvalidBackendId); + + switch (xlrec->action) + { + case XLOG_SMGR_MARK_CREATE: + smgrcreatemark(reln, xlrec->forkNum, xlrec->mark, true); + created = true; + break; + case XLOG_SMGR_MARK_UNLINK: + smgrunlinkmark(reln, xlrec->forkNum, xlrec->mark, true); + break; + default: + elog(ERROR, "unknown smgr_mark action \"%c\"", xlrec->mark); + } + + if (created) + { + /* revert mark file operation at abort */ + pending = (PendingCleanup *) + MemoryContextAlloc(TopMemoryContext, sizeof(PendingCleanup)); + pending->relnode = xlrec->rnode; + pending->op = PCOP_UNLINK_MARK; + pending->unlink_forknum = xlrec->forkNum; + pending->unlink_mark = xlrec->mark; + pending->backend = InvalidBackendId; + pending->atCommit = false; + pending->nestLevel = GetCurrentTransactionNestLevel(); + pending->next = pendingCleanups; + pendingCleanups = pending; + } + else + { + /* + * Delete pending action for this mark file if any. We should have + * at most one entry for this action. + */ + PendingCleanup *prev = NULL; + + for (pending = pendingCleanups; pending != NULL; + pending = pending->next) + { + if (RelFileNodeEquals(xlrec->rnode, pending->relnode) && + pending->unlink_forknum == xlrec->forkNum && + (pending->op & PCOP_UNLINK_MARK) != 0) + { + if (prev) + prev->next = pending->next; + else + pendingCleanups = pending->next; + + pfree(pending); + break; + } + + prev = pending; + } + } + } + else if (info == XLOG_SMGR_BUFPERSISTENCE) + { + xl_smgr_bufpersistence *xlrec = + (xl_smgr_bufpersistence *) XLogRecGetData(record); + SMgrRelation reln; + PendingCleanup *pending; + PendingCleanup *prev = NULL; + + reln = smgropen(xlrec->rnode, InvalidBackendId); + SetRelationBuffersPersistence(reln, xlrec->persistence, true); + + /* + * Delete pending action for persistence change if any. We should have + * at most one entry for this action. + */ + for (pending = pendingCleanups; pending != NULL; + pending = pending->next) + { + if (RelFileNodeEquals(xlrec->rnode, pending->relnode) && + (pending->op & PCOP_SET_PERSISTENCE) != 0) + { + Assert (pending->bufpersistence == xlrec->persistence); + + if (prev) + prev->next = pending->next; + else + pendingCleanups = pending->next; + + pfree(pending); + break; + } + + prev = pending; + } + + /* + * Revert buffer-persistence changes at abort if the relation is going + * to different persistence from before this transaction. + */ + if (!pending) + { + pending = (PendingCleanup *) + MemoryContextAlloc(TopMemoryContext, sizeof(PendingCleanup)); + pending->relnode = xlrec->rnode; + pending->op = PCOP_SET_PERSISTENCE; + pending->bufpersistence = !xlrec->persistence; + pending->backend = InvalidBackendId; + pending->atCommit = false; + pending->nestLevel = GetCurrentTransactionNestLevel(); + pending->next = pendingCleanups; + pendingCleanups = pending; + } + } else elog(PANIC, "smgr_redo: unknown op code %u", info); } diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 1f0654c2f5..9e673ba68f 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -52,6 +52,7 @@ #include "commands/defrem.h" #include "commands/event_trigger.h" #include "commands/policy.h" +#include "commands/progress.h" #include "commands/sequence.h" #include "commands/tablecmds.h" #include "commands/tablespace.h" @@ -5346,6 +5347,187 @@ ATParseTransformCmd(List **wqueue, AlteredTableInfo *tab, Relation rel, return newcmd; } +/* + * RelationChangePersistence: do in-place persistence change of a relation + */ +static void +RelationChangePersistence(AlteredTableInfo *tab, char persistence, + LOCKMODE lockmode) +{ + Relation rel; + Relation classRel; + HeapTuple tuple, + newtuple; + Datum new_val[Natts_pg_class]; + bool new_null[Natts_pg_class], + new_repl[Natts_pg_class]; + int i; + List *relids; + ListCell *lc_oid; + + Assert(tab->rewrite == AT_REWRITE_ALTER_PERSISTENCE); + Assert(lockmode == AccessExclusiveLock); + + /* + * Under the following condition, we need to call ATRewriteTable, which + * cannot be false in the AT_REWRITE_ALTER_PERSISTENCE case. + */ + Assert(tab->constraints == NULL && tab->partition_constraint == NULL && + tab->newvals == NULL && !tab->verify_new_notnull); + + rel = table_open(tab->relid, lockmode); + + Assert(rel->rd_rel->relpersistence != persistence); + + elog(DEBUG1, "perform in-place persistnce change"); + + /* + * First we collect all relations that we need to change persistence. + */ + + /* Collect OIDs of indexes and toast relations */ + relids = RelationGetIndexList(rel); + relids = lcons_oid(rel->rd_id, relids); + + /* Add toast relation if any */ + if (OidIsValid(rel->rd_rel->reltoastrelid)) + { + List *toastidx; + Relation toastrel = table_open(rel->rd_rel->reltoastrelid, lockmode); + + relids = lappend_oid(relids, rel->rd_rel->reltoastrelid); + toastidx = RelationGetIndexList(toastrel); + relids = list_concat(relids, toastidx); + pfree(toastidx); + table_close(toastrel, NoLock); + } + + table_close(rel, NoLock); + + /* Make changes in storage */ + classRel = table_open(RelationRelationId, RowExclusiveLock); + + foreach (lc_oid, relids) + { + Oid reloid = lfirst_oid(lc_oid); + Relation r = relation_open(reloid, lockmode); + + /* + * XXXX: Some access methods do not bear up an in-place persistence + * change. Specifically, GiST uses page LSNs to figure out whether a + * block has changed, where UNLOGGED GiST indexes use fake LSNs that + * are incompatible with real LSNs used for LOGGED ones. + * + * Maybe if gistGetFakeLSN behaved the same way for permanent and + * unlogged indexes, we could skip index rebuild in exchange of some + * extra WAL records emitted while it is unlogged. + * + * Check relam against a positive list so that we take this way for + * unknown AMs. + */ + if (r->rd_rel->relkind == RELKIND_INDEX && + /* GiST is excluded */ + r->rd_rel->relam != BTREE_AM_OID && + r->rd_rel->relam != HASH_AM_OID && + r->rd_rel->relam != GIN_AM_OID && + r->rd_rel->relam != SPGIST_AM_OID && + r->rd_rel->relam != BRIN_AM_OID) + { + int reindex_flags; + ReindexParams params = {0}; + + /* reindex doesn't allow concurrent use of the index */ + table_close(r, NoLock); + + reindex_flags = + REINDEX_REL_SUPPRESS_INDEX_USE | + REINDEX_REL_CHECK_CONSTRAINTS; + + /* Set the same persistence with the parent relation. */ + if (persistence == RELPERSISTENCE_UNLOGGED) + reindex_flags |= REINDEX_REL_FORCE_INDEXES_UNLOGGED; + else + reindex_flags |= REINDEX_REL_FORCE_INDEXES_PERMANENT; + + reindex_index(reloid, reindex_flags, persistence, ¶ms); + + continue; + } + + /* Create or drop init fork */ + if (persistence == RELPERSISTENCE_UNLOGGED) + RelationCreateInitFork(r); + else + RelationDropInitFork(r); + + /* + * When this relation gets WAL-logged, immediately sync all files but + * initfork to establish the initial state on storage. Buffers have + * already flushed out by RelationCreate(Drop)InitFork called just + * above. Initfork should have been synced as needed. + */ + if (persistence == RELPERSISTENCE_PERMANENT) + { + for (i = 0 ; i < INIT_FORKNUM ; i++) + { + if (smgrexists(RelationGetSmgr(r), i)) + smgrimmedsync(RelationGetSmgr(r), i); + } + } + + /* Update catalog */ + tuple = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(reloid)); + if (!HeapTupleIsValid(tuple)) + elog(ERROR, "cache lookup failed for relation %u", reloid); + + memset(new_val, 0, sizeof(new_val)); + memset(new_null, false, sizeof(new_null)); + memset(new_repl, false, sizeof(new_repl)); + + new_val[Anum_pg_class_relpersistence - 1] = CharGetDatum(persistence); + new_null[Anum_pg_class_relpersistence - 1] = false; + new_repl[Anum_pg_class_relpersistence - 1] = true; + + newtuple = heap_modify_tuple(tuple, RelationGetDescr(classRel), + new_val, new_null, new_repl); + + CatalogTupleUpdate(classRel, &newtuple->t_self, newtuple); + heap_freetuple(newtuple); + + /* + * While wal_level >= replica, switching to LOGGED requires the + * relation content to be WAL-logged to recover the table. + * We don't emit this fhile wal_level = minimal. + */ + if (persistence == RELPERSISTENCE_PERMANENT && XLogIsNeeded()) + { + ForkNumber fork; + xl_smgr_truncate xlrec; + + xlrec.blkno = 0; + xlrec.rnode = r->rd_node; + xlrec.flags = SMGR_TRUNCATE_ALL; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(xlrec)); + + XLogInsert(RM_SMGR_ID, XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE); + + for (fork = 0; fork < INIT_FORKNUM ; fork++) + { + if (smgrexists(RelationGetSmgr(r), fork)) + log_newpage_range(r, fork, 0, + smgrnblocks(RelationGetSmgr(r), fork), + false); + } + } + + table_close(r, NoLock); + } + + table_close(classRel, NoLock); +} + /* * ATRewriteTables: ALTER TABLE phase 3 */ @@ -5476,47 +5658,55 @@ ATRewriteTables(AlterTableStmt *parsetree, List **wqueue, LOCKMODE lockmode, tab->relid, tab->rewrite); - /* - * Create transient table that will receive the modified data. - * - * Ensure it is marked correctly as logged or unlogged. We have - * to do this here so that buffers for the new relfilenode will - * have the right persistence set, and at the same time ensure - * that the original filenode's buffers will get read in with the - * correct setting (i.e. the original one). Otherwise a rollback - * after the rewrite would possibly result with buffers for the - * original filenode having the wrong persistence setting. - * - * NB: This relies on swap_relation_files() also swapping the - * persistence. That wouldn't work for pg_class, but that can't be - * unlogged anyway. - */ - OIDNewHeap = make_new_heap(tab->relid, NewTableSpace, NewAccessMethod, - persistence, lockmode); + if (tab->rewrite == AT_REWRITE_ALTER_PERSISTENCE) + RelationChangePersistence(tab, persistence, lockmode); + else + { + /* + * Create transient table that will receive the modified data. + * + * Ensure it is marked correctly as logged or unlogged. We + * have to do this here so that buffers for the new relfilenode + * will have the right persistence set, and at the same time + * ensure that the original filenode's buffers will get read in + * with the correct setting (i.e. the original one). Otherwise + * a rollback after the rewrite would possibly result with + * buffers for the original filenode having the wrong + * persistence setting. + * + * NB: This relies on swap_relation_files() also swapping the + * persistence. That wouldn't work for pg_class, but that can't + * be unlogged anyway. + */ + OIDNewHeap = make_new_heap(tab->relid, NewTableSpace, + NewAccessMethod, + persistence, lockmode); - /* - * Copy the heap data into the new table with the desired - * modifications, and test the current data within the table - * against new constraints generated by ALTER TABLE commands. - */ - ATRewriteTable(tab, OIDNewHeap, lockmode); + /* + * Copy the heap data into the new table with the desired + * modifications, and test the current data within the table + * against new constraints generated by ALTER TABLE commands. + */ + ATRewriteTable(tab, OIDNewHeap, lockmode); - /* - * Swap the physical files of the old and new heaps, then rebuild - * indexes and discard the old heap. We can use RecentXmin for - * the table's new relfrozenxid because we rewrote all the tuples - * in ATRewriteTable, so no older Xid remains in the table. Also, - * we never try to swap toast tables by content, since we have no - * interest in letting this code work on system catalogs. - */ - finish_heap_swap(tab->relid, OIDNewHeap, - false, false, true, - !OidIsValid(tab->newTableSpace), - RecentXmin, - ReadNextMultiXactId(), - persistence); + /* + * Swap the physical files of the old and new heaps, then + * rebuild indexes and discard the old heap. We can use + * RecentXmin for the table's new relfrozenxid because we + * rewrote all the tuples in ATRewriteTable, so no older Xid + * remains in the table. Also, we never try to swap toast + * tables by content, since we have no interest in letting this + * code work on system catalogs. + */ + finish_heap_swap(tab->relid, OIDNewHeap, + false, false, true, + !OidIsValid(tab->newTableSpace), + RecentXmin, + ReadNextMultiXactId(), + persistence); - InvokeObjectPostAlterHook(RelationRelationId, tab->relid, 0); + InvokeObjectPostAlterHook(RelationRelationId, tab->relid, 0); + } } else { diff --git a/src/backend/replication/basebackup.c b/src/backend/replication/basebackup.c index 3afbbe7e02..3f16b5f58c 100644 --- a/src/backend/replication/basebackup.c +++ b/src/backend/replication/basebackup.c @@ -1102,6 +1102,7 @@ sendDir(bbsink *sink, const char *path, int basepathlen, bool sizeonly, bool excludeFound; ForkNumber relForkNum; /* Type of fork if file is a relation */ int relOidChars; /* Chars in filename that are the rel oid */ + StorageMarks mark; /* Skip special stuff */ if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0) @@ -1152,7 +1153,7 @@ sendDir(bbsink *sink, const char *path, int basepathlen, bool sizeonly, /* Exclude all forks for unlogged tables except the init fork */ if (isDbDir && parse_filename_for_nontemp_relation(de->d_name, &relOidChars, - &relForkNum)) + &relForkNum, &mark)) { /* Never exclude init forks */ if (relForkNum != INIT_FORKNUM) diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index a2512e750c..6384b4efbe 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -37,6 +37,7 @@ #include "access/xlogutils.h" #include "catalog/catalog.h" #include "catalog/storage.h" +#include "catalog/storage_xlog.h" #include "executor/instrument.h" #include "lib/binaryheap.h" #include "miscadmin.h" @@ -3154,6 +3155,93 @@ DropRelFileNodeBuffers(SMgrRelation smgr_reln, ForkNumber *forkNum, } } +/* --------------------------------------------------------------------- + * SetRelFileNodeBuffersPersistence + * + * This function changes the persistence of all buffer pages of a relation + * then writes all dirty pages of the relation out to disk when switching + * to PERMANENT. (or more accurately, out to kernel disk buffers), + * ensuring that the kernel has an up-to-date view of the relation. + * + * Generally, the caller should be holding AccessExclusiveLock on the + * target relation to ensure that no other backend is busy dirtying + * more blocks of the relation; the effects can't be expected to last + * after the lock is released. + * + * XXX currently it sequentially searches the buffer pool, should be + * changed to more clever ways of searching. This routine is not + * used in any performance-critical code paths, so it's not worth + * adding additional overhead to normal paths to make it go faster; + * but see also DropRelFileNodeBuffers. + * -------------------------------------------------------------------- + */ +void +SetRelationBuffersPersistence(SMgrRelation srel, bool permanent, bool isRedo) +{ + int i; + RelFileNodeBackend rnode = srel->smgr_rnode; + + Assert (!RelFileNodeBackendIsTemp(rnode)); + + if (!isRedo) + log_smgrbufpersistence(&srel->smgr_rnode.node, permanent); + + ResourceOwnerEnlargeBuffers(CurrentResourceOwner); + + for (i = 0; i < NBuffers; i++) + { + BufferDesc *bufHdr = GetBufferDescriptor(i); + uint32 buf_state; + + if (!RelFileNodeEquals(bufHdr->tag.rnode, rnode.node)) + continue; + + ReservePrivateRefCountEntry(); + + buf_state = LockBufHdr(bufHdr); + + if (!RelFileNodeEquals(bufHdr->tag.rnode, rnode.node)) + { + UnlockBufHdr(bufHdr, buf_state); + continue; + } + + if (permanent) + { + /* Init fork is being dropped, drop buffers for it. */ + if (bufHdr->tag.forkNum == INIT_FORKNUM) + { + InvalidateBuffer(bufHdr); + continue; + } + + buf_state |= BM_PERMANENT; + pg_atomic_write_u32(&bufHdr->state, buf_state); + + /* we flush this buffer when switching to PERMANENT */ + if ((buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY)) + { + PinBuffer_Locked(bufHdr); + LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), + LW_SHARED); + FlushBuffer(bufHdr, srel); + LWLockRelease(BufferDescriptorGetContentLock(bufHdr)); + UnpinBuffer(bufHdr, true); + } + else + UnlockBufHdr(bufHdr, buf_state); + } + else + { + /* init fork is always BM_PERMANENT. See BufferAlloc */ + if (bufHdr->tag.forkNum != INIT_FORKNUM) + buf_state &= ~BM_PERMANENT; + + UnlockBufHdr(bufHdr, buf_state); + } + } +} + /* --------------------------------------------------------------------- * DropRelFileNodesAllBuffers * diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index 14b77f2861..2fc9f17c28 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -349,8 +349,6 @@ static void pre_sync_fname(const char *fname, bool isdir, int elevel); static void datadir_fsync_fname(const char *fname, bool isdir, int elevel); static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel); -static int fsync_parent_path(const char *fname, int elevel); - /* * pg_fsync --- do fsync with or without writethrough @@ -3759,7 +3757,7 @@ fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel) * This is aimed at making file operations persistent on disk in case of * an OS crash or power failure. */ -static int +int fsync_parent_path(const char *fname, int elevel) { char parentpath[MAXPGPATH]; diff --git a/src/backend/storage/file/reinit.c b/src/backend/storage/file/reinit.c index f053fe0495..1124e95d0d 100644 --- a/src/backend/storage/file/reinit.c +++ b/src/backend/storage/file/reinit.c @@ -16,29 +16,49 @@ #include <unistd.h> +#include "access/xlog.h" +#include "catalog/pg_tablespace_d.h" #include "common/relpath.h" #include "postmaster/startup.h" +#include "storage/bufmgr.h" #include "storage/copydir.h" #include "storage/fd.h" +#include "storage/md.h" #include "storage/reinit.h" +#include "storage/smgr.h" #include "utils/hsearch.h" #include "utils/memutils.h" static void ResetUnloggedRelationsInTablespaceDir(const char *tsdirname, - int op); + Oid tspid, int op); static void ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, - int op); + Oid tspid, Oid dbid, int op); typedef struct { Oid reloid; /* hash key */ -} unlogged_relation_entry; + bool has_init; /* has INIT fork */ + bool dirty_init; /* needs to remove INIT fork */ + bool dirty_all; /* needs to remove all forks */ +} relfile_entry; /* - * Reset unlogged relations from before the last restart. + * Clean up and reset relation files from before the last restart. * - * If op includes UNLOGGED_RELATION_CLEANUP, we remove all forks of any - * relation with an "init" fork, except for the "init" fork itself. + * If op includes UNLOGGED_RELATION_CLEANUP, we perform different operations + * depending on the existence of the "cleanup" forks. + * + * If SMGR_MARK_UNCOMMITTED mark file for init fork is present, we remove the + * init fork along with the mark file. + * + * If SMGR_MARK_UNCOMMITTED mark file for main fork is present we remove the + * whole relation along with the mark file. + * + * Otherwise, if the "init" fork is found. we remove all forks of any relation + * with the "init" fork, except for the "init" fork itself. + * + * If op includes UNLOGGED_RELATION_DROP_BUFFER, we drop all buffers for all + * relations that have the "cleanup" and/or the "init" forks. * * If op includes UNLOGGED_RELATION_INIT, we copy the "init" fork to the main * fork. @@ -72,7 +92,7 @@ ResetUnloggedRelations(int op) /* * First process unlogged files in pg_default ($PGDATA/base) */ - ResetUnloggedRelationsInTablespaceDir("base", op); + ResetUnloggedRelationsInTablespaceDir("base", DEFAULTTABLESPACE_OID, op); /* * Cycle through directories for all non-default tablespaces. @@ -81,13 +101,19 @@ ResetUnloggedRelations(int op) while ((spc_de = ReadDir(spc_dir, "pg_tblspc")) != NULL) { + Oid tspid; + if (strcmp(spc_de->d_name, ".") == 0 || strcmp(spc_de->d_name, "..") == 0) continue; snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s", spc_de->d_name, TABLESPACE_VERSION_DIRECTORY); - ResetUnloggedRelationsInTablespaceDir(temp_path, op); + + tspid = atooid(spc_de->d_name); + + Assert(tspid != 0); + ResetUnloggedRelationsInTablespaceDir(temp_path, tspid, op); } FreeDir(spc_dir); @@ -103,7 +129,8 @@ ResetUnloggedRelations(int op) * Process one tablespace directory for ResetUnloggedRelations */ static void -ResetUnloggedRelationsInTablespaceDir(const char *tsdirname, int op) +ResetUnloggedRelationsInTablespaceDir(const char *tsdirname, + Oid tspid, int op) { DIR *ts_dir; struct dirent *de; @@ -130,6 +157,8 @@ ResetUnloggedRelationsInTablespaceDir(const char *tsdirname, int op) while ((de = ReadDir(ts_dir, tsdirname)) != NULL) { + Oid dbid; + /* * We're only interested in the per-database directories, which have * numeric names. Note that this code will also (properly) ignore "." @@ -148,7 +177,10 @@ ResetUnloggedRelationsInTablespaceDir(const char *tsdirname, int op) ereport_startup_progress("resetting unlogged relations (cleanup), elapsed time: %ld.%02d s, current path: %s", dbspace_path); - ResetUnloggedRelationsInDbspaceDir(dbspace_path, op); + dbid = atooid(de->d_name); + Assert(dbid != 0); + + ResetUnloggedRelationsInDbspaceDir(dbspace_path, tspid, dbid, op); } FreeDir(ts_dir); @@ -158,125 +190,227 @@ ResetUnloggedRelationsInTablespaceDir(const char *tsdirname, int op) * Process one per-dbspace directory for ResetUnloggedRelations */ static void -ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op) +ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, + Oid tspid, Oid dbid, int op) { DIR *dbspace_dir; struct dirent *de; char rm_path[MAXPGPATH * 2]; + HTAB *hash; + HASHCTL ctl; /* Caller must specify at least one operation. */ - Assert((op & (UNLOGGED_RELATION_CLEANUP | UNLOGGED_RELATION_INIT)) != 0); + Assert((op & (UNLOGGED_RELATION_CLEANUP | + UNLOGGED_RELATION_DROP_BUFFER | + UNLOGGED_RELATION_INIT)) != 0); /* * Cleanup is a two-pass operation. First, we go through and identify all * the files with init forks. Then, we go through again and nuke * everything with the same OID except the init fork. */ + + /* + * It's possible that someone could create tons of unlogged relations in + * the same database & tablespace, so we'd better use a hash table rather + * than an array or linked list to keep track of which files need to be + * reset. Otherwise, this cleanup operation would be O(n^2). + */ + memset(&ctl, 0, sizeof(ctl)); + ctl.keysize = sizeof(Oid); + ctl.entrysize = sizeof(relfile_entry); + hash = hash_create("relfilenode cleanup hash", + 32, &ctl, HASH_ELEM | HASH_BLOBS); + + /* Collect INIT fork and mark files in the directory. */ + dbspace_dir = AllocateDir(dbspacedirname); + while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL) + { + int oidchars; + ForkNumber forkNum; + StorageMarks mark; + + /* Skip anything that doesn't look like a relation data file. */ + if (!parse_filename_for_nontemp_relation(de->d_name, &oidchars, + &forkNum, &mark)) + continue; + + if (forkNum == INIT_FORKNUM || mark == SMGR_MARK_UNCOMMITTED) + { + Oid key; + relfile_entry *ent; + bool found; + + /* + * Record the relfilenode information. If it has + * SMGR_MARK_UNCOMMITTED mark files, the relfilenode is in dirty + * state, where clean up is needed. + */ + key = atooid(de->d_name); + ent = hash_search(hash, &key, HASH_ENTER, &found); + + if (!found) + { + ent->has_init = false; + ent->dirty_init = false; + ent->dirty_all = false; + } + + if (forkNum == INIT_FORKNUM && mark == SMGR_MARK_UNCOMMITTED) + ent->dirty_init = true; + else if (forkNum == MAIN_FORKNUM && mark == SMGR_MARK_UNCOMMITTED) + ent->dirty_all = true; + else + { + Assert(forkNum == INIT_FORKNUM); + ent->has_init = true; + } + } + } + + /* Done with the first pass. */ + FreeDir(dbspace_dir); + + /* nothing to do if we don't have init nor cleanup forks */ + if (hash_get_num_entries(hash) < 1) + { + hash_destroy(hash); + return; + } + + if ((op & UNLOGGED_RELATION_DROP_BUFFER) != 0) + { + /* + * When we come here after recovery, smgr object for this file might + * have been created. In that case we need to drop all buffers then the + * smgr object before initializing the unlogged relation. This is safe + * as far as no other backends have accessed the relation before + * starting archive recovery. + */ + HASH_SEQ_STATUS status; + relfile_entry *ent; + SMgrRelation *srels = palloc(sizeof(SMgrRelation) * 8); + int maxrels = 8; + int nrels = 0; + int i; + + Assert(!HotStandbyActive()); + + hash_seq_init(&status, hash); + while((ent = (relfile_entry *) hash_seq_search(&status)) != NULL) + { + RelFileNodeBackend rel; + + /* + * The relation is persistent and stays remain persistent. Don't + * drop the buffers for this relation. + */ + if (ent->has_init && ent->dirty_init) + continue; + + if (maxrels <= nrels) + { + maxrels *= 2; + srels = repalloc(srels, sizeof(SMgrRelation) * maxrels); + } + + rel.backend = InvalidBackendId; + rel.node.spcNode = tspid; + rel.node.dbNode = dbid; + rel.node.relNode = ent->reloid; + + srels[nrels++] = smgropen(rel.node, InvalidBackendId); + } + + DropRelFileNodesAllBuffers(srels, nrels); + + for (i = 0 ; i < nrels ; i++) + smgrclose(srels[i]); + } + + /* + * Now, make a second pass and remove anything that matches. + */ if ((op & UNLOGGED_RELATION_CLEANUP) != 0) { - HTAB *hash; - HASHCTL ctl; - - /* - * It's possible that someone could create a ton of unlogged relations - * in the same database & tablespace, so we'd better use a hash table - * rather than an array or linked list to keep track of which files - * need to be reset. Otherwise, this cleanup operation would be - * O(n^2). - */ - ctl.keysize = sizeof(Oid); - ctl.entrysize = sizeof(unlogged_relation_entry); - ctl.hcxt = CurrentMemoryContext; - hash = hash_create("unlogged relation OIDs", 32, &ctl, - HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); - - /* Scan the directory. */ dbspace_dir = AllocateDir(dbspacedirname); while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL) { - ForkNumber forkNum; - int oidchars; - unlogged_relation_entry ent; + ForkNumber forkNum; + StorageMarks mark; + int oidchars; + Oid key; + relfile_entry *ent; + RelFileNodeBackend rel; /* Skip anything that doesn't look like a relation data file. */ if (!parse_filename_for_nontemp_relation(de->d_name, &oidchars, - &forkNum)) - continue; - - /* Also skip it unless this is the init fork. */ - if (forkNum != INIT_FORKNUM) - continue; - - /* - * Put the OID portion of the name into the hash table, if it - * isn't already. - */ - ent.reloid = atooid(de->d_name); - (void) hash_search(hash, &ent, HASH_ENTER, NULL); - } - - /* Done with the first pass. */ - FreeDir(dbspace_dir); - - /* - * If we didn't find any init forks, there's no point in continuing; - * we can bail out now. - */ - if (hash_get_num_entries(hash) == 0) - { - hash_destroy(hash); - return; - } - - /* - * Now, make a second pass and remove anything that matches. - */ - dbspace_dir = AllocateDir(dbspacedirname); - while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL) - { - ForkNumber forkNum; - int oidchars; - unlogged_relation_entry ent; - - /* Skip anything that doesn't look like a relation data file. */ - if (!parse_filename_for_nontemp_relation(de->d_name, &oidchars, - &forkNum)) - continue; - - /* We never remove the init fork. */ - if (forkNum == INIT_FORKNUM) + &forkNum, &mark)) continue; /* * See whether the OID portion of the name shows up in the hash * table. If so, nuke it! */ - ent.reloid = atooid(de->d_name); - if (hash_search(hash, &ent, HASH_FIND, NULL)) + key = atooid(de->d_name); + ent = hash_search(hash, &key, HASH_FIND, NULL); + + if (!ent) + continue; + + if (!ent->dirty_all) { - snprintf(rm_path, sizeof(rm_path), "%s/%s", - dbspacedirname, de->d_name); - if (unlink(rm_path) < 0) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not remove file \"%s\": %m", - rm_path))); + /* clean permanent relations don't need cleanup */ + if (!ent->has_init) + continue; + + if (ent->dirty_init) + { + /* + * The crashed trasaction did SET UNLOGGED. This relation + * is restored to a LOGGED relation. + */ + if (forkNum != INIT_FORKNUM) + continue; + } else - elog(DEBUG2, "unlinked file \"%s\"", rm_path); + { + /* + * we don't remove the INIT fork of a non-dirty + * relfilenode + */ + if (forkNum == INIT_FORKNUM && mark == SMGR_MARK_NONE) + continue; + } } + + /* so, nuke it! */ + snprintf(rm_path, sizeof(rm_path), "%s/%s", + dbspacedirname, de->d_name); + if (unlink(rm_path) < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not remove file \"%s\": %m", + rm_path))); + + rel.backend = InvalidBackendId; + rel.node.spcNode = tspid; + rel.node.dbNode = dbid; + rel.node.relNode = atooid(de->d_name); + + ForgetRelationForkSyncRequests(rel, forkNum); } /* Cleanup is complete. */ FreeDir(dbspace_dir); - hash_destroy(hash); } + hash_destroy(hash); + hash = NULL; + /* * Initialization happens after cleanup is complete: we copy each init - * fork file to the corresponding main fork file. Note that if we are - * asked to do both cleanup and init, we may never get here: if the - * cleanup code determines that there are no init forks in this dbspace, - * it will return before we get to this point. + * fork file to the corresponding main fork file. */ if ((op & UNLOGGED_RELATION_INIT) != 0) { @@ -285,6 +419,7 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op) while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL) { ForkNumber forkNum; + StorageMarks mark; int oidchars; char oidbuf[OIDCHARS + 1]; char srcpath[MAXPGPATH * 2]; @@ -292,9 +427,11 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op) /* Skip anything that doesn't look like a relation data file. */ if (!parse_filename_for_nontemp_relation(de->d_name, &oidchars, - &forkNum)) + &forkNum, &mark)) continue; + Assert(mark == SMGR_MARK_NONE); + /* Also skip it unless this is the init fork. */ if (forkNum != INIT_FORKNUM) continue; @@ -328,15 +465,18 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op) while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL) { ForkNumber forkNum; + StorageMarks mark; int oidchars; char oidbuf[OIDCHARS + 1]; char mainpath[MAXPGPATH]; /* Skip anything that doesn't look like a relation data file. */ if (!parse_filename_for_nontemp_relation(de->d_name, &oidchars, - &forkNum)) + &forkNum, &mark)) continue; + Assert(mark == SMGR_MARK_NONE); + /* Also skip it unless this is the init fork. */ if (forkNum != INIT_FORKNUM) continue; @@ -379,7 +519,7 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op) */ bool parse_filename_for_nontemp_relation(const char *name, int *oidchars, - ForkNumber *fork) + ForkNumber *fork, StorageMarks *mark) { int pos; @@ -410,11 +550,19 @@ parse_filename_for_nontemp_relation(const char *name, int *oidchars, for (segchar = 1; isdigit((unsigned char) name[pos + segchar]); ++segchar) ; - if (segchar <= 1) - return false; - pos += segchar; + if (segchar > 1) + pos += segchar; } + /* mark file? */ + if (name[pos] == '.' && name[pos + 1] != 0) + { + *mark = name[pos + 1]; + pos += 2; + } + else + *mark = SMGR_MARK_NONE; + /* Now we should be at the end. */ if (name[pos] != '\0') return false; diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index d26c915f90..007efe68a5 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -139,7 +139,8 @@ static MdfdVec *_mdfd_getseg(SMgrRelation reln, ForkNumber forkno, BlockNumber blkno, bool skipFsync, int behavior); static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg); - +static bool mdmarkexists(SMgrRelation reln, ForkNumber forkNum, + StorageMarks mark); /* * mdinit() -- Initialize private state for magnetic disk storage manager. @@ -169,6 +170,82 @@ mdexists(SMgrRelation reln, ForkNumber forkNum) return (mdopenfork(reln, forkNum, EXTENSION_RETURN_NULL) != NULL); } +/* + * mdcreatemark() -- Create a mark file. + * + * If isRedo is true, it's okay for the file to exist already. + */ +void +mdcreatemark(SMgrRelation reln, ForkNumber forkNum, StorageMarks mark, + bool isRedo) +{ + char *path =markpath(reln->smgr_rnode, forkNum, mark); + int fd; + + /* See mdcreate for details.. */ + TablespaceCreateDbspace(reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + isRedo); + + fd = BasicOpenFile(path, O_WRONLY | O_CREAT | O_EXCL); + if (fd < 0 && (!isRedo || errno != EEXIST)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not crete mark file \"%s\": %m", path))); + + pg_fsync(fd); + close(fd); + + /* + * To guarantee that the creation of the file is persistent, fsync its + * parent directory. + */ + fsync_parent_path(path, ERROR); + + pfree(path); +} + + +/* + * mdunlinkmark() -- Delete the mark file + * + * If isRedo is true, it's okay for the file being not found. + */ +void +mdunlinkmark(SMgrRelation reln, ForkNumber forkNum, StorageMarks mark, + bool isRedo) +{ + char *path = markpath(reln->smgr_rnode, forkNum, mark); + + if (!isRedo || mdmarkexists(reln, forkNum, mark)) + durable_unlink(path, ERROR); + + pfree(path); +} + +/* + * mdmarkexists() -- Check if the file exists. + */ +static bool +mdmarkexists(SMgrRelation reln, ForkNumber forkNum, StorageMarks mark) +{ + char *path = markpath(reln->smgr_rnode, forkNum, mark); + int fd; + + fd = BasicOpenFile(path, O_RDONLY); + if (fd < 0 && errno != ENOENT) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not access mark file \"%s\": %m", path))); + pfree(path); + + if (fd < 0) + return false; + + close(fd); + return true; +} + /* * mdcreate() -- Create a new relation on magnetic disk. * @@ -1025,6 +1102,15 @@ register_forget_request(RelFileNodeBackend rnode, ForkNumber forknum, RegisterSyncRequest(&tag, SYNC_FORGET_REQUEST, true /* retryOnError */ ); } +/* + * ForgetRelationForkSyncRequests -- forget any fsyncs and unlinks for a fork + */ +void +ForgetRelationForkSyncRequests(RelFileNodeBackend rnode, ForkNumber forknum) +{ + register_forget_request(rnode, forknum, 0); +} + /* * ForgetDatabaseSyncRequests -- forget any fsyncs and unlinks for a DB */ @@ -1378,12 +1464,14 @@ mdsyncfiletag(const FileTag *ftag, char *path) * Return 0 on success, -1 on failure, with errno set. */ int -mdunlinkfiletag(const FileTag *ftag, char *path) +mdunlinkfiletag(const FileTag *ftag, char *path, StorageMarks mark) { char *p; /* Compute the path. */ - p = relpathperm(ftag->rnode, MAIN_FORKNUM); + p = GetRelationPath(ftag->rnode.dbNode, ftag->rnode.spcNode, + ftag->rnode.relNode, InvalidBackendId, MAIN_FORKNUM, + mark); strlcpy(path, p, MAXPGPATH); pfree(p); diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index eb701dce57..4819b5c404 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -62,6 +62,10 @@ typedef struct f_smgr void (*smgr_truncate) (SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks); void (*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum); + void (*smgr_createmark) (SMgrRelation reln, ForkNumber forknum, + StorageMarks mark, bool isRedo); + void (*smgr_unlinkmark) (SMgrRelation reln, ForkNumber forknum, + StorageMarks mark, bool isRedo); } f_smgr; static const f_smgr smgrsw[] = { @@ -82,6 +86,8 @@ static const f_smgr smgrsw[] = { .smgr_nblocks = mdnblocks, .smgr_truncate = mdtruncate, .smgr_immedsync = mdimmedsync, + .smgr_createmark = mdcreatemark, + .smgr_unlinkmark = mdunlinkmark, } }; @@ -335,6 +341,26 @@ smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo) smgrsw[reln->smgr_which].smgr_create(reln, forknum, isRedo); } +/* + * smgrcreatemark() -- Create a mark file + */ +void +smgrcreatemark(SMgrRelation reln, ForkNumber forknum, StorageMarks mark, + bool isRedo) +{ + smgrsw[reln->smgr_which].smgr_createmark(reln, forknum, mark, isRedo); +} + +/* + * smgrunlinkmark() -- Delete a mark file + */ +void +smgrunlinkmark(SMgrRelation reln, ForkNumber forknum, StorageMarks mark, + bool isRedo) +{ + smgrsw[reln->smgr_which].smgr_unlinkmark(reln, forknum, mark, isRedo); +} + /* * smgrdosyncall() -- Immediately sync all forks of all given relations * @@ -662,6 +688,12 @@ smgrimmedsync(SMgrRelation reln, ForkNumber forknum) smgrsw[reln->smgr_which].smgr_immedsync(reln, forknum); } +void +smgrunlink(SMgrRelation reln, ForkNumber forknum, bool isRedo) +{ + smgrsw[reln->smgr_which].smgr_unlink(reln->smgr_rnode, forknum, isRedo); +} + /* * AtEOXact_SMgr * diff --git a/src/backend/storage/sync/sync.c b/src/backend/storage/sync/sync.c index 11fa17ddea..ddc344dad2 100644 --- a/src/backend/storage/sync/sync.c +++ b/src/backend/storage/sync/sync.c @@ -89,7 +89,8 @@ static CycleCtr checkpoint_cycle_ctr = 0; typedef struct SyncOps { int (*sync_syncfiletag) (const FileTag *ftag, char *path); - int (*sync_unlinkfiletag) (const FileTag *ftag, char *path); + int (*sync_unlinkfiletag) (const FileTag *ftag, char *path, + StorageMarks mark); bool (*sync_filetagmatches) (const FileTag *ftag, const FileTag *candidate); } SyncOps; @@ -222,7 +223,8 @@ SyncPostCheckpoint(void) /* Unlink the file */ if (syncsw[entry->tag.handler].sync_unlinkfiletag(&entry->tag, - path) < 0) + path, + SMGR_MARK_NONE) < 0) { /* * There's a race condition, when the database is dropped at the @@ -236,6 +238,20 @@ SyncPostCheckpoint(void) (errcode_for_file_access(), errmsg("could not remove file \"%s\": %m", path))); } + else if (syncsw[entry->tag.handler].sync_unlinkfiletag( + &entry->tag, path, + SMGR_MARK_UNCOMMITTED) < 0) + { + /* + * And we may have SMGR_MARK_UNCOMMITTED file. Remove it if the + * fork files has been successfully removed. It's ok if the file + * does not exist. + */ + if (errno != ENOENT) + ereport(WARNING, + (errcode_for_file_access(), + errmsg("could not remove file \"%s\": %m", path))); + } /* Mark the list entry as canceled, just in case */ entry->canceled = true; diff --git a/src/bin/pg_rewind/parsexlog.c b/src/bin/pg_rewind/parsexlog.c index 9143797458..b21d01d04a 100644 --- a/src/bin/pg_rewind/parsexlog.c +++ b/src/bin/pg_rewind/parsexlog.c @@ -407,6 +407,30 @@ extractPageInfo(XLogReaderState *record) * source system. */ } + else if (rmid == RM_SMGR_ID && rminfo == XLOG_SMGR_UNLINK) + { + /* + * We can safely ignore these. When we compare the sizes later on, + * we'll notice that they differ, and copy the missing tail from + * source system. + */ + } + else if (rmid == RM_SMGR_ID && rminfo == XLOG_SMGR_MARK) + { + /* + * We can safely ignore these. When we compare the sizes later on, + * we'll notice that they differ, and copy the missing tail from + * source system. + */ + } + else if (rmid == RM_SMGR_ID && rminfo == XLOG_SMGR_BUFPERSISTENCE) + { + /* + * We can safely ignore these. When we compare the sizes later on, + * we'll notice that they differ, and copy the missing tail from + * source system. + */ + } else if (rmid == RM_XACT_ID && ((rminfo & XLOG_XACT_OPMASK) == XLOG_XACT_COMMIT || (rminfo & XLOG_XACT_OPMASK) == XLOG_XACT_COMMIT_PREPARED || diff --git a/src/common/relpath.c b/src/common/relpath.c index 636c96efd3..1c19e16fea 100644 --- a/src/common/relpath.c +++ b/src/common/relpath.c @@ -139,9 +139,15 @@ GetDatabasePath(Oid dbNode, Oid spcNode) */ char * GetRelationPath(Oid dbNode, Oid spcNode, Oid relNode, - int backendId, ForkNumber forkNumber) + int backendId, ForkNumber forkNumber, char mark) { char *path; + char markstr[4]; + + if (mark == 0) + markstr[0] = 0; + else + snprintf(markstr, sizeof(markstr), ".%c", mark); if (spcNode == GLOBALTABLESPACE_OID) { @@ -149,10 +155,10 @@ GetRelationPath(Oid dbNode, Oid spcNode, Oid relNode, Assert(dbNode == 0); Assert(backendId == InvalidBackendId); if (forkNumber != MAIN_FORKNUM) - path = psprintf("global/%u_%s", - relNode, forkNames[forkNumber]); + path = psprintf("global/%u_%s%s", + relNode, forkNames[forkNumber], markstr); else - path = psprintf("global/%u", relNode); + path = psprintf("global/%u%s", relNode, markstr); } else if (spcNode == DEFAULTTABLESPACE_OID) { @@ -160,22 +166,22 @@ GetRelationPath(Oid dbNode, Oid spcNode, Oid relNode, if (backendId == InvalidBackendId) { if (forkNumber != MAIN_FORKNUM) - path = psprintf("base/%u/%u_%s", + path = psprintf("base/%u/%u_%s%s", dbNode, relNode, - forkNames[forkNumber]); + forkNames[forkNumber], markstr); else - path = psprintf("base/%u/%u", - dbNode, relNode); + path = psprintf("base/%u/%u%s", + dbNode, relNode, markstr); } else { if (forkNumber != MAIN_FORKNUM) - path = psprintf("base/%u/t%d_%u_%s", + path = psprintf("base/%u/t%d_%u_%s%s", dbNode, backendId, relNode, - forkNames[forkNumber]); + forkNames[forkNumber], markstr); else - path = psprintf("base/%u/t%d_%u", - dbNode, backendId, relNode); + path = psprintf("base/%u/t%d_%u%s", + dbNode, backendId, relNode, markstr); } } else @@ -184,27 +190,28 @@ GetRelationPath(Oid dbNode, Oid spcNode, Oid relNode, if (backendId == InvalidBackendId) { if (forkNumber != MAIN_FORKNUM) - path = psprintf("pg_tblspc/%u/%s/%u/%u_%s", + path = psprintf("pg_tblspc/%u/%s/%u/%u_%s%s", spcNode, TABLESPACE_VERSION_DIRECTORY, dbNode, relNode, - forkNames[forkNumber]); + forkNames[forkNumber], markstr); else - path = psprintf("pg_tblspc/%u/%s/%u/%u", + path = psprintf("pg_tblspc/%u/%s/%u/%u%s", spcNode, TABLESPACE_VERSION_DIRECTORY, - dbNode, relNode); + dbNode, relNode, markstr); } else { if (forkNumber != MAIN_FORKNUM) - path = psprintf("pg_tblspc/%u/%s/%u/t%d_%u_%s", + path = psprintf("pg_tblspc/%u/%s/%u/t%d_%u_%s%s", spcNode, TABLESPACE_VERSION_DIRECTORY, dbNode, backendId, relNode, - forkNames[forkNumber]); + forkNames[forkNumber], markstr); else - path = psprintf("pg_tblspc/%u/%s/%u/t%d_%u", + path = psprintf("pg_tblspc/%u/%s/%u/t%d_%u%s", spcNode, TABLESPACE_VERSION_DIRECTORY, - dbNode, backendId, relNode); + dbNode, backendId, relNode, markstr); } } + return path; } diff --git a/src/include/catalog/storage.h b/src/include/catalog/storage.h index 9ffc741913..d362d62ed2 100644 --- a/src/include/catalog/storage.h +++ b/src/include/catalog/storage.h @@ -23,6 +23,8 @@ extern int wal_skip_threshold; extern SMgrRelation RelationCreateStorage(RelFileNode rnode, char relpersistence); +extern void RelationCreateInitFork(Relation rel); +extern void RelationDropInitFork(Relation rel); extern void RelationDropStorage(Relation rel); extern void RelationPreserveStorage(RelFileNode rnode, bool atCommit); extern void RelationPreTruncate(Relation rel); @@ -41,6 +43,7 @@ extern void RestorePendingSyncs(char *startAddress); extern void smgrDoPendingDeletes(bool isCommit); extern void smgrDoPendingSyncs(bool isCommit, bool isParallelWorker); extern int smgrGetPendingDeletes(bool forCommit, RelFileNode **ptr); +extern void smgrDoPendingCleanups(bool isCommit); extern void AtSubCommit_smgr(void); extern void AtSubAbort_smgr(void); extern void PostPrepare_smgr(void); diff --git a/src/include/catalog/storage_xlog.h b/src/include/catalog/storage_xlog.h index 622de22b03..8139308634 100644 --- a/src/include/catalog/storage_xlog.h +++ b/src/include/catalog/storage_xlog.h @@ -18,17 +18,23 @@ #include "lib/stringinfo.h" #include "storage/block.h" #include "storage/relfilenode.h" +#include "storage/smgr.h" /* * Declarations for smgr-related XLOG records * - * Note: we log file creation and truncation here, but logging of deletion - * actions is handled by xact.c, because it is part of transaction commit. + * Note: we log file creation, truncation and buffer persistence change here, + * but logging of deletion actions is handled mainly by xact.c, because it is + * part of transaction commit in most cases. However, there's a case where + * init forks are deleted outside control of transaction. */ /* XLOG gives us high 4 bits */ #define XLOG_SMGR_CREATE 0x10 #define XLOG_SMGR_TRUNCATE 0x20 +#define XLOG_SMGR_UNLINK 0x30 +#define XLOG_SMGR_MARK 0x40 +#define XLOG_SMGR_BUFPERSISTENCE 0x50 typedef struct xl_smgr_create { @@ -36,6 +42,32 @@ typedef struct xl_smgr_create ForkNumber forkNum; } xl_smgr_create; +typedef struct xl_smgr_unlink +{ + RelFileNode rnode; + ForkNumber forkNum; +} xl_smgr_unlink; + +typedef enum smgr_mark_action +{ + XLOG_SMGR_MARK_CREATE = 'c', + XLOG_SMGR_MARK_UNLINK = 'u' +} smgr_mark_action; + +typedef struct xl_smgr_mark +{ + RelFileNode rnode; + ForkNumber forkNum; + StorageMarks mark; + smgr_mark_action action; +} xl_smgr_mark; + +typedef struct xl_smgr_bufpersistence +{ + RelFileNode rnode; + bool persistence; +} xl_smgr_bufpersistence; + /* flags for xl_smgr_truncate */ #define SMGR_TRUNCATE_HEAP 0x0001 #define SMGR_TRUNCATE_VM 0x0002 @@ -51,6 +83,12 @@ typedef struct xl_smgr_truncate } xl_smgr_truncate; extern void log_smgrcreate(const RelFileNode *rnode, ForkNumber forkNum); +extern void log_smgrunlink(const RelFileNode *rnode, ForkNumber forkNum); +extern void log_smgrcreatemark(const RelFileNode *rnode, ForkNumber forkNum, + StorageMarks mark); +extern void log_smgrunlinkmark(const RelFileNode *rnode, ForkNumber forkNum, + StorageMarks mark); +extern void log_smgrbufpersistence(const RelFileNode *rnode, bool persistence); extern void smgr_redo(XLogReaderState *record); extern void smgr_desc(StringInfo buf, XLogReaderState *record); diff --git a/src/include/common/relpath.h b/src/include/common/relpath.h index a4b5dc853b..a864c91614 100644 --- a/src/include/common/relpath.h +++ b/src/include/common/relpath.h @@ -67,7 +67,7 @@ extern int forkname_chars(const char *str, ForkNumber *fork); extern char *GetDatabasePath(Oid dbNode, Oid spcNode); extern char *GetRelationPath(Oid dbNode, Oid spcNode, Oid relNode, - int backendId, ForkNumber forkNumber); + int backendId, ForkNumber forkNumber, char mark); /* * Wrapper macros for GetRelationPath. Beware of multiple @@ -77,7 +77,7 @@ extern char *GetRelationPath(Oid dbNode, Oid spcNode, Oid relNode, /* First argument is a RelFileNode */ #define relpathbackend(rnode, backend, forknum) \ GetRelationPath((rnode).dbNode, (rnode).spcNode, (rnode).relNode, \ - backend, forknum) + backend, forknum, 0) /* First argument is a RelFileNode */ #define relpathperm(rnode, forknum) \ @@ -87,4 +87,9 @@ extern char *GetRelationPath(Oid dbNode, Oid spcNode, Oid relNode, #define relpath(rnode, forknum) \ relpathbackend((rnode).node, (rnode).backend, forknum) +/* First argument is a RelFileNodeBackend */ +#define markpath(rnode, forknum, mark) \ + GetRelationPath((rnode).node.dbNode, (rnode).node.spcNode, \ + (rnode).node.relNode, \ + (rnode).backend, forknum, mark) #endif /* RELPATH_H */ diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index dd01841c30..739b386216 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -206,6 +206,8 @@ extern void FlushRelationsAllBuffers(struct SMgrRelationData **smgrs, int nrels) extern void FlushDatabaseBuffers(Oid dbid); extern void DropRelFileNodeBuffers(struct SMgrRelationData *smgr_reln, ForkNumber *forkNum, int nforks, BlockNumber *firstDelBlock); +extern void SetRelationBuffersPersistence(struct SMgrRelationData *srel, + bool permanent, bool isRedo); extern void DropRelFileNodesAllBuffers(struct SMgrRelationData **smgr_reln, int nnodes); extern void DropDatabaseBuffers(Oid dbid); diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h index 29209e2724..8bf746bf45 100644 --- a/src/include/storage/fd.h +++ b/src/include/storage/fd.h @@ -185,6 +185,7 @@ extern ssize_t pg_pwritev_with_retry(int fd, extern int pg_truncate(const char *path, off_t length); extern void fsync_fname(const char *fname, bool isdir); extern int fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel); +extern int fsync_parent_path(const char *fname, int elevel); extern int durable_rename(const char *oldfile, const char *newfile, int loglevel); extern int durable_unlink(const char *fname, int loglevel); extern int durable_rename_excl(const char *oldfile, const char *newfile, int loglevel); diff --git a/src/include/storage/md.h b/src/include/storage/md.h index ffffa40db7..046afdb5fb 100644 --- a/src/include/storage/md.h +++ b/src/include/storage/md.h @@ -23,6 +23,10 @@ extern void mdinit(void); extern void mdopen(SMgrRelation reln); extern void mdclose(SMgrRelation reln, ForkNumber forknum); +extern void mdcreatemark(SMgrRelation reln, ForkNumber forknum, + StorageMarks mark, bool isRedo); +extern void mdunlinkmark(SMgrRelation reln, ForkNumber forknum, + StorageMarks mark, bool isRedo); extern void mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo); extern bool mdexists(SMgrRelation reln, ForkNumber forknum); extern void mdunlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo); @@ -41,12 +45,14 @@ extern void mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks); extern void mdimmedsync(SMgrRelation reln, ForkNumber forknum); +extern void ForgetRelationForkSyncRequests(RelFileNodeBackend rnode, + ForkNumber forknum); extern void ForgetDatabaseSyncRequests(Oid dbid); extern void DropRelationFiles(RelFileNode *delrels, int ndelrels, bool isRedo); /* md sync callbacks */ extern int mdsyncfiletag(const FileTag *ftag, char *path); -extern int mdunlinkfiletag(const FileTag *ftag, char *path); +extern int mdunlinkfiletag(const FileTag *ftag, char *path, StorageMarks mark); extern bool mdfiletagmatches(const FileTag *ftag, const FileTag *candidate); #endif /* MD_H */ diff --git a/src/include/storage/reinit.h b/src/include/storage/reinit.h index bf2c10d443..e399aec0c7 100644 --- a/src/include/storage/reinit.h +++ b/src/include/storage/reinit.h @@ -16,13 +16,15 @@ #define REINIT_H #include "common/relpath.h" - +#include "storage/smgr.h" extern void ResetUnloggedRelations(int op); -extern bool parse_filename_for_nontemp_relation(const char *name, - int *oidchars, ForkNumber *fork); +extern bool parse_filename_for_nontemp_relation(const char *name, int *oidchars, + ForkNumber *fork, + StorageMarks *mark); #define UNLOGGED_RELATION_CLEANUP 0x0001 -#define UNLOGGED_RELATION_INIT 0x0002 +#define UNLOGGED_RELATION_DROP_BUFFER 0x0002 +#define UNLOGGED_RELATION_INIT 0x0004 #endif /* REINIT_H */ diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index 052e0b8426..48e69ab69b 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -18,6 +18,18 @@ #include "storage/block.h" #include "storage/relfilenode.h" +/* + * Storage marks is a file of which existence suggests something about a + * file. The name of such files is "<filename>.<mark>", where the mark is one + * of the values of StorageMarks. Since ".<digit>" means segment files so don't + * use digits for the mark character. + */ +typedef enum StorageMarks +{ + SMGR_MARK_NONE = 0, + SMGR_MARK_UNCOMMITTED = 'u' /* the file is not committed yet */ +} StorageMarks; + /* * smgr.c maintains a table of SMgrRelation objects, which are essentially * cached file handles. An SMgrRelation is created (if not already present) @@ -85,7 +97,12 @@ extern void smgrclearowner(SMgrRelation *owner, SMgrRelation reln); extern void smgrclose(SMgrRelation reln); extern void smgrcloseall(void); extern void smgrclosenode(RelFileNodeBackend rnode); +extern void smgrcreatemark(SMgrRelation reln, ForkNumber forknum, + StorageMarks mark, bool isRedo); +extern void smgrunlinkmark(SMgrRelation reln, ForkNumber forknum, + StorageMarks mark, bool isRedo); extern void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo); +extern void smgrunlink(SMgrRelation reln, ForkNumber forknum, bool isRedo); extern void smgrdosyncall(SMgrRelation *rels, int nrels); extern void smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo); extern void smgrextend(SMgrRelation reln, ForkNumber forknum, diff --git a/src/test/recovery/t/027_persistence_change.pl b/src/test/recovery/t/027_persistence_change.pl new file mode 100644 index 0000000000..261c4cf943 --- /dev/null +++ b/src/test/recovery/t/027_persistence_change.pl @@ -0,0 +1,263 @@ + +# Copyright (c) 2021, PostgreSQL Global Development Group + +# Test relation persistence change +use strict; +use warnings; +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; +use Test::More tests => 30; +use IPC::Run qw(pump finish timer); +use Config; + +my $data_unit = 2000; + +# Initialize primary node. +my $node = PostgreSQL::Test::Cluster->new('node'); +$node->init; +# we don't want checkpointing +$node->append_conf('postgresql.conf', qq( +checkpoint_timeout = '24h' +)); +$node->start; +create($node); + +my $relfilenodes1 = relfilenodes(); + +# correctly recover empty tables +$node->stop('immediate'); +$node->start; +insert($node, 0, $data_unit, 0); + +# data persists after a crash +$node->stop('immediate'); +$node->start; +checkdataloss($data_unit, 'crash logged 1'); + +set_unlogged($node); +# SET UNLOGGED shouldn't change relfilenode +my $relfilenodes2 = relfilenodes(); +checkrelfilenodes($relfilenodes1, $relfilenodes2, 'logged->unlogged'); + +# data cleanly vanishes after a crash +$node->stop('immediate'); +$node->start; +checkdataloss(0, 'crash unlogged'); + +insert($node, 0, $data_unit, 0); +set_logged($node); + +$node->stop('immediate'); +$node->start; +# SET LOGGED shouldn't change relfilenode and data should survive the crash +my $relfilenodes3 = relfilenodes(); +checkrelfilenodes($relfilenodes2, $relfilenodes3, 'unlogged->logged'); +checkdataloss($data_unit, 'crash logged 2'); + +# unlogged insert -> graceful stop +set_unlogged($node); +insert($node, $data_unit, $data_unit, 0); +$node->stop; +$node->start; +checkdataloss($data_unit * 2, 'unlogged graceful restart'); + +# crash during transaction +set_logged($node); +$node->stop('immediate'); +$node->start; +insert($node, $data_unit * 2, $data_unit, 0); + +my $h; + +# insert(,,,1) requires IO::Pty. Skip the test if the module is not +# available, but do the insert to make the expected situation for the +# later tests. +eval { require IO::Pty; }; +if ($@) +{ + insert($node, $data_unit * 3, $data_unit, 0); + ok (1, 'SKIPPED: IO::Pty is needed'); + ok (1, 'SKIPPED: IO::Pty is needed'); +} +else +{ + $h = insert($node, $data_unit * 3, $data_unit, 1); ## this is aborted +} + +$node->stop('immediate'); + +# finishing $h stalls this case, just tear it off. +$h = undef; + +# check if indexes are working +$node->start; +# drop first half of data to reduce run time +$node->safe_psql('postgres', 'DELETE FROM t WHERE bt < ' . $data_unit * 2); +check($node, $data_unit * 2, $data_unit * 3 - 1, 'final check'); + +sub create +{ + my ($node) = @_; + + $node->psql('postgres', qq( + CREATE TABLE t (bt int, gin int[], gist point, hash int, + brin int, spgist point); + CREATE INDEX i_bt ON t USING btree (bt); + CREATE INDEX i_gin ON t USING gin (gin); + CREATE INDEX i_gist ON t USING gist (gist); + CREATE INDEX i_hash ON t USING hash (hash); + CREATE INDEX i_brin ON t USING brin (brin); + CREATE INDEX i_spgist ON t USING spgist (spgist);)); +} + + +sub insert +{ + my ($node, $st, $num, $interactive) = @_; + my $ed = $st + $num - 1; + my $query = qq(BEGIN; +INSERT INTO t + (SELECT i, ARRAY[i, i * 2], point(i, i * 2), i, i, point(i, i) + FROM generate_series($st, $ed) i); +); + + if ($interactive) + { + my $in = ''; + my $out = ''; + my $timer = timer(10); + + my $h = $node->interactive_psql('postgres', \$in, \$out, $timer); + like($out, qr/psql/, "print startup banner"); + + $in .= "$query\n"; + pump $h until ($out =~ /[\n\r]+INSERT 0 $num[\n\r]+/ || + $timer->is_expired); + ok(($out =~ /[\n\r]+INSERT 0 $num[\n\r]+/), "inserted-$st-$num"); + return $h + # the trasaction is not terminated + } + else + { + $node->psql('postgres', $query . "COMMIT;"); + return undef; + } +} + +sub check +{ + my ($node, $st, $ed, $head) = @_; + my $num_data = $ed - $st + 1; + + is($node->safe_psql('postgres', qq( + SET enable_seqscan TO true; + SET enable_indexscan TO false; + SELECT COUNT(*) FROM t, generate_series($st, $ed) i + WHERE bt = i)), + $num_data, "$head: heap is not broken"); + is($node->safe_psql('postgres', qq( + SET enable_seqscan TO false; + SET enable_indexscan TO true; + SELECT COUNT(*) FROM t, generate_series($st, $ed) i + WHERE bt = i)), + $num_data, "$head: btree is not broken"); + is($node->safe_psql('postgres', qq( + SET enable_seqscan TO false; + SET enable_indexscan TO true; + SELECT COUNT(*) FROM t, generate_series($st, $ed) i + WHERE gin = ARRAY[i, i * 2];)), + $num_data, "$head: gin is not broken"); + is($node->safe_psql('postgres', qq( + SET enable_seqscan TO false; + SET enable_indexscan TO true; + SELECT COUNT(*) FROM t, generate_series($st, $ed) i + WHERE gist <@ box(point(i-0.5, i*2-0.5),point(i+0.5, i*2+0.5));)), + $num_data, "$head: gist is not broken"); + is($node->safe_psql('postgres', qq( + SET enable_seqscan TO false; + SET enable_indexscan TO true; + SELECT COUNT(*) FROM t, generate_series($st, $ed) i + WHERE hash = i;)), + $num_data, "$head: hash is not broken"); + is($node->safe_psql('postgres', qq( + SET enable_seqscan TO false; + SET enable_indexscan TO true; + SELECT COUNT(*) FROM t, generate_series($st, $ed) i + WHERE brin = i;)), + $num_data, "$head: brin is not broken"); + is($node->safe_psql('postgres', qq( + SET enable_seqscan TO false; + SET enable_indexscan TO true; + SELECT COUNT(*) FROM t, generate_series($st, $ed) i + WHERE spgist <@ box(point(i-0.5,i-0.5),point(i+0.5,i+0.5));)), + $num_data, "$head: spgist is not broken"); +} + +sub set_unlogged +{ + my ($node) = @_; + + $node->psql('postgres', qq( + ALTER TABLE t SET UNLOGGED; +)); +} + +sub set_logged +{ + my ($node) = @_; + + $node->psql('postgres', qq( + ALTER TABLE t SET LOGGED; +)); +} + +sub relfilenodes +{ + my $result = $node->safe_psql('postgres', qq{ + SELECT relname, relfilenode FROM pg_class + WHERE relname + IN ('t', 'i_bt','i_gin','i_gist','i_hash','i_brin','i_spgist');}); + + my %relfilenodes; + + foreach my $l (split(/\n/, $result)) + { + die "unexpected format: $l" if ($l !~ /^([^|]+)\|([0-9]+)$/); + $relfilenodes{$1} = $2; + } + + # the number must correspond to the in list above + is (scalar %relfilenodes, 7, "number of relations is correct"); + + return \%relfilenodes; +} + +sub checkrelfilenodes +{ + my ($rnodes1, $rnodes2, $s) = @_; + + foreach my $n (keys %{$rnodes1}) + { + if ($n eq 'i_gist') + { + # persistence of GiST index is not changed in-place + isnt($rnodes1->{$n}, $rnodes2->{$n}, + "$s: relfilenode is changed: $n"); + } + else + { + # otherwise all relations are processed in-place + is($rnodes1->{$n}, $rnodes2->{$n}, + "$s: relfilenode is not changed: $n"); + } + } +} + +sub checkdataloss +{ + my ($expected, $s) = @_; + + is($node->safe_psql('postgres', "SELECT count(*) FROM t;"), $expected, + "$s: data in table t is in the expected state"); +} -- 2.27.0 From f621f134e7c48b52a65e3b60ad42c0259e226a40 Mon Sep 17 00:00:00 2001 From: Kyotaro Horiguchi <horikyoga.ntt@gmail.com> Date: Wed, 11 Nov 2020 23:21:09 +0900 Subject: [PATCH v17 2/2] New command ALTER TABLE ALL IN TABLESPACE SET LOGGED/UNLOGGED To ease invoking ALTER TABLE SET LOGGED/UNLOGGED, this command changes relation persistence of all tables in the specified tablespace. --- doc/src/sgml/ref/alter_table.sgml | 15 +++ src/backend/commands/tablecmds.c | 140 +++++++++++++++++++++++ src/backend/nodes/copyfuncs.c | 16 +++ src/backend/nodes/equalfuncs.c | 15 +++ src/backend/parser/gram.y | 42 +++++++ src/backend/tcop/utility.c | 11 ++ src/include/commands/tablecmds.h | 2 + src/include/nodes/nodes.h | 1 + src/include/nodes/parsenodes.h | 10 ++ src/test/regress/expected/tablespace.out | 76 ++++++++++++ src/test/regress/sql/tablespace.sql | 41 +++++++ 11 files changed, 369 insertions(+) diff --git a/doc/src/sgml/ref/alter_table.sgml b/doc/src/sgml/ref/alter_table.sgml index a76e2e7322..6f108980af 100644 --- a/doc/src/sgml/ref/alter_table.sgml +++ b/doc/src/sgml/ref/alter_table.sgml @@ -33,6 +33,8 @@ ALTER TABLE [ IF EXISTS ] <replaceable class="parameter">name</replaceable> SET SCHEMA <replaceable class="parameter">new_schema</replaceable> ALTER TABLE ALL IN TABLESPACE <replaceable class="parameter">name</replaceable> [ OWNED BY <replaceable class="parameter">role_name</replaceable>[, ... ] ] SET TABLESPACE <replaceable class="parameter">new_tablespace</replaceable> [ NOWAIT ] +ALTER TABLE ALL IN TABLESPACE <replaceable class="parameter">name</replaceable> [ OWNED BY <replaceable class="parameter">role_name</replaceable>[, ... ] ] + SET { LOGGED | UNLOGGED } [ NOWAIT ] ALTER TABLE [ IF EXISTS ] <replaceable class="parameter">name</replaceable> ATTACH PARTITION <replaceable class="parameter">partition_name</replaceable> { FOR VALUES <replaceable class="parameter">partition_bound_spec</replaceable>| DEFAULT } ALTER TABLE [ IF EXISTS ] <replaceable class="parameter">name</replaceable> @@ -753,6 +755,19 @@ WITH ( MODULUS <replaceable class="parameter">numeric_literal</replaceable>, REM (see <xref linkend="sql-createtable-unlogged"/>). It cannot be applied to a temporary table. </para> + + <para> + All tables in the current database in a tablespace can be changed by using + the <literal>ALL IN TABLESPACE</literal> form, which will lock all tables + to be changed first and then change each one. This form also supports + <literal>OWNED BY</literal>, which will only change tables owned by the + roles specified. If the <literal>NOWAIT</literal> option is specified + then the command will fail if it is unable to acquire all of the locks + required immediately. The <literal>information_schema</literal> + relations are not considered part of the system catalogs and will be + changed. See also + <link linkend="sql-createtablespace"><command>CREATE TABLESPACE</command></link>. + </para> </listitem> </varlistentry> diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 9e673ba68f..25bbdb5664 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -14769,6 +14769,146 @@ AlterTableMoveAll(AlterTableMoveAllStmt *stmt) return new_tablespaceoid; } +/* + * Alter Table ALL ... SET LOGGED/UNLOGGED + * + * Allows a user to change persistence of all objects in a given tablespace in + * the current database. Objects can be chosen based on the owner of the + * object also, to allow users to change persistene only their objects. The + * main permissions handling is done by the lower-level change persistence + * function. + * + * All to-be-modified objects are locked first. If NOWAIT is specified and the + * lock can't be acquired then we ereport(ERROR). + */ +void +AlterTableSetLoggedAll(AlterTableSetLoggedAllStmt *stmt) +{ + List *relations = NIL; + ListCell *l; + ScanKeyData key[1]; + Relation rel; + TableScanDesc scan; + HeapTuple tuple; + Oid tablespaceoid; + List *role_oids = roleSpecsToIds(stmt->roles); + + /* Ensure we were not asked to change something we can't */ + if (stmt->objtype != OBJECT_TABLE) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("only tables can be specified"))); + + /* Get the tablespace OID */ + tablespaceoid = get_tablespace_oid(stmt->tablespacename, false); + + /* + * Now that the checks are done, check if we should set either to + * InvalidOid because it is our database's default tablespace. + */ + if (tablespaceoid == MyDatabaseTableSpace) + tablespaceoid = InvalidOid; + + /* + * Walk the list of objects in the tablespace to pick up them. This will + * only find objects in our database, of course. + */ + ScanKeyInit(&key[0], + Anum_pg_class_reltablespace, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(tablespaceoid)); + + rel = table_open(RelationRelationId, AccessShareLock); + scan = table_beginscan_catalog(rel, 1, key); + while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL) + { + Form_pg_class relForm = (Form_pg_class) GETSTRUCT(tuple); + Oid relOid = relForm->oid; + + /* + * Do not pick-up objects in pg_catalog as part of this, if an admin + * really wishes to do so, they can issue the individual ALTER + * commands directly. + * + * Also, explicitly avoid any shared tables, temp tables, or TOAST + * (TOAST will be changed with the main table). + */ + if (IsCatalogNamespace(relForm->relnamespace) || + relForm->relisshared || + isAnyTempNamespace(relForm->relnamespace) || + IsToastNamespace(relForm->relnamespace)) + continue; + + /* Only pick up the object type requested */ + if (relForm->relkind != RELKIND_RELATION) + continue; + + /* Check if we are only picking-up objects owned by certain roles */ + if (role_oids != NIL && !list_member_oid(role_oids, relForm->relowner)) + continue; + + /* + * Handle permissions-checking here since we are locking the tables + * and also to avoid doing a bunch of work only to fail part-way. Note + * that permissions will also be checked by AlterTableInternal(). + * + * Caller must be considered an owner on the table of which we're going + * to change persistence. + */ + if (!pg_class_ownercheck(relOid, GetUserId())) + aclcheck_error(ACLCHECK_NOT_OWNER, get_relkind_objtype(get_rel_relkind(relOid)), + NameStr(relForm->relname)); + + if (stmt->nowait && + !ConditionalLockRelationOid(relOid, AccessExclusiveLock)) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_IN_USE), + errmsg("aborting because lock on relation \"%s.%s\" is not available", + get_namespace_name(relForm->relnamespace), + NameStr(relForm->relname)))); + else + LockRelationOid(relOid, AccessExclusiveLock); + + /* + * Add to our list of objects of which we're going to change + * persistence. + */ + relations = lappend_oid(relations, relOid); + } + + table_endscan(scan); + table_close(rel, AccessShareLock); + + if (relations == NIL) + ereport(NOTICE, + (errcode(ERRCODE_NO_DATA_FOUND), + errmsg("no matching relations in tablespace \"%s\" found", + tablespaceoid == InvalidOid ? "(database default)" : + get_tablespace_name(tablespaceoid)))); + + /* + * Everything is locked, loop through and change persistence of all of the + * relations. + */ + foreach(l, relations) + { + List *cmds = NIL; + AlterTableCmd *cmd = makeNode(AlterTableCmd); + + if (stmt->logged) + cmd->subtype = AT_SetLogged; + else + cmd->subtype = AT_SetUnLogged; + + cmds = lappend(cmds, cmd); + + EventTriggerAlterTableStart((Node *) stmt); + /* OID is set by AlterTableInternal */ + AlterTableInternal(lfirst_oid(l), cmds, false); + EventTriggerAlterTableEnd(); + } +} + static void index_copy_data(Relation rel, RelFileNode newrnode) { diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index 90b5da51c9..bbc9eb28e6 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -4273,6 +4273,19 @@ _copyAlterTableMoveAllStmt(const AlterTableMoveAllStmt *from) return newnode; } +static AlterTableSetLoggedAllStmt * +_copyAlterTableSetLoggedAllStmt(const AlterTableSetLoggedAllStmt *from) +{ + AlterTableSetLoggedAllStmt *newnode = makeNode(AlterTableSetLoggedAllStmt); + + COPY_STRING_FIELD(tablespacename); + COPY_SCALAR_FIELD(objtype); + COPY_SCALAR_FIELD(logged); + COPY_SCALAR_FIELD(nowait); + + return newnode; +} + static CreateExtensionStmt * _copyCreateExtensionStmt(const CreateExtensionStmt *from) { @@ -5639,6 +5652,9 @@ copyObjectImpl(const void *from) case T_AlterTableMoveAllStmt: retval = _copyAlterTableMoveAllStmt(from); break; + case T_AlterTableSetLoggedAllStmt: + retval = _copyAlterTableSetLoggedAllStmt(from); + break; case T_CreateExtensionStmt: retval = _copyCreateExtensionStmt(from); break; diff --git a/src/backend/nodes/equalfuncs.c b/src/backend/nodes/equalfuncs.c index 06345da3ba..603bd2a044 100644 --- a/src/backend/nodes/equalfuncs.c +++ b/src/backend/nodes/equalfuncs.c @@ -1916,6 +1916,18 @@ _equalAlterTableMoveAllStmt(const AlterTableMoveAllStmt *a, return true; } +static bool +_equalAlterTableSetLoggedAllStmt(const AlterTableSetLoggedAllStmt *a, + const AlterTableSetLoggedAllStmt *b) +{ + COMPARE_STRING_FIELD(tablespacename); + COMPARE_SCALAR_FIELD(objtype); + COMPARE_SCALAR_FIELD(logged); + COMPARE_SCALAR_FIELD(nowait); + + return true; +} + static bool _equalCreateExtensionStmt(const CreateExtensionStmt *a, const CreateExtensionStmt *b) { @@ -3636,6 +3648,9 @@ equal(const void *a, const void *b) case T_AlterTableMoveAllStmt: retval = _equalAlterTableMoveAllStmt(a, b); break; + case T_AlterTableSetLoggedAllStmt: + retval = _equalAlterTableSetLoggedAllStmt(a, b); + break; case T_CreateExtensionStmt: retval = _equalCreateExtensionStmt(a, b); break; diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index b5966712ce..682684c2ee 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -1984,6 +1984,48 @@ AlterTableStmt: n->nowait = $13; $$ = (Node *)n; } + | ALTER TABLE ALL IN_P TABLESPACE name SET LOGGED opt_nowait + { + AlterTableSetLoggedAllStmt *n = + makeNode(AlterTableSetLoggedAllStmt); + n->tablespacename = $6; + n->objtype = OBJECT_TABLE; + n->logged = true; + n->nowait = $9; + $$ = (Node *)n; + } + | ALTER TABLE ALL IN_P TABLESPACE name OWNED BY role_list SET LOGGED opt_nowait + { + AlterTableSetLoggedAllStmt *n = + makeNode(AlterTableSetLoggedAllStmt); + n->tablespacename = $6; + n->objtype = OBJECT_TABLE; + n->roles = $9; + n->logged = true; + n->nowait = $12; + $$ = (Node *)n; + } + | ALTER TABLE ALL IN_P TABLESPACE name SET UNLOGGED opt_nowait + { + AlterTableSetLoggedAllStmt *n = + makeNode(AlterTableSetLoggedAllStmt); + n->tablespacename = $6; + n->objtype = OBJECT_TABLE; + n->logged = false; + n->nowait = $9; + $$ = (Node *)n; + } + | ALTER TABLE ALL IN_P TABLESPACE name OWNED BY role_list SET UNLOGGED opt_nowait + { + AlterTableSetLoggedAllStmt *n = + makeNode(AlterTableSetLoggedAllStmt); + n->tablespacename = $6; + n->objtype = OBJECT_TABLE; + n->roles = $9; + n->logged = false; + n->nowait = $12; + $$ = (Node *)n; + } | ALTER INDEX qualified_name alter_table_cmds { AlterTableStmt *n = makeNode(AlterTableStmt); diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index 83e4e37c78..750e0ecac9 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -162,6 +162,7 @@ ClassifyUtilityCommandAsReadOnly(Node *parsetree) case T_AlterTSConfigurationStmt: case T_AlterTSDictionaryStmt: case T_AlterTableMoveAllStmt: + case T_AlterTableSetLoggedAllStmt: case T_AlterTableSpaceOptionsStmt: case T_AlterTableStmt: case T_AlterTypeStmt: @@ -1747,6 +1748,12 @@ ProcessUtilitySlow(ParseState *pstate, commandCollected = true; break; + case T_AlterTableSetLoggedAllStmt: + AlterTableSetLoggedAll((AlterTableSetLoggedAllStmt *) parsetree); + /* commands are stashed in AlterTableSetLoggedAll */ + commandCollected = true; + break; + case T_DropStmt: ExecDropStmt((DropStmt *) parsetree, isTopLevel); /* no commands stashed for DROP */ @@ -2669,6 +2676,10 @@ CreateCommandTag(Node *parsetree) tag = AlterObjectTypeCommandTag(((AlterTableMoveAllStmt *) parsetree)->objtype); break; + case T_AlterTableSetLoggedAllStmt: + tag = AlterObjectTypeCommandTag(((AlterTableSetLoggedAllStmt *) parsetree)->objtype); + break; + case T_AlterTableStmt: tag = AlterObjectTypeCommandTag(((AlterTableStmt *) parsetree)->objtype); break; diff --git a/src/include/commands/tablecmds.h b/src/include/commands/tablecmds.h index 5d4037f26e..c381dad3e5 100644 --- a/src/include/commands/tablecmds.h +++ b/src/include/commands/tablecmds.h @@ -42,6 +42,8 @@ extern void AlterTableInternal(Oid relid, List *cmds, bool recurse); extern Oid AlterTableMoveAll(AlterTableMoveAllStmt *stmt); +extern void AlterTableSetLoggedAll(AlterTableSetLoggedAllStmt *stmt); + extern ObjectAddress AlterTableNamespace(AlterObjectSchemaStmt *stmt, Oid *oldschema); diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h index f9ddafd345..a83c66cad6 100644 --- a/src/include/nodes/nodes.h +++ b/src/include/nodes/nodes.h @@ -429,6 +429,7 @@ typedef enum NodeTag T_AlterCollationStmt, T_CallStmt, T_AlterStatsStmt, + T_AlterTableSetLoggedAllStmt, /* * TAGS FOR PARSE TREE NODES (parsenodes.h) diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index 3e9bdc781f..f19bd3c569 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -2351,6 +2351,16 @@ typedef struct AlterTableMoveAllStmt bool nowait; } AlterTableMoveAllStmt; +typedef struct AlterTableSetLoggedAllStmt +{ + NodeTag type; + char *tablespacename; + ObjectType objtype; /* Object type to move */ + List *roles; /* List of roles to change objects of */ + bool logged; + bool nowait; +} AlterTableSetLoggedAllStmt; + /* ---------------------- * Create/Alter Extension Statements * ---------------------- diff --git a/src/test/regress/expected/tablespace.out b/src/test/regress/expected/tablespace.out index 2dfbcfdebe..c02afdcb68 100644 --- a/src/test/regress/expected/tablespace.out +++ b/src/test/regress/expected/tablespace.out @@ -943,5 +943,81 @@ drop cascades to table testschema.asexecute drop cascades to table testschema.part drop cascades to table testschema.atable drop cascades to table testschema.tablespace_acl +-- +-- Check persistence change in a tablespace +CREATE SCHEMA testschema; +GRANT CREATE ON SCHEMA testschema TO regress_tablespace_user1; +CREATE TABLESPACE regress_tablespace LOCATION ''; +GRANT CREATE ON TABLESPACE regress_tablespace TO regress_tablespace_user1; +CREATE TABLE testschema.lsu(a int) TABLESPACE regress_tablespace; +CREATE UNLOGGED TABLE testschema.usu(a int) TABLESPACE regress_tablespace; +CREATE TABLE testschema._lsu(a int) TABLESPACE pg_default; +CREATE UNLOGGED TABLE testschema._usu(a int) TABLESPACE pg_default; +SET ROLE regress_tablespace_user1; +CREATE TABLE testschema.lu1(a int) TABLESPACE regress_tablespace; +CREATE UNLOGGED TABLE testschema.uu1(a int) TABLESPACE regress_tablespace; +CREATE TABLE testschema._lu1(a int) TABLESPACE pg_default; +CREATE UNLOGGED TABLE testschema._uu1(a int) TABLESPACE pg_default; +SELECT relname, t.spcname, relpersistence + FROM pg_class c LEFT JOIN pg_tablespace t ON (c.reltablespace = t.oid) + WHERE relnamespace = 'testschema'::regnamespace ORDER BY spcname, c.oid; + relname | spcname | relpersistence +---------+--------------------+---------------- + lsu | regress_tablespace | p + usu | regress_tablespace | u + lu1 | regress_tablespace | p + uu1 | regress_tablespace | u + _lsu | | p + _usu | | u + _lu1 | | p + _uu1 | | u +(8 rows) + +ALTER TABLE ALL IN TABLESPACE regress_tablespace + OWNED BY regress_tablespace_user1 SET LOGGED; +SELECT relname, t.spcname, relpersistence + FROM pg_class c LEFT JOIN pg_tablespace t ON (c.reltablespace = t.oid) + WHERE relnamespace = 'testschema'::regnamespace ORDER BY spcname, c.oid; + relname | spcname | relpersistence +---------+--------------------+---------------- + lsu | regress_tablespace | p + usu | regress_tablespace | u + lu1 | regress_tablespace | p + uu1 | regress_tablespace | p + _lsu | | p + _usu | | u + _lu1 | | p + _uu1 | | u +(8 rows) + +RESET ROLE; +ALTER TABLE ALL IN TABLESPACE regress_tablespace SET UNLOGGED; +SELECT relname, t.spcname, relpersistence + FROM pg_class c LEFT JOIN pg_tablespace t ON (c.reltablespace = t.oid) + WHERE relnamespace = 'testschema'::regnamespace ORDER BY spcname, c.oid; + relname | spcname | relpersistence +---------+--------------------+---------------- + lsu | regress_tablespace | u + usu | regress_tablespace | u + lu1 | regress_tablespace | u + uu1 | regress_tablespace | u + _lsu | | p + _usu | | u + _lu1 | | p + _uu1 | | u +(8 rows) + +-- Should succeed +DROP SCHEMA testschema CASCADE; +NOTICE: drop cascades to 8 other objects +DETAIL: drop cascades to table testschema.lsu +drop cascades to table testschema.usu +drop cascades to table testschema._lsu +drop cascades to table testschema._usu +drop cascades to table testschema.lu1 +drop cascades to table testschema.uu1 +drop cascades to table testschema._lu1 +drop cascades to table testschema._uu1 +DROP TABLESPACE regress_tablespace; DROP ROLE regress_tablespace_user1; DROP ROLE regress_tablespace_user2; diff --git a/src/test/regress/sql/tablespace.sql b/src/test/regress/sql/tablespace.sql index 896f05cea3..4e407eb8c0 100644 --- a/src/test/regress/sql/tablespace.sql +++ b/src/test/regress/sql/tablespace.sql @@ -419,5 +419,46 @@ DROP TABLESPACE regress_tblspace_renamed; DROP SCHEMA testschema CASCADE; + +-- +-- Check persistence change in a tablespace +CREATE SCHEMA testschema; +GRANT CREATE ON SCHEMA testschema TO regress_tablespace_user1; +CREATE TABLESPACE regress_tablespace LOCATION ''; +GRANT CREATE ON TABLESPACE regress_tablespace TO regress_tablespace_user1; + +CREATE TABLE testschema.lsu(a int) TABLESPACE regress_tablespace; +CREATE UNLOGGED TABLE testschema.usu(a int) TABLESPACE regress_tablespace; +CREATE TABLE testschema._lsu(a int) TABLESPACE pg_default; +CREATE UNLOGGED TABLE testschema._usu(a int) TABLESPACE pg_default; +SET ROLE regress_tablespace_user1; +CREATE TABLE testschema.lu1(a int) TABLESPACE regress_tablespace; +CREATE UNLOGGED TABLE testschema.uu1(a int) TABLESPACE regress_tablespace; +CREATE TABLE testschema._lu1(a int) TABLESPACE pg_default; +CREATE UNLOGGED TABLE testschema._uu1(a int) TABLESPACE pg_default; + +SELECT relname, t.spcname, relpersistence + FROM pg_class c LEFT JOIN pg_tablespace t ON (c.reltablespace = t.oid) + WHERE relnamespace = 'testschema'::regnamespace ORDER BY spcname, c.oid; + +ALTER TABLE ALL IN TABLESPACE regress_tablespace + OWNED BY regress_tablespace_user1 SET LOGGED; + +SELECT relname, t.spcname, relpersistence + FROM pg_class c LEFT JOIN pg_tablespace t ON (c.reltablespace = t.oid) + WHERE relnamespace = 'testschema'::regnamespace ORDER BY spcname, c.oid; + +RESET ROLE; + +ALTER TABLE ALL IN TABLESPACE regress_tablespace SET UNLOGGED; + +SELECT relname, t.spcname, relpersistence + FROM pg_class c LEFT JOIN pg_tablespace t ON (c.reltablespace = t.oid) + WHERE relnamespace = 'testschema'::regnamespace ORDER BY spcname, c.oid; + +-- Should succeed +DROP SCHEMA testschema CASCADE; +DROP TABLESPACE regress_tablespace; + DROP ROLE regress_tablespace_user1; DROP ROLE regress_tablespace_user2; -- 2.27.0
pgsql-hackers by date: