From 791cbcc97a896f425155bc5d6126d1721f43a737 Mon Sep 17 00:00:00 2001 From: Amul Sul Date: Fri, 19 Jun 2020 06:29:36 -0400 Subject: [PATCH v15 1/3] Implement wal prohibit state using global barrier. Implementation: 1. When a user changes the server state to WAL-Prohibited by calling the pg_prohibit_wal(true) SQL function, the WAL prohibit state counter in shared memory is advanced to the in-progress state and the checkpointer process is signaled. The checkpointer, noticing the pending state transition, emits the global barrier request and, once the transition has completed, wakes up the backend that requested the state change. The final state is written to the control file to make it persistent across system restarts. 2. When a backend receives the WAL-Prohibited barrier while it is in a transaction that has already been assigned an XID, the backend is terminated by throwing FATAL (XXX: needs more discussion). 3. Otherwise, if the backend is running a transaction without a valid XID, nothing special needs to be done right now; it simply calls ResetLocalXLogInsertAllowed() so that any future WAL insert first checks XLogInsertAllowed(), which sets the read-only state appropriately. 4. A new transaction (in an existing or in a new backend) starts as a read-only transaction. 5. The autovacuum launcher as well as the checkpointer will not do anything in the WAL-Prohibited server state until someone wakes them up, e.g. a backend might later request that the system be put back to read-write. 6. At shutdown in WAL-Prohibited mode, the shutdown checkpoint and xlog rotation are skipped. Starting up again will perform crash recovery (XXX: needs some discussion as well), but the end-of-recovery checkpoint, the necessary WAL writes and the control file update needed to start the server normally are skipped and performed later, when the system is changed to WAL-Permitted mode. Until then "Database cluster state" will be "in crash recovery". 7. 
Altering the WAL-Prohibited mode is not allowed on a standby server, except in the "in crash recovery" state described in the previous point. 8. The presence of RecoverySignalFile implicitly and permanently pulls the server out of the read-only (WAL prohibited) state. 9. Add a system_is_read_only GUC to show the system state; it is true when the system is WAL prohibited or in recovery. --- src/backend/access/transam/Makefile | 1 + src/backend/access/transam/walprohibit.c | 404 +++++++++++++++++++++++ src/backend/access/transam/xact.c | 37 ++- src/backend/access/transam/xlog.c | 336 ++++++++++++++----- src/backend/catalog/system_views.sql | 2 + src/backend/postmaster/autovacuum.c | 9 +- src/backend/postmaster/bgwriter.c | 2 +- src/backend/postmaster/checkpointer.c | 19 ++ src/backend/postmaster/pgstat.c | 6 + src/backend/storage/ipc/ipci.c | 6 + src/backend/storage/ipc/procsignal.c | 24 +- src/backend/storage/sync/sync.c | 30 +- src/backend/tcop/utility.c | 1 + src/backend/utils/misc/guc.c | 26 ++ src/bin/pg_controldata/pg_controldata.c | 2 + src/include/access/walprohibit.h | 104 ++++++ src/include/access/xlog.h | 4 + src/include/catalog/pg_control.h | 3 + src/include/catalog/pg_proc.dat | 4 + src/include/pgstat.h | 2 + src/include/postmaster/bgwriter.h | 2 + src/include/storage/procsignal.h | 7 +- src/tools/pgindent/typedefs.list | 1 + 23 files changed, 894 insertions(+), 138 deletions(-) create mode 100644 src/backend/access/transam/walprohibit.c create mode 100644 src/include/access/walprohibit.h diff --git a/src/backend/access/transam/Makefile b/src/backend/access/transam/Makefile index 595e02de722..b5322a69954 100644 --- a/src/backend/access/transam/Makefile +++ b/src/backend/access/transam/Makefile @@ -26,6 +26,7 @@ OBJS = \ twophase.o \ twophase_rmgr.o \ varsup.o \ + walprohibit.o \ xact.o \ xlog.o \ xlogarchive.o \ diff --git a/src/backend/access/transam/walprohibit.c b/src/backend/access/transam/walprohibit.c new file mode 100644 index 00000000000..1de27529b69 --- 
/dev/null +++ b/src/backend/access/transam/walprohibit.c @@ -0,0 +1,404 @@ +/*------------------------------------------------------------------------- + * + * walprohibit.c + * PostgreSQL write-ahead log prohibit states + * + * + * Portions Copyright (c) 2020, PostgreSQL Global Development Group + * + * src/backend/access/transam/walprohibit.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/walprohibit.h" +#include "fmgr.h" +#include "pgstat.h" +#include "port/atomics.h" +#include "postmaster/bgwriter.h" +#include "postmaster/interrupt.h" +#include "storage/condition_variable.h" +#include "storage/procsignal.h" +#include "storage/shmem.h" +#include "utils/acl.h" +#include "utils/fmgroids.h" +#include "utils/fmgrprotos.h" + +/* + * Shared-memory WAL prohibit state + */ +typedef struct WALProhibitData +{ + /* + * Indicates current WAL prohibit state counter and the last two bits of + * this counter indicates current wal prohibit state. + */ + pg_atomic_uint32 wal_prohibit_counter; + + /* Signaled when requested WAL prohibit state changes */ + ConditionVariable wal_prohibit_cv; +} WALProhibitData; + +static WALProhibitData *WALProhibit = NULL; + +static void CompleteWALProhibitChange(void); +static uint32 GetWALProhibitCounter(void); + +/* + * ProcessBarrierWALProhibit() + * + * Handle WAL prohibit state change request. + */ +bool +ProcessBarrierWALProhibit(void) +{ + /* + * Kill off any transactions that have an XID *before* allowing the system + * to go WAL prohibit state. + */ + if (FullTransactionIdIsValid(GetTopFullTransactionIdIfAny())) + { + /* + * Should be here only while transiting towards the WAL prohibit state. 
+ */ + Assert(GetWALProhibitState(GetWALProhibitCounter()) == + WALPROHIBIT_STATE_GOING_READ_ONLY); + + /* + * XXX: Kill off the whole session by throwing FATAL instead of + * killing transaction by throwing ERROR due to following reasons that + * need be thought: + * + * 1. Due to some presents challenges with the wire protocol, we could + * not simply kill of idle transaction. + * + * 2. If we are here in subtransaction then the ERROR will kill the + * current subtransaction only. In the case of invalidations, that + * might be good enough, but for XID assignment it's not, because + * assigning an XID to a subtransaction also causes higher + * sub-transaction levels and the parent transaction to get XIDs. + */ + ereport(FATAL, + (errcode(ERRCODE_ACTIVE_SQL_TRANSACTION), + errmsg("system is now read only"), + errhint("Sessions with open write transactions must be terminated."))); + } + + /* Return to "check" state */ + ResetLocalXLogInsertAllowed(); + + return true; +} + +/* + * pg_prohibit_wal() + * + * SQL callable function to toggle WAL prohibit state. + */ +Datum +pg_prohibit_wal(PG_FUNCTION_ARGS) +{ + bool walprohibit = PG_GETARG_BOOL(0); + uint32 wal_prohibit_counter; + uint32 target_counter_value; + bool increment; + + /* + * WAL prohibit state changes not allowed during recovery except the crash + * recovery case. In startup process, we skip the end-of-recovery + * checkpoint, and related wal write operation while booting read only (wal + * prohibited) server, which should be completed before changing the system + * state to read write. To disallow any other backend from writing a wal + * record before the end of crash recovery checkpoint finishes, we let the + * server in recovery mode. 
+ */ + if (!StartupCrashRecoveryIsPending()) + PreventCommandDuringRecovery("pg_prohibit_wal()"); + + wal_prohibit_counter = GetWALProhibitCounter(); + + /* For more detail on state transition, see comment for WALProhibitState */ + switch (GetWALProhibitState(wal_prohibit_counter)) + { + case WALPROHIBIT_STATE_READ_WRITE: + if (!walprohibit) + PG_RETURN_VOID(); /* already in the requested state */ + increment = true; + break; + + case WALPROHIBIT_STATE_GOING_READ_WRITE: + if (walprohibit) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("system state transition to read write is already in progress"), + errhint("Try after sometime again."))); + increment = false; + break; + + case WALPROHIBIT_STATE_READ_ONLY: + if (walprohibit) + PG_RETURN_VOID(); /* already in the requested state */ + increment = true; + break; + + case WALPROHIBIT_STATE_GOING_READ_ONLY: + if (!walprohibit) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("system state transition to read only is already in progress"), + errhint("Try after sometime again."))); + increment = false; + break; + } + + if (increment) + wal_prohibit_counter = + pg_atomic_add_fetch_u32(&WALProhibit->wal_prohibit_counter, 1); + target_counter_value = wal_prohibit_counter + 1; + +#ifdef USE_ASSERT_CHECKING + { + /* Target state must be the requested one. */ + WALProhibitState target_state = GetWALProhibitState(target_counter_value); + Assert((walprohibit && target_state == WALPROHIBIT_STATE_READ_ONLY) || + (!walprohibit && target_state == WALPROHIBIT_STATE_READ_WRITE)); + } +#endif + + /* + * If in a standalone backend, just do it ourselves. + */ + if (!IsPostmasterEnvironment) + { + CompleteWALProhibitChange(); + PG_RETURN_VOID(); + } + + /* + * It is not a final state since we yet to convey this WAL prohibit state to + * all backend. Checkpointer will do that and update the shared memory wal + * prohibit state counter. 
+ * + * If the end-of-recovery checkpoint and required wal write to start the + * server normally, has been skipped previously, then do that now. + */ + if (StartupCrashRecoveryIsPending()) + { + /* Should to be here while changing system to read write. */ + Assert(!walprohibit); + PerformPendingStartupOperations(); + } + else if (!SendSignalToCheckpointer(SIGUSR1)) + { + ereport(WARNING, + (errmsg("could not change system state now"), + errdetail("Checkpointer might not running."), + errhint("The relaunched checkpointer process will automatically complete the system state change."))); + PG_RETURN_VOID(); /* no wait */ + } + + /* Wait for the state counter in shared memory to change. */ + ConditionVariablePrepareToSleep(&WALProhibit->wal_prohibit_cv); + + /* + * We'll be done once the wal prohibit state counter reaches to target + * value. + */ + while (GetWALProhibitCounter() < target_counter_value) + ConditionVariableSleep(&WALProhibit->wal_prohibit_cv, + WAIT_EVENT_WALPROHIBIT_STATE_CHANGE); + ConditionVariableCancelSleep(); + + PG_RETURN_VOID(); +} + +/* + * Is the system still in WAL prohibited state? + */ +bool +IsWALProhibited(void) +{ + WALProhibitState cur_state = GetWALProhibitState(GetWALProhibitCounter()); + + return (cur_state != WALPROHIBIT_STATE_READ_WRITE && + cur_state != WALPROHIBIT_STATE_GOING_READ_WRITE); +} + +/* + * CompleteWALProhibitChange() + * + * Complete the requested WAL prohibit state transition. + */ +static void +CompleteWALProhibitChange(void) +{ + uint64 barrier_gen; + bool wal_prohibited; + + /* Fetch shared wal prohibit state counter value */ + uint32 wal_prohibit_counter = GetWALProhibitCounter(); + WALProhibitState cur_state = GetWALProhibitState(wal_prohibit_counter); + + /* + * Must be called from checkpointer. Otherwise, it must be single-user + * backend. 
+ */ + Assert(AmCheckpointerProcess() || !IsPostmasterEnvironment); + + /* Should be here in transition state */ + Assert(cur_state == WALPROHIBIT_STATE_GOING_READ_ONLY || + cur_state == WALPROHIBIT_STATE_GOING_READ_WRITE); + + /* + * WAL prohibit state change is initiated. We need to complete the state + * transition by setting requested WAL prohibit state in all backends. + */ + elog(DEBUG1, "waiting for backends to adopt requested WAL prohibit state change"); + + /* Emit global barrier */ + barrier_gen = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_WALPROHIBIT); + WaitForProcSignalBarrier(barrier_gen); + + /* Return to "check" state */ + ResetLocalXLogInsertAllowed(); + + /* + * We don't need to be too aggressive to flush XLOG data right away since + * XLogFlush is not restricted in the wal prohibited state as well. + */ + XLogFlush(GetXLogWriteRecPtr()); + + /* + * There won't be any other process for the final state transition so that + * the shared wal prohibit state counter shouldn't have been changed by now. + */ + Assert(GetWALProhibitCounter() == wal_prohibit_counter); + + /* Increment wal prohibit state counter in share memory. */ + wal_prohibit_counter = + pg_atomic_add_fetch_u32(&WALProhibit->wal_prohibit_counter, 1); + + /* Should have set counter for the final state */ + cur_state = GetWALProhibitState(wal_prohibit_counter); + Assert(cur_state == WALPROHIBIT_STATE_READ_ONLY || + cur_state == WALPROHIBIT_STATE_READ_WRITE); + + wal_prohibited = (cur_state == WALPROHIBIT_STATE_READ_ONLY); + + /* Update the control file to make state persistent */ + SetControlFileWALProhibitFlag(wal_prohibited); + + if (wal_prohibited) + ereport(LOG, (errmsg("system is now read only"))); + else + ereport(LOG, (errmsg("system is now read write"))); + + /* Wake up all backends waiting on this. */ + ConditionVariableBroadcast(&WALProhibit->wal_prohibit_cv); +} + +/* + * ProcessWALProhibitStateChangeRequest() + * + * Checkpointer will complete wal prohibit state change request. 
+ */ +void +ProcessWALProhibitStateChangeRequest(void) +{ + WALProhibitState cur_state; + + /* + * Must be called by the checkpointer process. Checkpointer has to be sure + * it has processed all pending wal prohibit state change requests as soon + * as possible. Since CreateCheckPoint and ProcessSyncRequests sometimes + * runs in non-checkpointer processes, do nothing if not checkpointer. + */ + if (!AmCheckpointerProcess()) + return; + + cur_state = GetWALProhibitState(GetWALProhibitCounter()); + + while (cur_state != WALPROHIBIT_STATE_READ_WRITE) + { + if (cur_state == WALPROHIBIT_STATE_GOING_READ_ONLY || + cur_state == WALPROHIBIT_STATE_GOING_READ_WRITE) + { + CompleteWALProhibitChange(); + } + else if (cur_state == WALPROHIBIT_STATE_READ_ONLY) + { + int rc; + + /* + * Don't let Checkpointer process do anything until someone wakes it + * up. For example a backend might later on request us to put the + * system back to read-write state. + */ + rc = WaitLatch(MyLatch, WL_LATCH_SET | WL_POSTMASTER_DEATH, -1, + WAIT_EVENT_WALPROHIBIT_STATE); + + /* + * If the postmaster dies or a shutdown request is received, just + * bail out. + */ + if (rc & WL_POSTMASTER_DEATH || ShutdownRequestPending) + return; + } + + /* Get the latest state */ + cur_state = GetWALProhibitState(GetWALProhibitCounter()); + } +} + +/* + * GetWALProhibitCounter() + * + * Atomically return the current server WAL prohibited state counter. + */ +static uint32 +GetWALProhibitCounter(void) +{ + return pg_atomic_read_u32(&WALProhibit->wal_prohibit_counter); +} + +/* + * WALProhibitStateCounterInit() + * + * Initialization of shared wal prohibit state counter. + */ +void +WALProhibitStateCounterInit(bool wal_prohibited) +{ + WALProhibitState new_state; + + Assert(AmStartupProcess() || !IsPostmasterEnvironment); + + new_state = wal_prohibited ? 
+ WALPROHIBIT_STATE_READ_ONLY : WALPROHIBIT_STATE_READ_WRITE; + + pg_atomic_init_u32(&WALProhibit->wal_prohibit_counter, (uint32) new_state); +} + +/* + * WALProhibitStateShmemInit() + * + * Initialization of shared memory for WAL prohibit state. + */ +void +WALProhibitStateShmemInit(void) +{ + bool found; + + WALProhibit = (WALProhibitData *) + ShmemInitStruct("WAL Prohibit State", + sizeof(WALProhibitData), + &found); + + if (!found) + { + /* First time through ... */ + memset(WALProhibit, 0, sizeof(WALProhibitData)); + ConditionVariableInit(&WALProhibit->wal_prohibit_cv); + } +} diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 17fbc41bbb7..e37dbada4db 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -1962,23 +1962,28 @@ StartTransaction(void) Assert(s->prevSecContext == 0); /* - * Make sure we've reset xact state variables + * Reset xact state variables. * - * If recovery is still in progress, mark this transaction as read-only. - * We have lower level defences in XLogInsert and elsewhere to stop us - * from modifying data during recovery, but this gives the normal - * indication to the user that the transaction is read-only. - */ - if (RecoveryInProgress()) - { - s->startedInRecovery = true; - XactReadOnly = true; - } - else - { - s->startedInRecovery = false; - XactReadOnly = DefaultXactReadOnly; - } + * If it is not currently possible to insert write-ahead log records, + * either because we are still in recovery or because ALTER SYSTEM READ + * ONLY has been executed, force this to be a read-only transaction. + * We have lower level defences in XLogBeginInsert() and elsewhere to stop + * us from modifying data during recovery when !XLogInsertAllowed(), but + * this gives the normal indication to the user that the transaction is + * read-only. 
+ * + * On the other hand, we only need to set the startedInRecovery flag when + * the transaction started during recovery, and not when WAL is otherwise + * prohibited. This information is used by RelationGetIndexScan() to + * decide whether to permit (1) relying on existing killed-tuple markings + * and (2) further killing of index tuples. Even when WAL is prohibited + * on the master, it's still the master, so the former is OK; and since + * killing index tuples doesn't generate WAL, the latter is also OK. + * See comments in RelationGetIndexScan() and MarkBufferDirtyHint(). + */ + XactReadOnly = DefaultXactReadOnly || !XLogInsertAllowed(); + s->startedInRecovery = RecoveryInProgress(); + XactDeferrable = DefaultXactDeferrable; XactIsoLevel = DefaultXactIsoLevel; forceSyncCommit = false; diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 8e3b5df7dcb..0df545fc612 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -31,6 +31,7 @@ #include "access/timeline.h" #include "access/transam.h" #include "access/twophase.h" +#include "access/walprohibit.h" #include "access/xact.h" #include "access/xlog_internal.h" #include "access/xlogarchive.h" @@ -247,9 +248,10 @@ static bool LocalPromoteIsTriggered = false; * 0: unconditionally not allowed to insert XLOG * -1: must check RecoveryInProgress(); disallow until it is false * Most processes start with -1 and transition to 1 after seeing that recovery - * is not in progress. But we can also force the value for special cases. - * The coding in XLogInsertAllowed() depends on the first two of these states - * being numerically the same as bool true and false. + * is not in progress or the server state is not a WAL prohibited state. But + * we can also force the value for special cases. The coding in + * XLogInsertAllowed() depends on the first two of these states being + * numerically the same as bool true and false. 
*/ static int LocalXLogInsertAllowed = -1; @@ -730,6 +732,14 @@ typedef struct XLogCtlData */ XLogRecPtr lastFpwDisableRecPtr; + /* + * startupCrashRecoveryPending indicates if the last recovery checkpoint and + * required wal write to start the normal server are skipped. Lock + * protection is not needed since it isn't going to be read and/or updated + * concurrently. + */ + bool startupCrashRecoveryPending; + slock_t info_lck; /* locks shared variables shown above */ } XLogCtlData; @@ -978,6 +988,13 @@ static void WALInsertLockAcquireExclusive(void); static void WALInsertLockRelease(void); static void WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt); +static bool XLogAcceptWrites(bool needChkpt, bool bgwriterLaunched, + bool localPromoteIsTriggered, + XLogReaderState *xlogreader, + bool archiveRecoveryRequested, + TimeLineID endOfLogTLI, XLogRecPtr endOfLog, + TimeLineID thisTimeLineID); + /* * Insert an XLOG record represented by an already-constructed chain of data * chunks. This is a low-level routine; to construct the WAL record header @@ -6187,6 +6204,16 @@ SetLatestXTime(TimestampTz xtime) SpinLockRelease(&XLogCtl->info_lck); } +/* + * Return value of startupCrashRecoveryPending flag. + */ +bool +StartupCrashRecoveryIsPending(void) +{ + /* Read the latest value */ + return ((volatile XLogCtlData *) XLogCtl)->startupCrashRecoveryPending; +} + /* * Fetch timestamp of latest processed commit/abort record. */ @@ -6392,6 +6419,7 @@ StartupXLOG(void) XLogPageReadPrivate private; bool promoted = false; struct stat st; + bool needChkpt = false; /* * We should have an aux process resource owner to use, and we should not @@ -6541,13 +6569,22 @@ StartupXLOG(void) (errmsg("starting archive recovery"))); } - /* - * Take ownership of the wakeup latch if we're going to sleep during - * recovery. - */ if (ArchiveRecoveryRequested) + { + /* + * Take ownership of the wakeup latch if we're going to sleep during + * recovery. 
+ */ OwnLatch(&XLogCtl->recoveryWakeupLatch); + /* + * Since archive recovery is requested, we cannot be in a read only (wal + * prohibited) state. + */ + ControlFile->wal_prohibited = false; + + } + /* Set up XLOG reader facility */ MemSet(&private, 0, sizeof(XLogPageReadPrivate)); xlogreader = @@ -7785,16 +7822,130 @@ StartupXLOG(void) XLogCtl->LogwrtRqst.Flush = EndOfLog; /* - * Update full_page_writes in shared memory and write an XLOG_FPW_CHANGE - * record before resource manager writes cleanup WAL records or checkpoint - * record is written. + * Preallocate additional log files, if wanted. + */ + PreallocXlogFiles(EndOfLog); + + /* + * Okay, we're officially UP. + */ + needChkpt = InRecovery; + InRecovery = false; + + /* start the archive_timeout timer and LSN running */ + XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL); + XLogCtl->lastSegSwitchLSN = EndOfLog; + + /* also initialize latestCompletedXid, to nextXid - 1 */ + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid; + FullTransactionIdRetreat(&ShmemVariableCache->latestCompletedXid); + LWLockRelease(ProcArrayLock); + + /* + * Start up subtrans, if not already done for hot standby. (commit + * timestamps are started below, if necessary.) */ + if (standbyState == STANDBY_DISABLED) + StartupSUBTRANS(oldestActiveXID); + + /* + * Update full_page_writes in shared memory. XLOG_FPW_CHANGE record will be + * written later in XLogAcceptWrites. + */ Insert->fullPageWrites = lastFullPageWrites; + + /* + * Perform end of recovery actions for any SLRUs that need it. + */ + TrimCLOG(); + TrimMultiXact(); + + /* Reload shared-memory state for prepared transactions */ + RecoverPreparedTransactions(); + + /* + * Shutdown the recovery environment. 
This must occur after + * RecoverPreparedTransactions(), see notes for lock_twophase_recover() + */ + if (standbyState != STANDBY_DISABLED) + ShutdownRecoveryTransactionEnvironment(); + + /* + * Before enabling WAL insertion, initialize WAL prohibit state in shared + * memory that will decide the further WAL insert should be allowed or not. + */ + WALProhibitStateCounterInit(ControlFile->wal_prohibited); + + /* + * Skip wal writes and end of recovery checkpoint if the system is in WAL + * prohibited state. + */ + if (IsWALProhibited()) + { + /* + * We do start in recovery since at shutdown in wal prohibit state we + * skip shutdown checkpoint, that forces recovery on restart. + */ + Assert(needChkpt); + XLogCtl->startupCrashRecoveryPending = true; + + ereport(LOG, + (errmsg("skipping startup checkpoint because the system is read only"))); + } + else + { + promoted = XLogAcceptWrites(needChkpt, bgwriterLaunched, + LocalPromoteIsTriggered, xlogreader, + ArchiveRecoveryRequested, + EndOfLogTLI, EndOfLog, ThisTimeLineID); + } + + /* Shut down xlogreader */ + if (readFile >= 0) + { + close(readFile); + readFile = -1; + } + XLogReaderFree(xlogreader); + + /* + * If there were cascading standby servers connected to us, nudge any wal + * sender processes to notice that we've been promoted. + */ + WalSndWakeup(); + + /* + * If this was a promotion, request an (online) checkpoint now. This + * isn't required for consistency, but the last restartpoint might be far + * back, and in case of a crash, recovering from it might take a longer + * than is appropriate now that we're not in standby mode anymore. + */ + if (promoted) + RequestCheckpoint(CHECKPOINT_FORCE); +} + +/* + * It is an end part of StartupXLOG doing wal writes necessary before starting a + * server normally. Only the Startup process can call this function directly. 
+ */ +static bool +XLogAcceptWrites(bool needChkpt, bool bgwriterLaunched, + bool localPromoteIsTriggered, XLogReaderState *xlogreader, + bool archiveRecoveryRequested, TimeLineID endOfLogTLI, + XLogRecPtr endOfLog, TimeLineID thisTimeLineID) +{ + bool promoted = false; + + /* + * Write an XLOG_FPW_CHANGE record before resource manager writes cleanup + * WAL records or checkpoint record is written. + */ LocalSetXLogInsertAllowed(); UpdateFullPageWrites(); LocalXLogInsertAllowed = -1; - if (InRecovery) + if (needChkpt) { /* * Perform a checkpoint to update all our recovery activity to disk. @@ -7812,15 +7963,17 @@ StartupXLOG(void) */ if (bgwriterLaunched) { - if (LocalPromoteIsTriggered) + if (localPromoteIsTriggered) { - checkPointLoc = ControlFile->checkPoint; + XLogRecord *record; /* * Confirm the last checkpoint is available for us to recover * from if we fail. */ - record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, false); + record = ReadCheckpointRecord(xlogreader, + ControlFile->checkPoint, + 1, false); if (record != NULL) { promoted = true; @@ -7849,8 +8002,10 @@ StartupXLOG(void) CreateCheckPoint(CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IMMEDIATE); } - if (ArchiveRecoveryRequested) + if (archiveRecoveryRequested) { + Assert(AmStartupProcess()); + /* * And finally, execute the recovery_end_command, if any. */ @@ -7868,7 +8023,7 @@ StartupXLOG(void) * pre-allocated files containing garbage. In any case, they are not * part of the new timeline's history so we don't need them. */ - RemoveNonParentXlogFiles(EndOfLog, ThisTimeLineID); + RemoveNonParentXlogFiles(endOfLog, thisTimeLineID); /* * If the switch happened in the middle of a segment, what to do with @@ -7899,14 +8054,14 @@ StartupXLOG(void) * restored from the archive to begin with, it's expected to have a * .done file). 
*/ - if (XLogSegmentOffset(EndOfLog, wal_segment_size) != 0 && + if (XLogSegmentOffset(endOfLog, wal_segment_size) != 0 && XLogArchivingActive()) { char origfname[MAXFNAMELEN]; XLogSegNo endLogSegNo; - XLByteToPrevSeg(EndOfLog, endLogSegNo, wal_segment_size); - XLogFileName(origfname, EndOfLogTLI, endLogSegNo, wal_segment_size); + XLByteToPrevSeg(endOfLog, endLogSegNo, wal_segment_size); + XLogFileName(origfname, endOfLogTLI, endLogSegNo, wal_segment_size); if (!XLogArchiveIsReadyOrDone(origfname)) { @@ -7914,7 +8069,7 @@ StartupXLOG(void) char partialfname[MAXFNAMELEN]; char partialpath[MAXPGPATH]; - XLogFilePath(origpath, EndOfLogTLI, endLogSegNo, wal_segment_size); + XLogFilePath(origpath, endOfLogTLI, endLogSegNo, wal_segment_size); snprintf(partialfname, MAXFNAMELEN, "%s.partial", origfname); snprintf(partialpath, MAXPGPATH, "%s.partial", origpath); @@ -7930,63 +8085,13 @@ StartupXLOG(void) } } - /* - * Preallocate additional log files, if wanted. - */ - PreallocXlogFiles(EndOfLog); - - /* - * Okay, we're officially UP. - */ - InRecovery = false; - - /* start the archive_timeout timer and LSN running */ - XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL); - XLogCtl->lastSegSwitchLSN = EndOfLog; - - /* also initialize latestCompletedXid, to nextXid - 1 */ - LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); - ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid; - FullTransactionIdRetreat(&ShmemVariableCache->latestCompletedXid); - LWLockRelease(ProcArrayLock); - - /* - * Start up subtrans, if not already done for hot standby. (commit - * timestamps are started below, if necessary.) - */ - if (standbyState == STANDBY_DISABLED) - StartupSUBTRANS(oldestActiveXID); - - /* - * Perform end of recovery actions for any SLRUs that need it. - */ - TrimCLOG(); - TrimMultiXact(); - - /* Reload shared-memory state for prepared transactions */ - RecoverPreparedTransactions(); - - /* - * Shutdown the recovery environment. 
This must occur after - * RecoverPreparedTransactions(), see notes for lock_twophase_recover() - */ - if (standbyState != STANDBY_DISABLED) - ShutdownRecoveryTransactionEnvironment(); - - /* Shut down xlogreader */ - if (readFile >= 0) - { - close(readFile); - readFile = -1; - } - XLogReaderFree(xlogreader); - /* * If any of the critical GUCs have changed, log them before we allow * backends to write WAL. */ LocalSetXLogInsertAllowed(); XLogReportParameters(); + LocalXLogInsertAllowed = -1; /* * Local WAL inserts enabled, so it's time to finish initialization of @@ -8020,20 +8125,51 @@ StartupXLOG(void) UpdateControlFile(); LWLockRelease(ControlFileLock); - /* - * If there were cascading standby servers connected to us, nudge any wal - * sender processes to notice that we've been promoted. - */ - WalSndWakeup(); + return promoted; +} + +/* + * This function should be called only if wal write accepts operation in the + * startup process had skipped due to read only system state (wal prohibited) + * and should be called only once while changing the system to read write. + */ +void +PerformPendingStartupOperations(void) +{ + Assert(StartupCrashRecoveryIsPending()); /* - * If this was a promotion, request an (online) checkpoint now. This - * isn't required for consistency, but the last restartpoint might be far - * back, and in case of a crash, recovering from it might take a longer - * than is appropriate now that we're not in standby mode anymore. + * When we do skip the end of recovery checkpoint we always have + * InRecovery = true, for more detail see the place where + * startupCrashRecoveryPending flag is set in StartupXLOG. Now we are + * performing this operation here which means we do have got the + * necessary auxiliary process therefore bgwriterLaunched is also true. 
+ * This end of recovery checkpoint will never be skipped if + * ArchiveRecoveryRequested = true, at that time system implicitly get + * out for the wal prohibit state and does allows all the wal write + * operation in the startup. Therefore ArchiveRecoveryRequested is false + * here, and value for the rest of the parameters will be inapplicable. */ - if (promoted) - RequestCheckpoint(CHECKPOINT_FORCE); + (void) XLogAcceptWrites(true, /* needChkpt */ + true, /* bgwriterLaunched */ + false, /* localPromoteIsTriggered */ + NULL, /* xlogreader */ + false, /* archiveRecoveryRequested */ + 0, /* endOfLogTLI */ + InvalidXLogRecPtr, /* endOfLog */ + 0); /* thisTimeLineID */ + + XLogCtl->startupCrashRecoveryPending = false; +} + +/* Set ControlFile's WAL prohibit flag */ +void +SetControlFileWALProhibitFlag(bool walProhibited) +{ + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + ControlFile->wal_prohibited = walProhibited; + UpdateControlFile(); + LWLockRelease(ControlFileLock); } /* @@ -8251,9 +8387,9 @@ HotStandbyActiveInReplay(void) /* * Is this process allowed to insert new WAL records? * - * Ordinarily this is essentially equivalent to !RecoveryInProgress(). - * But we also have provisions for forcing the result "true" or "false" - * within specific processes regardless of the global state. + * Ordinarily this is essentially equivalent to !RecoveryInProgress() and + * !IsWALProhibited(). But we also have provisions for forcing the result + * "true" or "false" within specific processes regardless of the global state. */ bool XLogInsertAllowed(void) @@ -8272,9 +8408,20 @@ XLogInsertAllowed(void) if (RecoveryInProgress()) return false; + /* Or, in WAL prohibited state */ + if (IsWALProhibited()) + { + /* + * Set it to "unconditionally false" to avoid checking until it gets + * reset. + */ + LocalXLogInsertAllowed = 0; + return false; + } + /* - * On exit from recovery, reset to "unconditionally true", since there is - * no need to keep checking. 
+ * On exit from recovery or WAL prohibited state, reset to "unconditionally + * true", since there is no need to keep checking. */ LocalXLogInsertAllowed = 1; return true; @@ -8296,6 +8443,12 @@ LocalSetXLogInsertAllowed(void) InitXLOGAccess(); } +void +ResetLocalXLogInsertAllowed(void) +{ + LocalXLogInsertAllowed = -1; +} + /* * Subroutine to try to fetch and validate a prior checkpoint record. * @@ -8585,9 +8738,13 @@ ShutdownXLOG(int code, Datum arg) */ WalSndWaitStopping(); + /* + * The restartpoint, checkpoint, or xlog rotation will be performed if the + * WAL writing is permitted. + */ if (RecoveryInProgress()) CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE); - else + else if (XLogInsertAllowed()) { /* * If archiving is enabled, rotate the last XLOG file so that all the @@ -8600,6 +8757,9 @@ ShutdownXLOG(int code, Datum arg) CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE); } + else + ereport(LOG, + (errmsg("skipping shutdown checkpoint because the system is read only"))); } /* diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index fa58afd9d78..da154254a4d 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -1525,6 +1525,8 @@ REVOKE EXECUTE ON FUNCTION pg_stat_file(text,boolean) FROM public; REVOKE EXECUTE ON FUNCTION pg_ls_dir(text) FROM public; REVOKE EXECUTE ON FUNCTION pg_ls_dir(text,boolean,boolean) FROM public; +REVOKE EXECUTE ON FUNCTION pg_prohibit_wal(bool) FROM public; + -- -- We also set up some things as accessible to standard roles. -- diff --git a/src/backend/postmaster/autovacuum.c b/src/backend/postmaster/autovacuum.c index 8da5e5c9c39..0fb9748a527 100644 --- a/src/backend/postmaster/autovacuum.c +++ b/src/backend/postmaster/autovacuum.c @@ -700,10 +700,13 @@ AutoVacLauncherMain(int argc, char *argv[]) /* * There are some conditions that we need to check before trying to - * start a worker. 
First, we need to make sure that there is a worker - * slot available. Second, we need to make sure that no other worker - * failed while starting up. + * start a worker. First, the system is not read only i.e. wal writes + * permitted. Second, we need to make sure that there is a worker slot + * available. Third, we need to make sure that no other worker failed + * while starting up. */ + if (!XLogInsertAllowed()) + continue; current_time = GetCurrentTimestamp(); LWLockAcquire(AutovacuumLock, LW_SHARED); diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c index 715d5195bb6..5157237731c 100644 --- a/src/backend/postmaster/bgwriter.c +++ b/src/backend/postmaster/bgwriter.c @@ -278,7 +278,7 @@ BackgroundWriterMain(void) * Checkpointer, when active, is barely ever in its mainloop and thus * makes it hard to log regularly. */ - if (XLogStandbyInfoActive() && !RecoveryInProgress()) + if (XLogStandbyInfoActive() && XLogInsertAllowed()) { TimestampTz timeout = 0; TimestampTz now = GetCurrentTimestamp(); diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c index 54a818bf611..033f8a7bdd9 100644 --- a/src/backend/postmaster/checkpointer.c +++ b/src/backend/postmaster/checkpointer.c @@ -39,6 +39,7 @@ #include #include +#include "access/walprohibit.h" #include "access/xlog.h" #include "access/xlog_internal.h" #include "libpq/pqsignal.h" @@ -351,6 +352,7 @@ CheckpointerMain(void) */ AbsorbSyncRequests(); HandleCheckpointerInterrupts(); + ProcessWALProhibitStateChangeRequest(); /* * Detect a pending checkpoint request by checking whether the flags @@ -688,6 +690,9 @@ CheckpointWriteDelay(int flags, double progress) if (!AmCheckpointerProcess()) return; + /* Check for wal prohibit state change request */ + ProcessWALProhibitStateChangeRequest(); + /* * Perform the usual duties and take a nap, unless we're behind schedule, * in which case we just try to catch up as quickly as possible. 
@@ -1335,3 +1340,17 @@ FirstCallSinceLastCheckpoint(void) return FirstCall; } + +/* + * SendSignalToCheckpointer allows a process to send a signal to the checkpointer process. + */ +bool +SendSignalToCheckpointer(int signum) +{ + if (CheckpointerShmem->checkpointer_pid == 0) + return false; + + if (kill(CheckpointerShmem->checkpointer_pid, signum) != 0) + return false; + return true; /* Signaled checkpointer successfully */ +} diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index f75b52719dd..63d52825497 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -4307,6 +4307,12 @@ pgstat_get_wait_io(WaitEventIO w) case WAIT_EVENT_REPLICATION_SLOT_WRITE: event_name = "ReplicationSlotWrite"; break; + case WAIT_EVENT_WALPROHIBIT_STATE: + event_name = "SystemWALProhibitState"; + break; + case WAIT_EVENT_WALPROHIBIT_STATE_CHANGE: + event_name = "SystemWALProhibitStateChange"; + break; case WAIT_EVENT_SLRU_FLUSH_SYNC: event_name = "SLRUFlushSync"; break; diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index f9bbe97b507..c3c5ec641cf 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -22,6 +22,7 @@ #include "access/subtrans.h" #include "access/syncscan.h" #include "access/twophase.h" +#include "access/walprohibit.h" #include "commands/async.h" #include "miscadmin.h" #include "pgstat.h" @@ -222,6 +223,11 @@ CreateSharedMemoryAndSemaphores(void) MultiXactShmemInit(); InitBufferPool(); + /* + * Set up wal prohibit shared state + */ + WALProhibitStateShmemInit(); + /* * Set up lock manager */ diff --git a/src/backend/storage/ipc/procsignal.c b/src/backend/storage/ipc/procsignal.c index c43cdd685b4..31383a11d08 100644 --- a/src/backend/storage/ipc/procsignal.c +++ b/src/backend/storage/ipc/procsignal.c @@ -18,6 +18,7 @@ #include #include "access/parallel.h" +#include "access/walprohibit.h" #include "port/pg_bitutils.h" #include "commands/async.h" #include
"miscadmin.h" @@ -98,7 +99,6 @@ static volatile ProcSignalSlot *MyProcSignalSlot = NULL; static bool CheckProcSignal(ProcSignalReason reason); static void CleanupProcSignalState(int status, Datum arg); static void ResetProcSignalBarrierBits(uint32 flags); -static bool ProcessBarrierPlaceholder(void); /* * ProcSignalShmemSize @@ -538,8 +538,8 @@ ProcessProcSignalBarrier(void) type = (ProcSignalBarrierType) pg_rightmost_one_pos32(flags); switch (type) { - case PROCSIGNAL_BARRIER_PLACEHOLDER: - processed = ProcessBarrierPlaceholder(); + case PROCSIGNAL_BARRIER_WALPROHIBIT: + processed = ProcessBarrierWALProhibit(); break; } @@ -604,24 +604,6 @@ ResetProcSignalBarrierBits(uint32 flags) InterruptPending = true; } -static bool -ProcessBarrierPlaceholder(void) -{ - /* - * XXX. This is just a placeholder until the first real user of this - * machinery gets committed. Rename PROCSIGNAL_BARRIER_PLACEHOLDER to - * PROCSIGNAL_BARRIER_SOMETHING_ELSE where SOMETHING_ELSE is something - * appropriately descriptive. Get rid of this function and instead have - * ProcessBarrierSomethingElse. Most likely, that function should live in - * the file pertaining to that subsystem, rather than here. - * - * The return value should be 'true' if the barrier was successfully - * absorbed and 'false' if not. Note that returning 'false' can lead to - * very frequent retries, so try hard to make that an uncommon case. - */ - return true; -} - /* * CheckProcSignal - check to see if a particular reason has been * signaled, and clear the signal flag. 
Should be called after receiving diff --git a/src/backend/storage/sync/sync.c b/src/backend/storage/sync/sync.c index fe143151cc5..1c7b40563b5 100644 --- a/src/backend/storage/sync/sync.c +++ b/src/backend/storage/sync/sync.c @@ -21,6 +21,7 @@ #include "access/commit_ts.h" #include "access/clog.h" #include "access/multixact.h" +#include "access/walprohibit.h" #include "access/xlog.h" #include "access/xlogutils.h" #include "commands/tablespace.h" @@ -236,10 +237,17 @@ SyncPostCheckpoint(void) pfree(entry); /* - * As in ProcessSyncRequests, we don't want to stop absorbing fsync + * As in ProcessSyncRequests, we don't want to delay wal prohibit change * requests for a long time when there are many deletions to be done. - * We can safely call AbsorbSyncRequests() at this point in the loop - * (note it might try to delete list entries). + * They need to be checked and processed by checkpointer as soon as + * possible. + */ + ProcessWALProhibitStateChangeRequest(); + + /* + * Similarly, we don't want to stop absorbing fsync requests for a + * long time. We can safely call AbsorbSyncRequests() at this point in + * the loop (note it might try to delete list entries). */ if (--absorb_counter <= 0) { @@ -278,6 +286,9 @@ ProcessSyncRequests(void) if (!pendingOps) elog(ERROR, "cannot sync without a pendingOps table"); + /* Check for wal prohibit state change request for checkpointer */ + ProcessWALProhibitStateChangeRequest(); + /* * If we are in the checkpointer, the sync had better include all fsync * requests that were queued by backends up to this point. The tightest @@ -336,6 +347,13 @@ ProcessSyncRequests(void) { int failures; + /* + * We don't want to delay wal prohibit change requests for a long time when + * there are many fsync requests to be processed. They need to be checked + * and processed by checkpointer as soon as possible. + */ + ProcessWALProhibitStateChangeRequest(); + /* * If the entry is new then don't process it this time; it is new.
* Note "continue" bypasses the hash-remove call at the bottom of the @@ -422,6 +440,12 @@ ProcessSyncRequests(void) errmsg("could not fsync file \"%s\" but retrying: %m", path))); + /* + * For the same reason mentioned previously for the wal prohibit + * state change request check. + */ + ProcessWALProhibitStateChangeRequest(); + /* * Absorb incoming requests and check to see if a cancel * arrived for this relation fork. diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index 05bb698cf45..582f99609d9 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -19,6 +19,7 @@ #include "access/htup_details.h" #include "access/reloptions.h" #include "access/twophase.h" +#include "access/walprohibit.h" #include "access/xact.h" #include "access/xlog.h" #include "catalog/catalog.h" diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index eafdb1118ed..8fb43cc55ca 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -227,6 +227,7 @@ static bool check_recovery_target_lsn(char **newval, void **extra, GucSource sou static void assign_recovery_target_lsn(const char *newval, void *extra); static bool check_primary_slot_name(char **newval, void **extra, GucSource source); static bool check_default_with_oids(bool *newval, void **extra, GucSource source); +static const char *show_system_is_read_only(void); /* Private functions in guc-file.l that need to be called from guc.c */ static ConfigVariable *ProcessConfigFileInternal(GucContext context, @@ -618,6 +619,7 @@ static char *recovery_target_string; static char *recovery_target_xid_string; static char *recovery_target_name_string; static char *recovery_target_lsn_string; +static bool system_is_read_only; /* should be static, but commands/variable.c needs to get at this */ @@ -2048,6 +2050,18 @@ static struct config_bool ConfigureNamesBool[] = NULL, NULL, NULL }, + { + /* Not for general use */ + {"system_is_read_only", PGC_INTERNAL, WAL, + 
gettext_noop("Shows whether the system is read only."), + NULL, + GUC_NO_RESET_ALL | GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE + }, + &system_is_read_only, + false, + NULL, NULL, show_system_is_read_only + }, + /* End-of-list marker */ { {NULL, 0, 0, NULL, NULL}, NULL, false, NULL, NULL, NULL @@ -12218,4 +12232,16 @@ check_default_with_oids(bool *newval, void **extra, GucSource source) return true; } +/* + * NB: The return string should be the same as the _ShowOption() for boolean + * type. + */ +static const char * +show_system_is_read_only(void) +{ + if (!XLogInsertAllowed()) + return "on"; + return "off"; +} + #include "guc-file.c" diff --git a/src/bin/pg_controldata/pg_controldata.c b/src/bin/pg_controldata/pg_controldata.c index 3e00ac0f701..922cd9641d8 100644 --- a/src/bin/pg_controldata/pg_controldata.c +++ b/src/bin/pg_controldata/pg_controldata.c @@ -290,6 +290,8 @@ main(int argc, char *argv[]) (uint32) ControlFile->backupEndPoint); printf(_("End-of-backup record required: %s\n"), ControlFile->backupEndRequired ? _("yes") : _("no")); + printf(_("WAL write prohibited: %s\n"), + ControlFile->wal_prohibited ? 
_("yes") : _("no")); printf(_("wal_level setting: %s\n"), wal_level_str(ControlFile->wal_level)); printf(_("wal_log_hints setting: %s\n"), diff --git a/src/include/access/walprohibit.h b/src/include/access/walprohibit.h new file mode 100644 index 00000000000..1fe7dde0504 --- /dev/null +++ b/src/include/access/walprohibit.h @@ -0,0 +1,104 @@ +/* + * walprohibit.h + * + * PostgreSQL write-ahead log prohibit states + * + * Portions Copyright (c) 2020, PostgreSQL Global Development Group + * + * src/include/access/walprohibit.h + */ +#ifndef WALPROHIBIT_H +#define WALPROHIBIT_H + +#include "access/xact.h" +#include "access/xlog.h" +#include "miscadmin.h" +#include "nodes/parsenodes.h" + +extern bool ProcessBarrierWALProhibit(void); +extern void MarkCheckPointSkippedInWalProhibitState(void); +extern void WALProhibitStateCounterInit(bool wal_prohibited); +extern void WALProhibitStateShmemInit(void); +extern bool IsWALProhibited(void); +extern void ProcessWALProhibitStateChangeRequest(void); + +/* + * WAL Prohibit States. + * + * There are four possible states. A brand new database cluster is always + * initially WALPROHIBIT_STATE_READ_WRITE. If the user tries to make it read + * only, then we enter the state WALPROHIBIT_STATE_GOING_READ_ONLY. When the + * transition is complete, we enter the state WALPROHIBIT_STATE_READ_ONLY. If + * the user subsequently tries to make it read write, we will enter the state + * WALPROHIBIT_STATE_GOING_READ_WRITE. When that transition is complete, we + * will enter the state WALPROHIBIT_STATE_READ_WRITE. These four state + * transitions are the only ones possible; for example, if we're currently in + * state WALPROHIBIT_STATE_GOING_READ_ONLY, an attempt to go read-write will + * produce an error, and a second attempt to go read-only will not cause a state + * change. Thus, we can represent the state as a shared-memory counter whose + * value only ever changes by adding 1. 
The initial value at postmaster startup + * is either 0 or 2, depending on whether the control file specifies the system + * is starting read-write or read-only. + */ +typedef enum +{ + WALPROHIBIT_STATE_READ_WRITE = 0, /* WAL permitted */ + WALPROHIBIT_STATE_GOING_READ_ONLY = 1, + WALPROHIBIT_STATE_READ_ONLY = 2, /* WAL prohibited */ + WALPROHIBIT_STATE_GOING_READ_WRITE = 3 +} WALProhibitState; + +static inline WALProhibitState +GetWALProhibitState(uint32 wal_prohibit_counter) +{ + /* Extract last two bits */ + return (WALProhibitState) (wal_prohibit_counter & 3); +} + +/* Never reached when WAL is prohibited. */ +static inline void +AssertWALPermitted(void) +{ + /* + * Recovery in the startup process is never in wal prohibited state. + */ + Assert(InRecovery || XLogInsertAllowed()); + +#ifdef USE_ASSERT_CHECKING + walpermit_checked_state = WALPERMIT_CHECKED; +#endif +} + +/* + * XID-bearing transactions are killed off by pg_prohibit_wal(true), so any + * part of the code that can only be reached with an XID assigned is never + * reached when WAL is prohibited. + */ +static inline void +AssertWALPermittedHaveXID(void) +{ + /* Must be performing an INSERT, UPDATE or DELETE, so we'll have an XID */ + Assert(FullTransactionIdIsValid(GetTopFullTransactionIdIfAny())); + AssertWALPermitted(); +} + +/* + * In contrast to the above assertion, if a transaction doesn't have a valid XID + * (e.g. VACUUM) then it won't be killed while changing the system state to WAL + * prohibited. Therefore, we need to explicitly error out before entering into + * the critical section.
+ */ +static inline void +CheckWALPermitted(void) +{ + if (!XLogInsertAllowed()) + ereport(ERROR, + (errcode(ERRCODE_READ_ONLY_SQL_TRANSACTION), + errmsg("system is now read only"))); + +#ifdef USE_ASSERT_CHECKING + walpermit_checked_state = WALPERMIT_CHECKED; +#endif +} + +#endif /* WALPROHIBIT_H */ diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 75ec1073bd0..7bff0adc2cd 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -306,6 +306,7 @@ extern RecoveryState GetRecoveryState(void); extern bool HotStandbyActive(void); extern bool HotStandbyActiveInReplay(void); extern bool XLogInsertAllowed(void); +extern void ResetLocalXLogInsertAllowed(void); extern void GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream); extern XLogRecPtr GetXLogReplayRecPtr(TimeLineID *replayTLI); extern XLogRecPtr GetXLogInsertRecPtr(void); @@ -314,6 +315,7 @@ extern bool RecoveryIsPaused(void); extern void SetRecoveryPause(bool recoveryPause); extern TimestampTz GetLatestXTime(void); extern TimestampTz GetCurrentChunkReplayStartTime(void); +extern bool StartupCrashRecoveryIsPending(void); extern void UpdateControlFile(void); extern uint64 GetSystemIdentifier(void); @@ -325,6 +327,8 @@ extern void XLOGShmemInit(void); extern void BootStrapXLOG(void); extern void LocalProcessControlFile(bool reset); extern void StartupXLOG(void); +extern void PerformPendingStartupOperations(void); +extern void SetControlFileWALProhibitFlag(bool wal_prohibited); extern void ShutdownXLOG(int code, Datum arg); extern void InitXLOGAccess(void); extern void CreateCheckPoint(int flags); diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h index e3f48158ce7..f6a1f3b9826 100644 --- a/src/include/catalog/pg_control.h +++ b/src/include/catalog/pg_control.h @@ -182,6 +182,9 @@ typedef struct ControlFileData int max_locks_per_xact; bool track_commit_timestamp; + /* WAL prohibited determines if the WAL insert is allowed or not. 
*/ + bool wal_prohibited; + /* * This data is used to check for hardware-architecture compatibility of * the database and the backend executable. We need not check endianness diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 1487710d590..62b8ac41702 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -11379,6 +11379,10 @@ proname => 'pg_partition_root', prorettype => 'regclass', proargtypes => 'regclass', prosrc => 'pg_partition_root' }, +{ oid => '4543', descr => 'permit or prohibit wal writes', + proname => 'pg_prohibit_wal', prorettype => 'void', + proargtypes => 'bool', prosrc => 'pg_prohibit_wal' }, + { oid => '4350', descr => 'Unicode normalization', proname => 'normalize', prorettype => 'text', proargtypes => 'text text', prosrc => 'unicode_normalize_func' }, diff --git a/src/include/pgstat.h b/src/include/pgstat.h index 724068cf87e..8f4fc4f1e15 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -1067,6 +1067,8 @@ typedef enum WAIT_EVENT_REPLICATION_SLOT_RESTORE_SYNC, WAIT_EVENT_REPLICATION_SLOT_SYNC, WAIT_EVENT_REPLICATION_SLOT_WRITE, + WAIT_EVENT_WALPROHIBIT_STATE, + WAIT_EVENT_WALPROHIBIT_STATE_CHANGE, WAIT_EVENT_SLRU_FLUSH_SYNC, WAIT_EVENT_SLRU_READ, WAIT_EVENT_SLRU_SYNC, diff --git a/src/include/postmaster/bgwriter.h b/src/include/postmaster/bgwriter.h index c430b1b2366..bee495f05da 100644 --- a/src/include/postmaster/bgwriter.h +++ b/src/include/postmaster/bgwriter.h @@ -42,4 +42,6 @@ extern void CheckpointerShmemInit(void); extern bool FirstCallSinceLastCheckpoint(void); +extern bool SendSignalToCheckpointer(int signum); + #endif /* _BGWRITER_H */ diff --git a/src/include/storage/procsignal.h b/src/include/storage/procsignal.h index 4ae7dc33b8e..9e834247871 100644 --- a/src/include/storage/procsignal.h +++ b/src/include/storage/procsignal.h @@ -48,12 +48,7 @@ typedef enum typedef enum { - /* - * XXX. 
PROCSIGNAL_BARRIER_PLACEHOLDER should be replaced when the first - * real user of the ProcSignalBarrier mechanism is added. It's just here - * for now because we can't have an empty enum. - */ - PROCSIGNAL_BARRIER_PLACEHOLDER = 0 + PROCSIGNAL_BARRIER_WALPROHIBIT = 0 } ProcSignalBarrierType; /* diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index bab4f3adb3b..cd89ff06790 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -2698,6 +2698,7 @@ WALAvailability WALInsertLock WALInsertLockPadded WALOpenSegment +WALProhibitStateData WALReadError WALSegmentCloseCB WALSegmentContext -- 2.18.0