From 609b02f6f439e631ba5f3dffe0a574fc36d00ec6 Mon Sep 17 00:00:00 2001 From: Daniel Gustafsson Date: Tue, 1 Sep 2020 15:45:38 +0200 Subject: [PATCH] Support checksum enable/disable in running cluster v20 This allows data checksums to be enabled, or disabled, in a running cluster without restricting access to the cluster during processing. A new value "inprogress" is added for data_checksums during which writes will set the checksum but reads wont enforce it. When all pages have been checksummed the value will change to "on" which will enforce the checksums on read. At this point, the cluster has the same state as if checksums were enabled via initdb. Checksums are being added via a background worker ChecksumHelper which will process all pages in all databases. Pages accessed via concurrent write operations will be checksummed with the normal process. Daniel Gustafsson, Magnus Hagander --- doc/src/sgml/catalogs.sgml | 11 + doc/src/sgml/func.sgml | 68 + doc/src/sgml/ref/initdb.sgml | 1 + doc/src/sgml/wal.sgml | 97 ++ src/backend/access/rmgrdesc/xlogdesc.c | 16 + src/backend/access/transam/xlog.c | 173 ++- src/backend/access/transam/xlogfuncs.c | 86 ++ src/backend/catalog/heap.c | 1 + src/backend/catalog/system_views.sql | 5 + src/backend/postmaster/Makefile | 1 + src/backend/postmaster/bgworker.c | 10 + src/backend/postmaster/datachecksumsworker.c | 1266 ++++++++++++++++++ src/backend/postmaster/pgstat.c | 6 + src/backend/replication/basebackup.c | 2 +- src/backend/replication/logical/decode.c | 1 + src/backend/storage/ipc/ipci.c | 2 + src/backend/storage/ipc/procsignal.c | 37 +- src/backend/storage/lmgr/lwlocknames.txt | 1 + src/backend/storage/page/README | 4 +- src/backend/storage/page/bufpage.c | 6 +- src/backend/utils/adt/pgstatfuncs.c | 4 +- src/backend/utils/cache/relcache.c | 4 + src/backend/utils/init/miscinit.c | 6 + src/backend/utils/init/postinit.c | 5 + src/backend/utils/misc/guc.c | 36 +- src/bin/pg_checksums/pg_checksums.c | 2 +- 
src/bin/pg_upgrade/controldata.c | 9 + src/bin/pg_upgrade/pg_upgrade.h | 2 +- src/include/access/xlog.h | 15 +- src/include/access/xlog_internal.h | 7 + src/include/catalog/pg_class.h | 3 + src/include/catalog/pg_control.h | 1 + src/include/catalog/pg_proc.dat | 16 + src/include/miscadmin.h | 2 + src/include/pgstat.h | 2 + src/include/postmaster/datachecksumsworker.h | 42 + src/include/storage/bufpage.h | 2 + src/include/storage/checksum.h | 8 + src/include/storage/procsignal.h | 9 +- src/test/Makefile | 3 +- src/test/checksum/.gitignore | 2 + src/test/checksum/Makefile | 23 + src/test/checksum/README | 22 + src/test/checksum/t/001_basic.pl | 86 ++ src/test/checksum/t/002_restarts.pl | 97 ++ src/test/checksum/t/003_standby_checksum.pl | 96 ++ 46 files changed, 2250 insertions(+), 48 deletions(-) create mode 100644 src/backend/postmaster/datachecksumsworker.c create mode 100644 src/include/postmaster/datachecksumsworker.h create mode 100644 src/test/checksum/.gitignore create mode 100644 src/test/checksum/Makefile create mode 100644 src/test/checksum/README create mode 100644 src/test/checksum/t/001_basic.pl create mode 100644 src/test/checksum/t/002_restarts.pl create mode 100644 src/test/checksum/t/003_standby_checksum.pl diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml index 1d1b8ce8fb..b02cb1ae9f 100644 --- a/doc/src/sgml/catalogs.sgml +++ b/doc/src/sgml/catalogs.sgml @@ -2162,6 +2162,17 @@ SCRAM-SHA-256$<iteration count>:&l + + + relhaschecksums bool + + + True if relation has data checksums on all pages. This state is only + used during checksum processing; this field should never be consulted + for cluster checksum status. 
+ + + relrewrite oid diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index 2efd80baa4..71d1a90fe1 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -25120,6 +25120,74 @@ postgres=# SELECT * FROM pg_walfile_name_offset(pg_stop_backup()); + + Data Checksum Functions + + + The functions shown in can + be used to enable or disable data checksums in a running cluster. + See for details. + + + + Data Checksum Functions + + + + + Function + + + Description + + + + + + + + + pg_enable_data_checksums + + pg_enable_data_checksums ( cost_delay int, cost_limit int ) + boolean + + + Initiates data checksums for the cluster. This will switch the data + checksums mode to inprogress as well as start a + background worker that will process all data in the database and enable + checksums for it. When all data pages have had checksums enabled, the + cluster will automatically switch data checksums mode to + on. Returns true if processing + was started. + + + If cost_delay and cost_limit are + specified, the speed of the process is throttled using the same principles as + Cost-based Vacuum Delay. + + + + + + + pg_disable_data_checksums + + pg_disable_data_checksums () + boolean + + + Disables data checksums for the cluster with immediate effect. + Returns false in case data checksums are disabled + already. + + + + +
+ +
+ Database Object Management Functions diff --git a/doc/src/sgml/ref/initdb.sgml b/doc/src/sgml/ref/initdb.sgml index 385ac25150..e3b0048806 100644 --- a/doc/src/sgml/ref/initdb.sgml +++ b/doc/src/sgml/ref/initdb.sgml @@ -219,6 +219,7 @@ PostgreSQL documentation failures will be reported in the pg_stat_database view. + See for details.
diff --git a/doc/src/sgml/wal.sgml b/doc/src/sgml/wal.sgml index d1c3893b14..f2de982626 100644 --- a/doc/src/sgml/wal.sgml +++ b/doc/src/sgml/wal.sgml @@ -230,6 +230,103 @@ + + Data Checksums + + checksums + + + + Data pages are not checksum protected by default, but this can optionally be + enabled for a cluster. When enabled, each data page will be assigned a + checksum that is updated when the page is written and verified every time + the page is read. Only data pages are protected by checksums, internal data + structures and temporary files are not. + + + + Checksums are normally enabled when the cluster is initialized using initdb. + They can also be enabled or disabled at a later time, either as an offline + operation or in a running cluster. In all cases, checksums are enabled or + disabled at the full cluster level, and cannot be specified individually for + databases or tables. + + + + The current state of checksums in the cluster can be verified by viewing the + value of the read-only configuration variable by issuing the command SHOW + data_checksums. + + + + When attempting to recover from corrupt data it may be necessary to bypass + the checksum protection in order to recover data. To do this, temporarily + set the configuration parameter . + + + + On-line Enabling of Checksums + + + Checksums can be enabled or disabled online, by calling the appropriate + functions. + Disabling of checksums takes effect immediately when the function is called. + + + + Enabling checksums will put the cluster checksum mode in + inprogress mode. During this time, checksums will be + written but not verified. In addition to this, a background worker process + is started that enables checksums on all existing data in the cluster. Once + this worker has completed processing all databases in the cluster, the + checksum mode will automatically switch to on. 
The + processing will consume a background worker process; make sure that + max_worker_processes allows for at least one + additional process. + + + + The process will initially wait for all open transactions to finish before + it starts, so that it can be certain that there are no tables that have been + created inside a transaction that has not committed yet and thus would not + be visible to the process enabling checksums. It will also, for each database, + wait for all pre-existing temporary tables to get removed before it finishes. + If long-lived temporary tables are used in the application it may be necessary + to terminate these application connections to allow the process to complete. + + + + If the cluster is stopped while in inprogress mode, for + any reason, then this process must be restarted manually. To do this, + re-execute the function pg_enable_data_checksums() + once the cluster has been restarted. The background worker will attempt + to resume the work from where it was interrupted. + + + + + Enabling checksums can cause significant I/O to the system, as most of the + database pages will need to be rewritten, and will be written both to the + data files and the WAL. + + + + + + + Off-line Enabling of Checksums + + + The pg_checksums + application can be used to enable or disable data checksums, as well as + verify checksums, on an offline cluster. 
+ + + + + Write-Ahead Logging (<acronym>WAL</acronym>) diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c index 3200f777f5..fcfad06900 100644 --- a/src/backend/access/rmgrdesc/xlogdesc.c +++ b/src/backend/access/rmgrdesc/xlogdesc.c @@ -18,6 +18,7 @@ #include "access/xlog.h" #include "access/xlog_internal.h" #include "catalog/pg_control.h" +#include "storage/bufpage.h" #include "utils/guc.h" #include "utils/timestamp.h" @@ -140,6 +141,18 @@ xlog_desc(StringInfo buf, XLogReaderState *record) xlrec.ThisTimeLineID, xlrec.PrevTimeLineID, timestamptz_to_str(xlrec.end_time)); } + else if (info == XLOG_CHECKSUMS) + { + xl_checksum_state xlrec; + + memcpy(&xlrec, rec, sizeof(xl_checksum_state)); + if (xlrec.new_checksumtype == PG_DATA_CHECKSUM_VERSION) + appendStringInfo(buf, "on"); + else if (xlrec.new_checksumtype == PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION) + appendStringInfo(buf, "inprogress"); + else + appendStringInfo(buf, "off"); + } } const char * @@ -185,6 +198,9 @@ xlog_identify(uint8 info) case XLOG_FPI_FOR_HINT: id = "FPI_FOR_HINT"; break; + case XLOG_CHECKSUMS: + id = "CHECKSUMS"; + break; } return id; diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 09c01ed4ae..b8e5e55c0b 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -38,6 +38,7 @@ #include "access/xlogreader.h" #include "access/xlogutils.h" #include "catalog/catversion.h" +#include "catalog/pg_class.h" #include "catalog/pg_control.h" #include "catalog/pg_database.h" #include "commands/progress.h" @@ -251,6 +252,11 @@ static bool LocalPromoteIsTriggered = false; */ static int LocalXLogInsertAllowed = -1; +/* + * Local state for Controlfile data_checksum_version + */ +static uint32 LocalDataChecksumVersion = 0; + /* * When ArchiveRecoveryRequested is set, archive recovery was requested, * ie. signal files were present. 
When InArchiveRecovery is set, we are @@ -892,6 +898,7 @@ static void SetLatestXTime(TimestampTz xtime); static void SetCurrentChunkStartTime(TimestampTz xtime); static void CheckRequiredParameterValues(void); static void XLogReportParameters(void); +static void XlogChecksums(ChecksumType new_type); static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI); static void LocalSetXLogInsertAllowed(void); @@ -1077,7 +1084,7 @@ XLogInsertRecord(XLogRecData *rdata, Assert(RedoRecPtr < Insert->RedoRecPtr); RedoRecPtr = Insert->RedoRecPtr; } - doPageWrites = (Insert->fullPageWrites || Insert->forcePageWrites); + doPageWrites = (Insert->fullPageWrites || Insert->forcePageWrites || DataChecksumsOnInProgress()); if (doPageWrites && (!prevDoPageWrites || @@ -4888,9 +4895,7 @@ ReadControlFile(void) CalculateCheckpointSegments(); - /* Make the initdb settings visible as GUC variables, too */ - SetConfigOption("data_checksums", DataChecksumsEnabled() ? "yes" : "no", - PGC_INTERNAL, PGC_S_OVERRIDE); + LocalDataChecksumVersion = ControlFile->data_checksum_version; } /* @@ -4927,10 +4932,116 @@ GetMockAuthenticationNonce(void) * Are checksums enabled for data pages? */ bool -DataChecksumsEnabled(void) +DataChecksumsNeedWrite(void) +{ + return (LocalDataChecksumVersion == PG_DATA_CHECKSUM_VERSION || + LocalDataChecksumVersion == PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION); +} + +bool +DataChecksumsNeedVerify(void) +{ + /* + * Only verify checksums if they are fully enabled in the cluster. In + * "inprogress" state they are only updated, not verified. 
+ */ + return (LocalDataChecksumVersion == PG_DATA_CHECKSUM_VERSION); +} + +bool +DataChecksumsOnInProgress(void) +{ + return (LocalDataChecksumVersion == PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION); +} + +void +SetDataChecksumsOnInProgress(void) +{ + Assert(ControlFile != NULL); + + if (LocalDataChecksumVersion > 0) + return; + + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + ControlFile->data_checksum_version = PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION; + UpdateControlFile(); + LWLockRelease(ControlFileLock); + WaitForProcSignalBarrier(EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_ON)); + + XlogChecksums(PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION); +} + +void +AbsorbChecksumsOnInProgressBarrier(void) +{ + Assert(LocalDataChecksumVersion == 0 || LocalDataChecksumVersion == PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION); + LocalDataChecksumVersion = PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION; +} + +void +SetDataChecksumsOn(void) { Assert(ControlFile != NULL); - return (ControlFile->data_checksum_version > 0); + + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + + if (ControlFile->data_checksum_version != PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION) + { + LWLockRelease(ControlFileLock); + elog(ERROR, "checksums not in \"inprogress\" mode"); + } + + ControlFile->data_checksum_version = PG_DATA_CHECKSUM_VERSION; + UpdateControlFile(); + LWLockRelease(ControlFileLock); + WaitForProcSignalBarrier(EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_ON)); + + XlogChecksums(PG_DATA_CHECKSUM_VERSION); +} + +void +AbsorbChecksumsOnBarrier(void) +{ + Assert(LocalDataChecksumVersion == PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION); + LocalDataChecksumVersion = PG_DATA_CHECKSUM_VERSION; +} + +void +SetDataChecksumsOff(void) +{ + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + + ControlFile->data_checksum_version = 0; + XlogChecksums(0); + UpdateControlFile(); + LWLockRelease(ControlFileLock); + WaitForProcSignalBarrier(EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_OFF)); +} + +void 
+AbsorbChecksumsOffBarrier(void) +{ + LocalDataChecksumVersion = 0; +} + +void +InitLocalControldata(void) +{ + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + LocalDataChecksumVersion = ControlFile->data_checksum_version; + LWLockRelease(ControlFileLock); +} + +/* guc hook */ +const char * +show_data_checksums(void) +{ + if (LocalDataChecksumVersion == PG_DATA_CHECKSUM_VERSION) + return "on"; + else if (LocalDataChecksumVersion == PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION) + return "inprogress"; + else + return "off"; } /* @@ -7916,6 +8027,18 @@ StartupXLOG(void) */ CompleteCommitTsInitialization(); + /* + * If we reach this point with checksums in progress state (either being + * enabled or being disabled), we notify the user that they need to + * manually restart the process to enable checksums. This is because we + * cannot launch a dynamic background worker directly from here, it has to + * be launched from a regular backend. + */ + if (ControlFile->data_checksum_version == PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION) + ereport(WARNING, + (errmsg("data checksums are being enabled, but no worker is running"), + errhint("Either disable or enable data checksums by calling the pg_disable_data_checksums() or pg_enable_data_checksums() functions."))); + /* * All done with end-of-recovery actions. * @@ -9773,6 +9896,24 @@ XLogReportParameters(void) } } +/* + * Log the new state of checksums + */ +static void +XlogChecksums(ChecksumType new_type) +{ + xl_checksum_state xlrec; + XLogRecPtr recptr; + + xlrec.new_checksumtype = new_type; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(xl_checksum_state)); + + recptr = XLogInsert(RM_XLOG_ID, XLOG_CHECKSUMS); + XLogFlush(recptr); +} + /* * Update full_page_writes in shared memory, and write an * XLOG_FPW_CHANGE record if necessary. 
@@ -10228,6 +10369,26 @@ xlog_redo(XLogReaderState *record) /* Keep track of full_page_writes */ lastFullPageWrites = fpw; } + else if (info == XLOG_CHECKSUMS) + { + xl_checksum_state state; + + memcpy(&state, XLogRecGetData(record), sizeof(xl_checksum_state)); + + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + ControlFile->data_checksum_version = state.new_checksumtype; + UpdateControlFile(); + LWLockRelease(ControlFileLock); + if (state.new_checksumtype == PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION) + WaitForProcSignalBarrier(EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_ON)); + else if (state.new_checksumtype == PG_DATA_CHECKSUM_VERSION) + WaitForProcSignalBarrier(EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_ON)); + else + { + Assert(state.new_checksumtype == 0); + WaitForProcSignalBarrier(EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_OFF)); + } + } } #ifdef WAL_DEBUG diff --git a/src/backend/access/transam/xlogfuncs.c b/src/backend/access/transam/xlogfuncs.c index 290658b22c..dc88c6409e 100644 --- a/src/backend/access/transam/xlogfuncs.c +++ b/src/backend/access/transam/xlogfuncs.c @@ -25,6 +25,7 @@ #include "catalog/pg_type.h" #include "funcapi.h" #include "miscadmin.h" +#include "postmaster/datachecksumsworker.h" #include "pgstat.h" #include "replication/walreceiver.h" #include "storage/fd.h" @@ -784,3 +785,88 @@ pg_promote(PG_FUNCTION_ARGS) (errmsg("server did not promote within %d seconds", wait_seconds))); PG_RETURN_BOOL(false); } + +/* + * Disables checksums for the cluster, unless already disabled. + * + * Has immediate effect - the checksums are set to off right away. + */ +Datum +disable_data_checksums(PG_FUNCTION_ARGS) +{ + /* + * If we don't need to write new checksums, then clearly they are already + * disabled. 
+ */ + if (!DataChecksumsNeedWrite()) + PG_RETURN_BOOL(false); + + /* + * Shutting down a concurrently running datachecksumsworker will not block + * until the worker shuts down and exits, but we can continue turning off + * checksums anyway since it will at most finish the block it had already + * started and then abort. + */ + ShutdownDatachecksumsWorkerIfRunning(); + + SetDataChecksumsOff(); + + PG_RETURN_BOOL(true); +} + +/* + * Enables checksums for the cluster, unless already enabled. + * + * Supports vacuum-like cost-based throttling, to limit system load. + * Starts a background worker that updates checksums on existing data. + */ +Datum +enable_data_checksums(PG_FUNCTION_ARGS) +{ + int cost_delay = PG_GETARG_INT32(0); + int cost_limit = PG_GETARG_INT32(1); + + if (cost_delay < 0) + ereport(ERROR, + (errmsg("cost delay cannot be less than zero"))); + + if (cost_limit <= 0) + ereport(ERROR, + (errmsg("cost limit must be a positive value"))); + + if (DataChecksumsWorkerStarted()) + PG_RETURN_BOOL(false); + + /* + * Data checksums on -> on is not a valid state transition as there is + * nothing to do. + */ + if (DataChecksumsNeedVerify()) + PG_RETURN_BOOL(false); + + /* + * If the state is set to "inprogress" but the worker isn't running, then + * the data checksumming was prematurely terminated. Attempt to continue + * processing data pages where we left off based on state stored in the + * catalog. + */ + if (DataChecksumsOnInProgress()) + { + ereport(NOTICE, + (errmsg("data checksums partly enabled, continuing processing"))); + + StartDatachecksumsWorkerLauncher(ENABLE_CHECKSUMS, cost_delay, cost_limit); + } + + /* + * We are starting a checksumming process from scratch, and need to start + * by clearing the state in pg_class in case checksums have ever been + * enabled before (either fully or partly). As soon as we set the checksum + * state to "inprogress", new relations will set relhaschecksums in + * pg_class so it must be done first. 
+ */ + else + StartDatachecksumsWorkerLauncher(RESET_STATE, cost_delay, cost_limit); + + PG_RETURN_BOOL(true); +} diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c index abd5bdb866..bd94d8cfc3 100644 --- a/src/backend/catalog/heap.c +++ b/src/backend/catalog/heap.c @@ -956,6 +956,7 @@ InsertPgClassTuple(Relation pg_class_desc, values[Anum_pg_class_relispopulated - 1] = BoolGetDatum(rd_rel->relispopulated); values[Anum_pg_class_relreplident - 1] = CharGetDatum(rd_rel->relreplident); values[Anum_pg_class_relispartition - 1] = BoolGetDatum(rd_rel->relispartition); + values[Anum_pg_class_relhaschecksums - 1] = BoolGetDatum(DataChecksumsNeedWrite()); values[Anum_pg_class_relrewrite - 1] = ObjectIdGetDatum(rd_rel->relrewrite); values[Anum_pg_class_relfrozenxid - 1] = TransactionIdGetDatum(rd_rel->relfrozenxid); values[Anum_pg_class_relminmxid - 1] = MultiXactIdGetDatum(rd_rel->relminmxid); diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index a2d61302f9..171ab83aeb 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -1225,6 +1225,11 @@ CREATE OR REPLACE FUNCTION RETURNS boolean STRICT VOLATILE LANGUAGE INTERNAL AS 'pg_promote' PARALLEL SAFE; +CREATE OR REPLACE FUNCTION pg_enable_data_checksums ( + cost_delay int DEFAULT 0, cost_limit int DEFAULT 100) + RETURNS boolean STRICT VOLATILE LANGUAGE internal AS 'enable_data_checksums' + PARALLEL RESTRICTED; + -- legacy definition for compatibility with 9.3 CREATE OR REPLACE FUNCTION json_populate_record(base anyelement, from_json json, use_json_as_text boolean DEFAULT false) diff --git a/src/backend/postmaster/Makefile b/src/backend/postmaster/Makefile index bfdf6a833d..59b82ee9ce 100644 --- a/src/backend/postmaster/Makefile +++ b/src/backend/postmaster/Makefile @@ -17,6 +17,7 @@ OBJS = \ bgworker.o \ bgwriter.o \ checkpointer.o \ + datachecksumsworker.o \ fork_process.o \ interrupt.o \ pgarch.o \ diff --git 
a/src/backend/postmaster/bgworker.c b/src/backend/postmaster/bgworker.c index beb5e85434..cf4a33eebe 100644 --- a/src/backend/postmaster/bgworker.c +++ b/src/backend/postmaster/bgworker.c @@ -18,6 +18,7 @@ #include "pgstat.h" #include "port/atomics.h" #include "postmaster/bgworker_internals.h" +#include "postmaster/datachecksumsworker.h" #include "postmaster/interrupt.h" #include "postmaster/postmaster.h" #include "replication/logicallauncher.h" @@ -128,6 +129,15 @@ static const struct }, { "ApplyWorkerMain", ApplyWorkerMain + }, + { + "DatachecksumsWorkerLauncherMain", DatachecksumsWorkerLauncherMain + }, + { + "DatachecksumsWorkerMain", DatachecksumsWorkerMain + }, + { + "ResetDataChecksumsStateInDatabase", ResetDataChecksumsStateInDatabase } }; diff --git a/src/backend/postmaster/datachecksumsworker.c b/src/backend/postmaster/datachecksumsworker.c new file mode 100644 index 0000000000..73ce6c8b39 --- /dev/null +++ b/src/backend/postmaster/datachecksumsworker.c @@ -0,0 +1,1266 @@ +/*------------------------------------------------------------------------- + * + * datachecksumsworker.c + * Background worker for enabling or disabling data checksums online + * + * When enabling data checksums on a database at initdb time or with + * pg_checksums, no extra process is required as each page is checksummed, and + * verified, when accessed. When enabling checksums on an already running + * cluster, which does not run with checksums enabled, this worker will ensure + * that all pages are checksummed before verification of the checksums is + * turned on. In the case of disabling checksums, the state transition is + * recorded in the catalog and control file, and no changes are performed + * on the data pages or in the catalog. + * + * Checksums can be either enabled or disabled cluster-wide, with on/off being + * the end state for data_checkums. 
+ * + * Enabling checksums + * ------------------ + * When enabling checksums in an online cluster, data_checksums will be set to + * "inprogress" which signals that write operations MUST compute and write + * the checksum on the data page, but during reading the checksum SHALL NOT be + * verified. This ensures that all objects created during checksumming will + * have checksums set, but no reads will fail due to incorrect checksum. The + * DataChecksumsWorker will compile a list of databases which exist at the + * start of checksumming, and all of these which haven't been dropped during + * the processing MUST have been processed successfully in order for checksums + * to be enabled. Any new relation created during processing will see the + * in-progress state and will automatically be checksummed as well as have its + * state recorded in the catalog to avoid the datachecksumsworker having to + * process it when already checksummed. + * + * For each database, all relations which have storage are read and every data + * page is marked dirty to force a write with the checksum. This will generate + * a lot of WAL as the entire database is read and written. Once all datapages + * in a relation have been written, pg_class.relhaschecksums is set to true to + * indicate that the relation is done. + * + * If the processing is interrupted by a cluster restart, it will be restarted + * from where it left off given that pg_class.relhaschecksums tracks the state of + * processed relations and the in-progress state will ensure all new writes are + * performed with checksums. Each database will be reprocessed, but relations + * where pg_class.relhaschecksums is true are skipped. + * + * If checksums are enabled, then disabled, and then re-enabled, every + * relation's pg_class.relhaschecksums field will be reset to false before + * entering the in-progress mode. 
+ * + * + * Disabling checksums + * ------------------- + * Disabling checksums is an immediate operation as it only updates the control + * file and accompanying local state in the backends. No changes to + * pg_class.relhaschecksums are necessary as it only tracks state during + * enabling. + * + * + * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/postmaster/datachecksumsworker.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/genam.h" +#include "access/heapam.h" +#include "access/htup_details.h" +#include "access/xact.h" +#include "catalog/indexing.h" +#include "catalog/pg_class.h" +#include "catalog/pg_database.h" +#include "commands/vacuum.h" +#include "common/relpath.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "postmaster/bgworker.h" +#include "postmaster/bgwriter.h" +#include "postmaster/datachecksumsworker.h" +#include "storage/bufmgr.h" +#include "storage/checksum.h" +#include "storage/lmgr.h" +#include "storage/ipc.h" +#include "storage/procarray.h" +#include "storage/smgr.h" +#include "tcop/tcopprot.h" +#include "utils/fmgroids.h" +#include "utils/lsyscache.h" +#include "utils/ps_status.h" +#include "utils/syscache.h" + +#define DATACHECKSUMSWORKER_MAX_DB_RETRIES 5 + +typedef enum +{ + DATACHECKSUMSWORKER_SUCCESSFUL = 0, + DATACHECKSUMSWORKER_ABORTED, + DATACHECKSUMSWORKER_FAILED, + DATACHECKSUMSWORKER_RETRYDB, +} DatachecksumsWorkerResult; + +typedef struct DatachecksumsWorkerShmemStruct +{ + /* + * Access to launcher_started and abort must be protected by + * DatachecksumsWorkerLock. + */ + bool launcher_started; + bool abort; + + /* + * Variables for the worker to signal the launcher, or subsequent workers + * in other databases. 
As there is only a single worker, and the launcher + * won't read these until the worker exits, they can be accessed without + * the need for a lock. If multiple workers are supported then this will + * have to be revisited. + */ + DatachecksumsWorkerResult success; + bool process_shared_catalogs; + + /* + * The below members are set when the launcher starts, and are only + * accessed read-only by the single worker. Thus, we can access these + * without a lock. If multiple workers, or dynamic cost parameters, are + * supported at some point then this would need to be revisited. + */ + int cost_delay; + int cost_limit; + DataChecksumOperation operation; +} DatachecksumsWorkerShmemStruct; + +/* Shared memory segment for datachecksumsworker */ +static DatachecksumsWorkerShmemStruct * DatachecksumsWorkerShmem; + +/* Bookkeeping for work to do */ +typedef struct DatachecksumsWorkerDatabase +{ + Oid dboid; + char *dbname; +} DatachecksumsWorkerDatabase; + +typedef struct DatachecksumsWorkerRelation +{ + Oid reloid; + char relkind; +} DatachecksumsWorkerRelation; + +typedef struct DatachecksumsWorkerResultEntry +{ + Oid dboid; + DatachecksumsWorkerResult result; + int retries; +} DatachecksumsWorkerResultEntry; + + +/* Prototypes */ +static List *BuildDatabaseList(void); +static List *BuildRelationList(bool include_shared); +static List *BuildTempTableList(void); +static DatachecksumsWorkerResult ProcessDatabase(DatachecksumsWorkerDatabase * db); +static bool ProcessAllDatabases(bool already_connected); +static bool ProcessSingleRelationFork(Relation reln, ForkNumber forkNum, BufferAccessStrategy strategy); +static void launcher_cancel_handler(SIGNAL_ARGS); +static void SetRelHasChecksums(Oid relOid); +static void WaitForAllTransactionsToFinish(void); + +/* + * DataChecksumsWorkerStarted + * Informational function to query the state of the worker + */ +bool +DataChecksumsWorkerStarted(void) +{ + bool started; + + LWLockAcquire(DatachecksumsWorkerLock, LW_EXCLUSIVE); + 
started = DatachecksumsWorkerShmem->launcher_started && !DatachecksumsWorkerShmem->abort; + LWLockRelease(DatachecksumsWorkerLock); + + return started; +} + +/* + * StartDataChecksumsWorkerLauncher + * Main entry point for datachecksumsworker launcher process. + */ +void +StartDatachecksumsWorkerLauncher(DataChecksumOperation op, + int cost_delay, int cost_limit) +{ + BackgroundWorker bgw; + BackgroundWorkerHandle *bgw_handle; + + /* + * This can be hit during a short window during which the worker is + * shutting down. Once done the worker will clear the abort flag and + * re-processing can be performed. + */ + LWLockAcquire(DatachecksumsWorkerLock, LW_EXCLUSIVE); + if (DatachecksumsWorkerShmem->abort) + { + LWLockRelease(DatachecksumsWorkerLock); + ereport(ERROR, + (errmsg("data checksums worker has been aborted"))); + } + + if (DatachecksumsWorkerShmem->launcher_started) + { + /* Failed to set means somebody else started */ + LWLockRelease(DatachecksumsWorkerLock); + ereport(NOTICE, + (errmsg("data checksums worker is already running"))); + return; + } + + /* Whether to enable or disable data checksums */ + DatachecksumsWorkerShmem->operation = op; + + /* Backoff parameters to throttle the load during enabling */ + DatachecksumsWorkerShmem->cost_delay = cost_delay; + DatachecksumsWorkerShmem->cost_limit = cost_limit; + + memset(&bgw, 0, sizeof(bgw)); + bgw.bgw_flags = BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION; + bgw.bgw_start_time = BgWorkerStart_RecoveryFinished; + snprintf(bgw.bgw_library_name, BGW_MAXLEN, "postgres"); + snprintf(bgw.bgw_function_name, BGW_MAXLEN, "DatachecksumsWorkerLauncherMain"); + snprintf(bgw.bgw_name, BGW_MAXLEN, "datachecksumsworker launcher"); + snprintf(bgw.bgw_type, BGW_MAXLEN, "datachecksumsworker launcher"); + bgw.bgw_restart_time = BGW_NEVER_RESTART; + bgw.bgw_notify_pid = MyProcPid; + bgw.bgw_main_arg = (Datum) 0; + + DatachecksumsWorkerShmem->launcher_started = true; + LWLockRelease(DatachecksumsWorkerLock); + 
+ if (!RegisterDynamicBackgroundWorker(&bgw, &bgw_handle)) + { + LWLockAcquire(DatachecksumsWorkerLock, LW_EXCLUSIVE); + DatachecksumsWorkerShmem->launcher_started = false; + LWLockRelease(DatachecksumsWorkerLock); + ereport(ERROR, + (errmsg("failed to start background worker to process data checksums"))); + } +} + +/* + * ShutdownDatachecksumsWorkerIfRunning + * Request shutdown of the datachecksumsworker + * + * This does not turn off processing immediately, it signals the checksum + * process to end when done with the current block. + */ +void +ShutdownDatachecksumsWorkerIfRunning(void) +{ + LWLockAcquire(DatachecksumsWorkerLock, LW_EXCLUSIVE); + + /* If the launcher isn't started, there is nothing to shut down */ + if (DatachecksumsWorkerShmem->launcher_started) + DatachecksumsWorkerShmem->abort = true; + + LWLockRelease(DatachecksumsWorkerLock); +} + +/* + * ProcessSingleRelationFork + * Enable data checksums in a single relation/fork. + * + * Returns true if successful, and false if *aborted*. On error, an actual + * error is raised in the lower levels. + */ +static bool +ProcessSingleRelationFork(Relation reln, ForkNumber forkNum, BufferAccessStrategy strategy) +{ + BlockNumber numblocks = RelationGetNumberOfBlocksInFork(reln, forkNum); + BlockNumber blknum; + char activity[NAMEDATALEN * 2 + 128]; + + /* + * We are looping over the blocks which existed at the time of process + * start, which is safe since new blocks are created with checksums set + * already due to the state being "inprogress". + */ + for (blknum = 0; blknum < numblocks; blknum++) + { + Buffer buf = ReadBufferExtended(reln, forkNum, blknum, RBM_NORMAL, strategy); + + /* + * Report to pgstat every 100 blocks to keep from overwhelming the + * activity reporting with close to identical reports. 
+ */ + if ((blknum % 100) == 0) + { + snprintf(activity, sizeof(activity) - 1, "processing: %s.%s (%s block %d/%d)", + get_namespace_name(RelationGetNamespace(reln)), RelationGetRelationName(reln), + forkNames[forkNum], blknum, numblocks); + pgstat_report_activity(STATE_RUNNING, activity); + } + + /* Need to get an exclusive lock before we can flag as dirty */ + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + + /* + * Mark the buffer as dirty and force a full page write. We have to + * re-write the page to WAL even if the checksum hasn't changed, + * because if there is a replica it might have a slightly different + * version of the page with an invalid checksum, caused by unlogged + * changes (e.g. hintbits) on the master happening while checksums + * were off. This can happen if there was a valid checksum on the page + * at one point in the past, so only when checksums are first on, then + * off, and then turned on again. Iff wal_level is set to "minimal", + * this could be avoided iff the checksum is calculated to be correct. + */ + START_CRIT_SECTION(); + MarkBufferDirty(buf); + log_newpage_buffer(buf, false); + END_CRIT_SECTION(); + + UnlockReleaseBuffer(buf); + + /* + * This is the only place where we check if we are asked to abort, the + * abort will bubble up from here. It's safe to check this without + * a lock, because if we miss it being set, we will try again soon. + */ + if (DatachecksumsWorkerShmem->abort) + return false; + + vacuum_delay_point(); + } + + return true; +} + +/* + * ProcessSingleRelationByOid + * Process a single relation based on oid. + * + * Returns true if successful, and false if *aborted*. On error, an actual + * error is raised in the lower levels. 
+ */ +static bool +ProcessSingleRelationByOid(Oid relationId, BufferAccessStrategy strategy) +{ + Relation rel; + ForkNumber fnum; + bool aborted = false; + + StartTransactionCommand(); + + elog(DEBUG2, + "adding data checksums to relation with OID %u", + relationId); + + rel = try_relation_open(relationId, AccessShareLock); + if (rel == NULL) + { + /* + * Relation no longer exists. We don't consider this an error since + * there are no pages in it that need checksums, and thus return true. + */ + CommitTransactionCommand(); + pgstat_report_activity(STATE_IDLE, NULL); + return true; + } + RelationOpenSmgr(rel); + + for (fnum = 0; fnum <= MAX_FORKNUM; fnum++) + { + if (smgrexists(rel->rd_smgr, fnum)) + { + if (!ProcessSingleRelationFork(rel, fnum, strategy)) + { + aborted = true; + break; + } + } + } + relation_close(rel, AccessShareLock); + elog(DEBUG2, + "data checksum processing done for relation with OID %u: %s", + relationId, (aborted ? "aborted" : "finished")); + + if (!aborted) + SetRelHasChecksums(relationId); + + CommitTransactionCommand(); + + pgstat_report_activity(STATE_IDLE, NULL); + + return !aborted; +} + +/* + * SetRelHasChecksums + * + * Sets the pg_class.relhaschecksums flag for the relation specified by relOid + * to true. The corresponding function for clearing state is + * ResetDataChecksumsStateInDatabase which operate on all relations in a + * database. 
+ */ +static void +SetRelHasChecksums(Oid relOid) +{ + Relation rel; + Form_pg_class pg_class_tuple; + HeapTuple tuple; + + rel = table_open(RelationRelationId, RowExclusiveLock); + + tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(relOid)); + if (!HeapTupleIsValid(tuple)) + elog(ERROR, "cache lookup failed for relation %u", relOid); + + pg_class_tuple = (Form_pg_class) GETSTRUCT(tuple); + pg_class_tuple->relhaschecksums = true; + + CatalogTupleUpdate(rel, &tuple->t_self, tuple); + + ReleaseSysCache(tuple); + + table_close(rel, RowExclusiveLock); +} + +/* + * ProcessDatabase + * Enable data checksums in a single database. + * + * We do this by launching a dynamic background worker into this database, and + * waiting for it to finish. We have to do this in a separate worker, since + * each process can only be connected to one database during its lifetime. + */ +static DatachecksumsWorkerResult +ProcessDatabase(DatachecksumsWorkerDatabase * db) +{ + BackgroundWorker bgw; + BackgroundWorkerHandle *bgw_handle; + BgwHandleStatus status; + pid_t pid; + char activity[NAMEDATALEN + 64]; + + DatachecksumsWorkerShmem->success = DATACHECKSUMSWORKER_FAILED; + + memset(&bgw, 0, sizeof(bgw)); + bgw.bgw_flags = BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION; + bgw.bgw_start_time = BgWorkerStart_RecoveryFinished; + snprintf(bgw.bgw_library_name, BGW_MAXLEN, "postgres"); + if (DatachecksumsWorkerShmem->operation == ENABLE_CHECKSUMS) + snprintf(bgw.bgw_function_name, BGW_MAXLEN, "DatachecksumsWorkerMain"); + else if (DatachecksumsWorkerShmem->operation == RESET_STATE) + snprintf(bgw.bgw_function_name, BGW_MAXLEN, "ResetDataChecksumsStateInDatabase"); + else + elog(ERROR, "invalid datachecksumsworker operation requested: %d", + DatachecksumsWorkerShmem->operation); + snprintf(bgw.bgw_name, BGW_MAXLEN, "datachecksumsworker worker"); + snprintf(bgw.bgw_type, BGW_MAXLEN, "datachecksumsworker worker"); + bgw.bgw_restart_time = BGW_NEVER_RESTART; + bgw.bgw_notify_pid = 
MyProcPid; + bgw.bgw_main_arg = ObjectIdGetDatum(db->dboid); + + /* + * If there are no worker slots available, make sure we retry processing + * this database. This will make the datachecksumsworker move on to the + * next database and quite likely fail with the same problem. TODO: Maybe + * we need a backoff to avoid running through all the databases here in + * short order. + */ + if (!RegisterDynamicBackgroundWorker(&bgw, &bgw_handle)) + { + ereport(WARNING, + (errmsg("failed to start worker for enabling data checksums in database \"%s\", retrying", + db->dbname), + errhint("The max_worker_processes setting might be too low."))); + return DATACHECKSUMSWORKER_RETRYDB; + } + + status = WaitForBackgroundWorkerStartup(bgw_handle, &pid); + if (status == BGWH_STOPPED) + { + ereport(WARNING, + (errmsg("could not start background worker for enabling data checksums in database \"%s\"", + db->dbname), + errhint("More details on the error might be found in the server log."))); + return DATACHECKSUMSWORKER_FAILED; + } + + /* + * If the postmaster crashed we cannot end up with a processed database so + * we have no alternative other than exiting. When enabling checksums we + * won't at this time have changed the pg_control version to enabled so + * when the cluster comes back up processing will have to be resumed. When + * disabling, the pg_control version will be set to off before this so + * when the cluster comes up checksums will be off as expected. In the + * latter case we might have stale relhaschecksums flags in pg_class which + * need to be handled in some way. 
TODO + */ + if (status == BGWH_POSTMASTER_DIED) + ereport(FATAL, + (errmsg("cannot enable data checksums without the postmaster process"), + errhint("Restart the database and restart data checksum processing by calling pg_enable_data_checksums()."))); + + Assert(status == BGWH_STARTED); + ereport(DEBUG1, + (errmsg("initiating data checksum processing in database \"%s\"", + db->dbname))); + + snprintf(activity, sizeof(activity) - 1, + "Waiting for worker in database %s (pid %d)", db->dbname, pid); + pgstat_report_activity(STATE_RUNNING, activity); + + status = WaitForBackgroundWorkerShutdown(bgw_handle); + if (status == BGWH_POSTMASTER_DIED) + ereport(FATAL, + (errmsg("postmaster exited during data checksum processing in \"%s\"", + db->dbname), + errhint("Restart the database and restart data checksum processing by calling pg_enable_data_checksums()."))); + + if (DatachecksumsWorkerShmem->success == DATACHECKSUMSWORKER_ABORTED) + ereport(LOG, + (errmsg("data checksums processing was aborted in database \"%s\"", + db->dbname))); + + pgstat_report_activity(STATE_IDLE, NULL); + + return DatachecksumsWorkerShmem->success; +} + +static void +launcher_exit(int code, Datum arg) +{ + LWLockAcquire(DatachecksumsWorkerLock, LW_EXCLUSIVE); + DatachecksumsWorkerShmem->abort = false; + DatachecksumsWorkerShmem->launcher_started = false; + LWLockRelease(DatachecksumsWorkerLock); +} + +static void +launcher_cancel_handler(SIGNAL_ARGS) +{ + LWLockAcquire(DatachecksumsWorkerLock, LW_EXCLUSIVE); + DatachecksumsWorkerShmem->abort = true; + LWLockRelease(DatachecksumsWorkerLock); +} + +/* + * WaitForAllTransactionsToFinish + * Blocks awaiting all current transactions to finish + * + * Returns when all transactions which are active at the call of the function + * have ended, or if the postmaster dies while waiting. If the postmaster dies + * the abort flag will be set to indicate that the caller of this shouldn't + * proceed. 
+ */ +static void +WaitForAllTransactionsToFinish(void) +{ + TransactionId waitforxid; + bool aborted = false; + + LWLockAcquire(XidGenLock, LW_SHARED); + waitforxid = XidFromFullTransactionId(ShmemVariableCache->nextXid); + LWLockRelease(XidGenLock); + + while (!aborted) + { + TransactionId oldestxid = GetOldestActiveTransactionId(); + + if (TransactionIdPrecedes(oldestxid, waitforxid)) + { + char activity[64]; + int rc; + + /* Oldest running xid is older than us, so wait */ + snprintf(activity, + sizeof(activity), + "Waiting for current transactions to finish (waiting for %u)", + waitforxid); + pgstat_report_activity(STATE_RUNNING, activity); + + /* Retry every 5 seconds */ + ResetLatch(MyLatch); + rc = WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH, + 5000, + WAIT_EVENT_CHECKSUM_ENABLE_STARTCONDITION); + + LWLockAcquire(DatachecksumsWorkerLock, LW_EXCLUSIVE); + + /* + * If the postmaster died we wont be able to enable checksums + * cluster-wide so abort and hope to continue when restarted. + */ + if (rc & WL_POSTMASTER_DEATH) + DatachecksumsWorkerShmem->abort = true; + aborted = DatachecksumsWorkerShmem->abort; + + LWLockRelease(DatachecksumsWorkerLock); + } + else + { + pgstat_report_activity(STATE_IDLE, NULL); + return; + } + } +} + +/* + * DatachecksumsWorkerLauncherMain + * + * Main function for launching dynamic background workers for processing data + * checksums in databases. This function has the bgworker management, with + * ProcessAllDatabases being responsible for looping over the databases and + * initiating processing. 
+ */ +void +DatachecksumsWorkerLauncherMain(Datum arg) +{ + bool connected = false; + + on_shmem_exit(launcher_exit, 0); + + ereport(DEBUG1, + (errmsg("background worker \"datachecksumsworker\" launcher started"))); + + pqsignal(SIGTERM, die); + pqsignal(SIGINT, launcher_cancel_handler); + + BackgroundWorkerUnblockSignals(); + + MyBackendType = B_DATACHECKSUMSWORKER_LAUNCHER; + init_ps_display(NULL); + + if (DatachecksumsWorkerShmem->operation == RESET_STATE) + { + if (!ProcessAllDatabases(connected)) + { + /* + * Before we error out make sure we clear state since this may + * otherwise render the worker stuck without possibility of a + * restart. + */ + LWLockAcquire(DatachecksumsWorkerLock, LW_EXCLUSIVE); + DatachecksumsWorkerShmem->launcher_started = false; + DatachecksumsWorkerShmem->abort = false; + LWLockRelease(DatachecksumsWorkerLock); + ereport(ERROR, + (errmsg("unable to finish processing"))); + } + + connected = true; + SetDataChecksumsOnInProgress(); + + LWLockAcquire(DatachecksumsWorkerLock, LW_EXCLUSIVE); + DatachecksumsWorkerShmem->operation = ENABLE_CHECKSUMS; + LWLockRelease(DatachecksumsWorkerLock); + } + + /* + * Prepare for datachecksumsworker shutdown, once we signal that checksums + * are enabled we want the worker to be done and exited to avoid races + * with immediate disabling/enabling. 
+ */ + LWLockAcquire(DatachecksumsWorkerLock, LW_EXCLUSIVE); + DatachecksumsWorkerShmem->abort = false; + DatachecksumsWorkerShmem->launcher_started = false; + LWLockRelease(DatachecksumsWorkerLock); + + /* + * If processing succeeds for ENABLE_CHECKSUMS, then everything has been + * processed so set checksums as enabled cluster-wide + */ + if (ProcessAllDatabases(connected)) + { + SetDataChecksumsOn(); + ereport(LOG, + (errmsg("checksums enabled cluster-wide"))); + } +} + +/* + * ProcessAllDatabases + * Compute the list of all databases and process checksums in each + * + * This will repeatedly generate a list of databases to process for either + * enabling checksums or resetting the checksum catalog tracking. Until no + * new databases are found, this will loop around computing a new list and + * comparing it to the already seen ones. + */ +static bool +ProcessAllDatabases(bool already_connected) +{ + List *DatabaseList; + HTAB *ProcessedDatabases = NULL; + ListCell *lc; + HASHCTL hash_ctl; + bool found_failed = false; + + /* Initialize a hash tracking all processed databases */ + memset(&hash_ctl, 0, sizeof(hash_ctl)); + hash_ctl.keysize = sizeof(Oid); + hash_ctl.entrysize = sizeof(DatachecksumsWorkerResultEntry); + ProcessedDatabases = hash_create("Processed databases", + 64, + &hash_ctl, + HASH_ELEM | HASH_BLOBS); + + /* + * Initialize a connection to shared catalogs only. + */ + if (!already_connected) + BackgroundWorkerInitializeConnection(NULL, NULL, 0); + + /* + * Set up so first run processes shared catalogs, but not once in every + * db. + */ + DatachecksumsWorkerShmem->process_shared_catalogs = true; + + while (true) + { + int processed_databases = 0; + + /* + * Get a list of all databases to process. This may include databases + * that were created during our runtime. 
+ * + * Since a database can be created as a copy of any other database + * (which may not have existed in our last run), we have to repeat + * this loop until no new databases show up in the list. Since we wait + * for all pre-existing transactions to finish, this way we can be + * certain that there are no databases left without checksums. + */ + DatabaseList = BuildDatabaseList(); + + foreach(lc, DatabaseList) + { + DatachecksumsWorkerDatabase *db = (DatachecksumsWorkerDatabase *) lfirst(lc); + DatachecksumsWorkerResult result; + DatachecksumsWorkerResultEntry *entry; + bool found; + + elog(DEBUG1, + "starting processing of database %s with oid %u", + db->dbname, db->dboid); + + entry = (DatachecksumsWorkerResultEntry *) hash_search(ProcessedDatabases, &db->dboid, + HASH_FIND, NULL); + + if (entry) + { + if (entry->result == DATACHECKSUMSWORKER_RETRYDB) + { + /* + * Limit the number of retries to avoid infinite looping + * in case there simply won't be enough workers in the + * cluster to finish this operation. + */ + if (entry->retries > DATACHECKSUMSWORKER_MAX_DB_RETRIES) + entry->result = DATACHECKSUMSWORKER_FAILED; + } + + /* Skip if this database has been processed already */ + if (entry->result != DATACHECKSUMSWORKER_RETRYDB) + { + pfree(db->dbname); + pfree(db); + continue; + } + } + + result = ProcessDatabase(db); + processed_databases++; + + if (result == DATACHECKSUMSWORKER_SUCCESSFUL) + { + /* + * If one database has completed shared catalogs, we don't + * have to process them again. 
+ */ + if (DatachecksumsWorkerShmem->process_shared_catalogs) + DatachecksumsWorkerShmem->process_shared_catalogs = false; + } + else if (result == DATACHECKSUMSWORKER_ABORTED) + { + /* Abort flag set, so exit the whole process */ + return false; + } + + entry = hash_search(ProcessedDatabases, &db->dboid, HASH_ENTER, &found); + entry->dboid = db->dboid; + entry->result = result; + if (!found) + entry->retries = 0; + else + entry->retries++; + + pfree(db->dbname); + pfree(db); + } + + elog(DEBUG1, + "%i databases processed for data checksum enabling, %s", + processed_databases, + (processed_databases ? "process with restart" : "process completed")); + + list_free(DatabaseList); + + /* + * If no databases were processed in this run of the loop, we have now + * finished all databases and no concurrently created ones can exist. + */ + if (processed_databases == 0) + break; + } + + /* + * ProcessedDatabases now has all databases and the results of their + * processing. Failure to enable checksums for a database can be because + * they actually failed for some reason, or because the database was + * dropped between us getting the database list and trying to process it. + * Get a fresh list of databases to detect the second case where the + * database was dropped before we had started processing it. If a database + * still exists, but enabling checksums failed then we fail the entire + * checksumming process and exit with an error. + */ + DatabaseList = BuildDatabaseList(); + + foreach(lc, DatabaseList) + { + DatachecksumsWorkerDatabase *db = (DatachecksumsWorkerDatabase *) lfirst(lc); + DatachecksumsWorkerResult *entry; + bool found; + + entry = hash_search(ProcessedDatabases, (void *) &db->dboid, + HASH_FIND, &found); + + /* + * We are only interested in the databases where the failed database + * still exists. 
+ */ + if (found && *entry == DATACHECKSUMSWORKER_FAILED) + { + ereport(WARNING, + (errmsg("failed to enable data checksums in \"%s\"", + db->dbname))); + found_failed = found; + continue; + } + } + + if (found_failed) + { + /* Disable checksums on cluster, because we failed */ + SetDataChecksumsOff(); + ereport(ERROR, + (errmsg("checksums failed to get enabled in all databases, aborting"), + errhint("The server log might have more information on the error."))); + } + + /* + * Force a checkpoint to get everything out to disk. TODO: we probably + * don't want to use a CHECKPOINT_IMMEDIATE here but it's very convenient + * for testing until the patch is fully baked, as it may otherwise make + * tests take a lot longer. + */ + RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT | CHECKPOINT_IMMEDIATE); + + return true; +} + +/* + * DatachecksumsWorkerShmemSize + * Compute required space for datachecksumsworker-related shared memory + */ +Size +DatachecksumsWorkerShmemSize(void) +{ + Size size; + + size = sizeof(DatachecksumsWorkerShmemStruct); + size = MAXALIGN(size); + + return size; +} + +/* + * DatachecksumsWorkerShmemInit + * Allocate and initialize datachecksumsworker-related shared memory + */ +void +DatachecksumsWorkerShmemInit(void) +{ + bool found; + + DatachecksumsWorkerShmem = (DatachecksumsWorkerShmemStruct *) + ShmemInitStruct("DatachecksumsWorker Data", + DatachecksumsWorkerShmemSize(), + &found); + + if (!found) + { + MemSet(DatachecksumsWorkerShmem, 0, DatachecksumsWorkerShmemSize()); + + /* + * Even if this is a redundant assignment, we want to be explicit + * about our intent for readability, since we want to be able to query + * this state in case of restartability. + */ + DatachecksumsWorkerShmem->launcher_started = false; + } +} + +/* + * BuildDatabaseList + * Compile a list of all currently available databases in the cluster + * + * This creates the list of databases for the datachecksumsworker workers to + * add checksums to. 
+ */ +static List * +BuildDatabaseList(void) +{ + List *DatabaseList = NIL; + Relation rel; + TableScanDesc scan; + HeapTuple tup; + MemoryContext ctx = CurrentMemoryContext; + MemoryContext oldctx; + + StartTransactionCommand(); + + rel = table_open(DatabaseRelationId, AccessShareLock); + + /* + * Before we do this, wait for all pending transactions to finish. This + * will ensure there are no concurrently running CREATE DATABASE, which + * could cause us to miss the creation of a database that was copied + * without checksums. + */ + WaitForAllTransactionsToFinish(); + + scan = table_beginscan_catalog(rel, 0, NULL); + + while (HeapTupleIsValid(tup = heap_getnext(scan, ForwardScanDirection))) + { + Form_pg_database pgdb = (Form_pg_database) GETSTRUCT(tup); + DatachecksumsWorkerDatabase *db; + + oldctx = MemoryContextSwitchTo(ctx); + + db = (DatachecksumsWorkerDatabase *) palloc(sizeof(DatachecksumsWorkerDatabase)); + + db->dboid = pgdb->oid; + db->dbname = pstrdup(NameStr(pgdb->datname)); + + DatabaseList = lappend(DatabaseList, db); + + MemoryContextSwitchTo(oldctx); + } + + table_endscan(scan); + table_close(rel, AccessShareLock); + + CommitTransactionCommand(); + + return DatabaseList; +} + +/* + * BuildRelationList + * Compile a list of all relations in the database + * + * If shared is true, both shared relations and local ones are returned, else + * all non-shared relations are returned. Temp tables are not included. 
+ */ +static List * +BuildRelationList(bool include_shared) +{ + List *RelationList = NIL; + Relation rel; + TableScanDesc scan; + HeapTuple tup; + MemoryContext ctx = CurrentMemoryContext; + MemoryContext oldctx; + + StartTransactionCommand(); + + rel = table_open(RelationRelationId, AccessShareLock); + scan = table_beginscan_catalog(rel, 0, NULL); + + while (HeapTupleIsValid(tup = heap_getnext(scan, ForwardScanDirection))) + { + Form_pg_class pgc = (Form_pg_class) GETSTRUCT(tup); + DatachecksumsWorkerRelation *relentry; + + if (!RELKIND_HAS_STORAGE(pgc->relkind) || + pgc->relpersistence == RELPERSISTENCE_TEMP) + continue; + + if (pgc->relhaschecksums) + continue; + + if (pgc->relisshared && !include_shared) + continue; + + oldctx = MemoryContextSwitchTo(ctx); + relentry = (DatachecksumsWorkerRelation *) palloc(sizeof(DatachecksumsWorkerRelation)); + + relentry->reloid = pgc->oid; + relentry->relkind = pgc->relkind; + + RelationList = lappend(RelationList, relentry); + + MemoryContextSwitchTo(oldctx); + } + + table_endscan(scan); + table_close(rel, AccessShareLock); + + CommitTransactionCommand(); + + return RelationList; +} + +/* + * BuildTempTableList + * Compile a list of all temporary tables in the current database + * + * Contrary to BuildRelationList this function only returns a list of oids, + * since the relkind is already known. 
+ */ +static List * +BuildTempTableList(void) +{ + List *RelationList = NIL; + Relation rel; + TableScanDesc scan; + HeapTuple tup; + MemoryContext ctx = CurrentMemoryContext; + MemoryContext oldctx; + + StartTransactionCommand(); + + rel = table_open(RelationRelationId, AccessShareLock); + scan = table_beginscan_catalog(rel, 0, NULL); + + while (HeapTupleIsValid(tup = heap_getnext(scan, ForwardScanDirection))) + { + Form_pg_class pgc = (Form_pg_class) GETSTRUCT(tup); + + if (pgc->relpersistence != RELPERSISTENCE_TEMP) + continue; + + oldctx = MemoryContextSwitchTo(ctx); + RelationList = lappend_oid(RelationList, pgc->oid); + MemoryContextSwitchTo(oldctx); + } + + table_endscan(scan); + table_close(rel, AccessShareLock); + + CommitTransactionCommand(); + + return RelationList; +} + +/* + * ResetDataChecksumsStateInDatabase + * Main worker function for clearing checksums state in the catalog + * + * Resets the pg_class.relhaschecksums flag to false for all entries in the + * current database. This is required to be performed before adding checksums + * to a running cluster in order to track the state of the processing. 
+ */ +void +ResetDataChecksumsStateInDatabase(Datum arg) +{ + Relation rel; + HeapTuple tuple; + Oid dboid = DatumGetObjectId(arg); + TableScanDesc scan; + Form_pg_class pgc; + + pqsignal(SIGTERM, die); + + BackgroundWorkerUnblockSignals(); + + MyBackendType = B_DATACHECKSUMSWORKER_WORKER; + init_ps_display(NULL); + + ereport(DEBUG1, + (errmsg("resetting catalog state for data checksums in database with OID %u", + dboid))); + + BackgroundWorkerInitializeConnectionByOid(dboid, InvalidOid, BGWORKER_BYPASS_ALLOWCONN); + + StartTransactionCommand(); + + rel = table_open(RelationRelationId, RowExclusiveLock); + scan = table_beginscan_catalog(rel, 0, NULL); + + while (HeapTupleIsValid(tuple = heap_getnext(scan, ForwardScanDirection))) + { + tuple = heap_copytuple(tuple); + pgc = (Form_pg_class) GETSTRUCT(tuple); + + if (pgc->relhaschecksums) + { + pgc->relhaschecksums = false; + CatalogTupleUpdate(rel, &tuple->t_self, tuple); + } + + heap_freetuple(tuple); + } + + table_endscan(scan); + table_close(rel, RowExclusiveLock); + + CommitTransactionCommand(); + + DatachecksumsWorkerShmem->success = DATACHECKSUMSWORKER_SUCCESSFUL; +} + +/* + * DatachecksumsWorkerMain + * + * Main function for enabling checksums in a single database. This is the + * function set as the bgw_function_name in the dynamic background worker + * process initiated for each database by the worker launcher. After enabling + * data checksums in each applicable relation in the database, it will wait for + * all temporary relations that were present when the function started to + * disappear before returning. This is required since we cannot rewrite + * existing temporary relations with data checksums. 
+ */ +void +DatachecksumsWorkerMain(Datum arg) +{ + Oid dboid = DatumGetObjectId(arg); + List *RelationList = NIL; + List *InitialTempTableList = NIL; + ListCell *lc; + BufferAccessStrategy strategy; + bool aborted = false; + + pqsignal(SIGTERM, die); + + BackgroundWorkerUnblockSignals(); + + MyBackendType = B_DATACHECKSUMSWORKER_WORKER; + init_ps_display(NULL); + + ereport(DEBUG1, + (errmsg("starting data checksum processing in database with OID %u", + dboid))); + + BackgroundWorkerInitializeConnectionByOid(dboid, InvalidOid, + BGWORKER_BYPASS_ALLOWCONN); + + /* + * Get a list of all temp tables present as we start in this database. We + * need to wait until they are all gone until we are done, since we cannot + * access these relations and modify them. + */ + InitialTempTableList = BuildTempTableList(); + + /* + * Enable vacuum cost delay, if any. + */ + VacuumCostDelay = DatachecksumsWorkerShmem->cost_delay; + VacuumCostLimit = DatachecksumsWorkerShmem->cost_limit; + VacuumCostActive = (VacuumCostDelay > 0); + VacuumCostBalance = 0; + VacuumPageHit = 0; + VacuumPageMiss = 0; + VacuumPageDirty = 0; + + /* + * Create and set the vacuum strategy as our buffer strategy. + */ + strategy = GetAccessStrategy(BAS_VACUUM); + + RelationList = BuildRelationList(DatachecksumsWorkerShmem->process_shared_catalogs); + foreach(lc, RelationList) + { + DatachecksumsWorkerRelation *rel = (DatachecksumsWorkerRelation *) lfirst(lc); + + if (!ProcessSingleRelationByOid(rel->reloid, strategy)) + { + aborted = true; + break; + } + } + list_free_deep(RelationList); + + if (aborted) + { + DatachecksumsWorkerShmem->success = DATACHECKSUMSWORKER_ABORTED; + ereport(DEBUG1, + (errmsg("data checksum processing aborted in database OID %u", + dboid))); + return; + } + + /* + * Wait for all temp tables that existed when we started to go away. This + * is necessary since we cannot "reach" them to enable checksums. 
Any temp + * tables created after we started will already have checksums in them + * (due to the "inprogress" state), so no need to wait for those. + */ + while (!aborted) + { + List *CurrentTempTables; + ListCell *lc; + int numleft; + char activity[64]; + int rc; + + CurrentTempTables = BuildTempTableList(); + numleft = 0; + foreach(lc, InitialTempTableList) + { + if (list_member_oid(CurrentTempTables, lfirst_oid(lc))) + numleft++; + } + list_free(CurrentTempTables); + + if (numleft == 0) + break; + + /* At least one temp table is left to wait for */ + snprintf(activity, + sizeof(activity), + "Waiting for %d temp tables to be removed", numleft); + pgstat_report_activity(STATE_RUNNING, activity); + + /* Retry every 5 seconds */ + ResetLatch(MyLatch); + rc = WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH, + 5000, + WAIT_EVENT_CHECKSUM_ENABLE_FINISHCONDITION); + + LWLockAcquire(DatachecksumsWorkerLock, LW_EXCLUSIVE); + + /* + * If the postmaster died we wont be able to enable checksums + * cluster-wide so abort and hope to continue when restarted. 
+ */ + if (rc & WL_POSTMASTER_DEATH) + DatachecksumsWorkerShmem->abort = true; + aborted = DatachecksumsWorkerShmem->abort; + + LWLockRelease(DatachecksumsWorkerLock); + } + + list_free(InitialTempTableList); + + DatachecksumsWorkerShmem->success = DATACHECKSUMSWORKER_SUCCESSFUL; + ereport(DEBUG1, + (errmsg("data checksum processing completed in database with OID %u", + dboid))); +} diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index 8116b23614..cd76de125a 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -3770,6 +3770,12 @@ pgstat_get_wait_ipc(WaitEventIPC w) case WAIT_EVENT_CHECKPOINT_START: event_name = "CheckpointStart"; break; + case WAIT_EVENT_CHECKSUM_ENABLE_STARTCONDITION: + event_name = "ChecksumEnableStartCondition"; + break; + case WAIT_EVENT_CHECKSUM_ENABLE_FINISHCONDITION: + event_name = "ChecksumEnableFinishCondition"; + break; case WAIT_EVENT_EXECUTE_GATHER: event_name = "ExecuteGather"; break; diff --git a/src/backend/replication/basebackup.c b/src/backend/replication/basebackup.c index 6064384e32..28e77eedea 100644 --- a/src/backend/replication/basebackup.c +++ b/src/backend/replication/basebackup.c @@ -1595,7 +1595,7 @@ sendFile(const char *readfilename, const char *tarfilename, _tarWriteHeader(tarfilename, NULL, statbuf, false); - if (!noverify_checksums && DataChecksumsEnabled()) + if (!noverify_checksums && DataChecksumsNeedVerify()) { char *filename; diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c index f21f61d5e1..f4dffad925 100644 --- a/src/backend/replication/logical/decode.c +++ b/src/backend/replication/logical/decode.c @@ -212,6 +212,7 @@ DecodeXLogOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) case XLOG_FPW_CHANGE: case XLOG_FPI_FOR_HINT: case XLOG_FPI: + case XLOG_CHECKSUMS: break; default: elog(ERROR, "unexpected RM_XLOG_ID record type: %u", info); diff --git a/src/backend/storage/ipc/ipci.c 
b/src/backend/storage/ipc/ipci.c index 96c2aaabbd..b1713cf751 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -28,6 +28,7 @@ #include "postmaster/autovacuum.h" #include "postmaster/bgworker_internals.h" #include "postmaster/bgwriter.h" +#include "postmaster/datachecksumsworker.h" #include "postmaster/postmaster.h" #include "replication/logicallauncher.h" #include "replication/origin.h" @@ -259,6 +260,7 @@ CreateSharedMemoryAndSemaphores(void) WalSndShmemInit(); WalRcvShmemInit(); ApplyLauncherShmemInit(); + DatachecksumsWorkerShmemInit(); /* * Set up other modules that need some shared memory space diff --git a/src/backend/storage/ipc/procsignal.c b/src/backend/storage/ipc/procsignal.c index 4fa385b0ec..48f2352f03 100644 --- a/src/backend/storage/ipc/procsignal.c +++ b/src/backend/storage/ipc/procsignal.c @@ -18,6 +18,7 @@ #include #include "access/parallel.h" +#include "access/xlog.h" #include "commands/async.h" #include "miscadmin.h" #include "pgstat.h" @@ -92,7 +93,10 @@ static volatile ProcSignalSlot *MyProcSignalSlot = NULL; static bool CheckProcSignal(ProcSignalReason reason); static void CleanupProcSignalState(int status, Datum arg); -static void ProcessBarrierPlaceholder(void); + +static void ProcessBarrierChecksumOnInProgress(void); +static void ProcessBarrierChecksumOn(void); +static void ProcessBarrierChecksumOff(void); /* * ProcSignalShmemSize @@ -495,8 +499,12 @@ ProcessProcSignalBarrier(void) * unconditionally, but it's more efficient to call only the ones that * might need us to do something based on the flags. 
*/ - if (BARRIER_SHOULD_CHECK(flags, PROCSIGNAL_BARRIER_PLACEHOLDER)) - ProcessBarrierPlaceholder(); + if (BARRIER_SHOULD_CHECK(flags, PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_ON)) + ProcessBarrierChecksumOnInProgress(); + else if (BARRIER_SHOULD_CHECK(flags, PROCSIGNAL_BARRIER_CHECKSUM_ON)) + ProcessBarrierChecksumOn(); + else if (BARRIER_SHOULD_CHECK(flags, PROCSIGNAL_BARRIER_CHECKSUM_OFF)) + ProcessBarrierChecksumOff(); /* * State changes related to all types of barriers that might have been @@ -509,16 +517,21 @@ ProcessProcSignalBarrier(void) } static void -ProcessBarrierPlaceholder(void) +ProcessBarrierChecksumOn(void) { - /* - * XXX. This is just a placeholder until the first real user of this - * machinery gets committed. Rename PROCSIGNAL_BARRIER_PLACEHOLDER to - * PROCSIGNAL_BARRIER_SOMETHING_ELSE where SOMETHING_ELSE is something - * appropriately descriptive. Get rid of this function and instead have - * ProcessBarrierSomethingElse. Most likely, that function should live in - * the file pertaining to that subsystem, rather than here. - */ + AbsorbChecksumsOnBarrier(); +} + +static void +ProcessBarrierChecksumOff(void) +{ + AbsorbChecksumsOffBarrier(); +} + +static void +ProcessBarrierChecksumOnInProgress(void) +{ + AbsorbChecksumsOnInProgressBarrier(); } /* diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt index 774292fd94..23eaf9e576 100644 --- a/src/backend/storage/lmgr/lwlocknames.txt +++ b/src/backend/storage/lmgr/lwlocknames.txt @@ -53,3 +53,4 @@ XactTruncationLock 44 # 45 was XactTruncationLock until removal of BackendRandomLock WrapLimitsVacuumLock 46 NotifyQueueTailLock 47 +DatachecksumsWorkerLock 48 diff --git a/src/backend/storage/page/README b/src/backend/storage/page/README index e30d7ac59a..78edf57adc 100644 --- a/src/backend/storage/page/README +++ b/src/backend/storage/page/README @@ -10,7 +10,9 @@ http://www.cs.toronto.edu/~bianca/papers/sigmetrics09.pdf, discussed 2010/12/22 on -hackers list. 
Current implementation requires this be enabled system-wide at initdb time, or -by using the pg_checksums tool on an offline cluster. +by using the pg_checksums tool on an offline cluster. Checksums can also be +turned on and off using pg_enable_data_checksums()/pg_disable_data_checksums() +at runtime. The checksum is not valid at all times on a data page!! The checksum is valid when the page leaves the shared pool and is checked diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c index d708117a40..4c6deaae8b 100644 --- a/src/backend/storage/page/bufpage.c +++ b/src/backend/storage/page/bufpage.c @@ -94,7 +94,7 @@ PageIsVerified(Page page, BlockNumber blkno) */ if (!PageIsNew(page)) { - if (DataChecksumsEnabled()) + if (DataChecksumsNeedVerify()) { checksum = pg_checksum_page((char *) page, blkno); @@ -1167,7 +1167,7 @@ PageSetChecksumCopy(Page page, BlockNumber blkno) static char *pageCopy = NULL; /* If we don't need a checksum, just return the passed-in data */ - if (PageIsNew(page) || !DataChecksumsEnabled()) + if (PageIsNew(page) || !DataChecksumsNeedWrite()) return (char *) page; /* @@ -1194,7 +1194,7 @@ void PageSetChecksumInplace(Page page, BlockNumber blkno) { /* If we don't need a checksum, just return */ - if (PageIsNew(page) || !DataChecksumsEnabled()) + if (PageIsNew(page) || !DataChecksumsNeedWrite()) return; ((PageHeader) page)->pd_checksum = pg_checksum_page((char *) page, blkno); diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c index 95738a4e34..4f31c1dce5 100644 --- a/src/backend/utils/adt/pgstatfuncs.c +++ b/src/backend/utils/adt/pgstatfuncs.c @@ -1565,7 +1565,7 @@ pg_stat_get_db_checksum_failures(PG_FUNCTION_ARGS) int64 result; PgStat_StatDBEntry *dbentry; - if (!DataChecksumsEnabled()) + if (!DataChecksumsNeedWrite()) PG_RETURN_NULL(); if ((dbentry = pgstat_fetch_stat_dbentry(dbid)) == NULL) @@ -1583,7 +1583,7 @@ pg_stat_get_db_checksum_last_failure(PG_FUNCTION_ARGS) 
TimestampTz result; PgStat_StatDBEntry *dbentry; - if (!DataChecksumsEnabled()) + if (!DataChecksumsNeedWrite()) PG_RETURN_NULL(); if ((dbentry = pgstat_fetch_stat_dbentry(dbid)) == NULL) diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index 96ecad02dd..379c78d82d 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -1876,6 +1876,8 @@ formrdesc(const char *relationName, Oid relationReltype, relation->rd_rel->relnatts = (int16) natts; relation->rd_rel->relam = HEAP_TABLE_AM_OID; + relation->rd_rel->relhaschecksums = DataChecksumsNeedWrite(); + /* * initialize attribute tuple form * @@ -3484,6 +3486,8 @@ RelationBuildLocalRelation(const char *relname, else rel->rd_rel->relispopulated = true; + rel->rd_rel->relhaschecksums = DataChecksumsNeedWrite(); + /* set replica identity -- system catalogs and non-tables don't have one */ if (!IsCatalogNamespace(relnamespace) && (relkind == RELKIND_RELATION || diff --git a/src/backend/utils/init/miscinit.c b/src/backend/utils/init/miscinit.c index cf8f9579c3..04bf0836b7 100644 --- a/src/backend/utils/init/miscinit.c +++ b/src/backend/utils/init/miscinit.c @@ -249,6 +249,12 @@ GetBackendTypeDesc(BackendType backendType) case B_LOGGER: backendDesc = "logger"; break; + case B_DATACHECKSUMSWORKER_LAUNCHER: + backendDesc = "datachecksumsworker launcher"; + break; + case B_DATACHECKSUMSWORKER_WORKER: + backendDesc = "datachecksumsworker worker"; + break; } return backendDesc; diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index d4ab4c7e23..e5674f4e4f 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -617,6 +617,11 @@ InitPostgres(const char *in_dbname, Oid dboid, const char *username, if (MyBackendId > MaxBackends || MyBackendId <= 0) elog(FATAL, "bad backend ID: %d", MyBackendId); + /* + * Set up local cache of Controldata values. 
+ */ + InitLocalControldata(); + /* Now that we have a BackendId, we can participate in ProcSignal */ ProcSignalInit(MyBackendId); diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index de87ad6ef7..68f6ab11b6 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -36,6 +36,7 @@ #include "access/transam.h" #include "access/twophase.h" #include "access/xact.h" +#include "access/xlog.h" #include "access/xlog_internal.h" #include "catalog/namespace.h" #include "catalog/pg_authid.h" @@ -76,6 +77,7 @@ #include "replication/walreceiver.h" #include "replication/walsender.h" #include "storage/bufmgr.h" +#include "storage/checksum.h" #include "storage/dsm_impl.h" #include "storage/fd.h" #include "storage/large_object.h" @@ -498,6 +500,16 @@ static struct config_enum_entry shared_memory_options[] = { {NULL, 0, false} }; +/* + * Options for data_checksums enum. + */ +static const struct config_enum_entry data_checksum_options[] = { + {"on", DATA_CHECKSUMS_ON, true}, + {"off", DATA_CHECKSUMS_OFF, true}, + {"inprogress", DATA_CHECKSUMS_INPROGRESS_ON, true}, + {NULL, 0, false} +}; + /* * Options for enum values stored in other modules */ @@ -607,7 +619,7 @@ static int max_identifier_length; static int block_size; static int segment_size; static int wal_block_size; -static bool data_checksums; +static int data_checksums_tmp; static bool integer_datetimes; static bool assert_enabled; static char *recovery_target_timeline_string; @@ -1898,17 +1910,6 @@ static struct config_bool ConfigureNamesBool[] = NULL, NULL, NULL }, - { - {"data_checksums", PGC_INTERNAL, PRESET_OPTIONS, - gettext_noop("Shows whether data checksums are turned on for this cluster."), - NULL, - GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE - }, - &data_checksums, - false, - NULL, NULL, NULL - }, - { {"syslog_sequence_numbers", PGC_SIGHUP, LOGGING_WHERE, gettext_noop("Add sequence number to syslog messages to avoid duplicate suppression."), @@ -4784,6 +4785,17 @@ static 
struct config_enum ConfigureNamesEnum[] = NULL, NULL, NULL }, + { + {"data_checksums", PGC_INTERNAL, PRESET_OPTIONS, + gettext_noop("Shows whether data checksums are turned on for this cluster."), + NULL, + GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE + }, + &data_checksums_tmp, + DATA_CHECKSUMS_OFF, data_checksum_options, + NULL, NULL, show_data_checksums + }, + /* End-of-list marker */ { {NULL, 0, 0, NULL, NULL}, NULL, 0, NULL, NULL, NULL, NULL diff --git a/src/bin/pg_checksums/pg_checksums.c b/src/bin/pg_checksums/pg_checksums.c index ffdc23945c..6a5a596f46 100644 --- a/src/bin/pg_checksums/pg_checksums.c +++ b/src/bin/pg_checksums/pg_checksums.c @@ -600,7 +600,7 @@ main(int argc, char *argv[]) exit(1); } - if (ControlFile->data_checksum_version > 0 && + if (ControlFile->data_checksum_version == DATA_CHECKSUMS_ON && mode == PG_MODE_ENABLE) { pg_log_error("data checksums are already enabled in cluster"); diff --git a/src/bin/pg_upgrade/controldata.c b/src/bin/pg_upgrade/controldata.c index 00d71e3a8a..586bc70a70 100644 --- a/src/bin/pg_upgrade/controldata.c +++ b/src/bin/pg_upgrade/controldata.c @@ -657,6 +657,15 @@ check_control_data(ControlData *oldctrl, * check_for_isn_and_int8_passing_mismatch(). */ + /* + * If checksums have been turned on in the old cluster, but the + * datachecksumsworker has yet to finish, then disallow upgrading. The + * user should either let the process finish, or turn off checksums, + * before retrying. + */ + if (oldctrl->data_checksum_version == 2) + pg_fatal("checksum enabling in old cluster is in progress\n"); + /* + * We might eventually allow upgrades from checksum to no-checksum + * clusters.
diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h index 8b90cefbe0..a806cc6d0e 100644 --- a/src/bin/pg_upgrade/pg_upgrade.h +++ b/src/bin/pg_upgrade/pg_upgrade.h @@ -218,7 +218,7 @@ typedef struct uint32 large_object; bool date_is_int; bool float8_pass_by_value; - bool data_checksum_version; + uint32 data_checksum_version; } ControlData; /* diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 221af87e71..c169ef90ee 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -199,7 +199,7 @@ extern PGDLLIMPORT int wal_level; * of the bits make it to disk, but the checksum wouldn't match. Also WAL-log * them if forced by wal_log_hints=on. */ -#define XLogHintBitIsNeeded() (DataChecksumsEnabled() || wal_log_hints) +#define XLogHintBitIsNeeded() (DataChecksumsNeedWrite() || wal_log_hints) /* Do we need to WAL-log information required only for Hot Standby and logical replication? */ #define XLogStandbyInfoActive() (wal_level >= WAL_LEVEL_REPLICA) @@ -318,7 +318,18 @@ extern TimestampTz GetCurrentChunkReplayStartTime(void); extern void UpdateControlFile(void); extern uint64 GetSystemIdentifier(void); extern char *GetMockAuthenticationNonce(void); -extern bool DataChecksumsEnabled(void); +extern bool DataChecksumsNeedWrite(void); +extern bool DataChecksumsNeedVerify(void); +extern bool DataChecksumsOnInProgress(void); +extern void SetDataChecksumsOnInProgress(void); +extern void SetDataChecksumsOn(void); +extern void SetDataChecksumsOff(void); +extern void AbsorbChecksumsOnInProgressBarrier(void); +extern void AbsorbChecksumsOffInProgressBarrier(void); +extern void AbsorbChecksumsOnBarrier(void); +extern void AbsorbChecksumsOffBarrier(void); +extern const char *show_data_checksums(void); +extern void InitLocalControldata(void); extern XLogRecPtr GetFakeLSNForUnloggedRel(void); extern Size XLOGShmemSize(void); extern void XLOGShmemInit(void); diff --git a/src/include/access/xlog_internal.h 
b/src/include/access/xlog_internal.h index 4146753d47..80a959bd7f 100644 --- a/src/include/access/xlog_internal.h +++ b/src/include/access/xlog_internal.h @@ -25,6 +25,7 @@ #include "lib/stringinfo.h" #include "pgtime.h" #include "storage/block.h" +#include "storage/checksum.h" #include "storage/relfilenode.h" @@ -249,6 +250,12 @@ typedef struct xl_restore_point char rp_name[MAXFNAMELEN]; } xl_restore_point; +/* Information logged when checksum level is changed */ +typedef struct xl_checksum_state +{ + ChecksumType new_checksumtype; +} xl_checksum_state; + /* End of recovery mark, when we don't do an END_OF_RECOVERY checkpoint */ typedef struct xl_end_of_recovery { diff --git a/src/include/catalog/pg_class.h b/src/include/catalog/pg_class.h index 679eec3443..6ecec47f54 100644 --- a/src/include/catalog/pg_class.h +++ b/src/include/catalog/pg_class.h @@ -119,6 +119,9 @@ CATALOG(pg_class,1259,RelationRelationId) BKI_BOOTSTRAP BKI_ROWTYPE_OID(83,Relat /* is relation a partition? */ bool relispartition BKI_DEFAULT(f); + /* does the relation have checksums enabled */ + bool relhaschecksums BKI_DEFAULT(f); + /* heap for rewrite during DDL, link to original rel */ Oid relrewrite BKI_DEFAULT(0); diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h index 06bed90c5e..6bc802d8ba 100644 --- a/src/include/catalog/pg_control.h +++ b/src/include/catalog/pg_control.h @@ -76,6 +76,7 @@ typedef struct CheckPoint #define XLOG_END_OF_RECOVERY 0x90 #define XLOG_FPI_FOR_HINT 0xA0 #define XLOG_FPI 0xB0 +#define XLOG_CHECKSUMS 0xC0 /* diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 1dd325e0e6..4e525b2c8f 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -10890,6 +10890,22 @@ proargnames => 
'{max_data_alignment,database_block_size,blocks_per_segment,wal_block_size,bytes_per_wal_segment,max_identifier_length,max_index_columns,max_toast_chunk_size,large_object_chunk_size,float8_pass_by_value,data_page_checksum_version}', prosrc => 'pg_control_init' }, +{ oid => '4142', + descr => 'disable data checksums', + proname => 'pg_disable_data_checksums', provolatile => 'v', prorettype => 'bool', + proparallel => 'r', + proargtypes => '', + prosrc => 'disable_data_checksums' }, + +{ oid => '4035', + descr => 'enable data checksums', + proname => 'pg_enable_data_checksums', provolatile => 'v', prorettype => 'bool', + proparallel => 'r', + proargtypes => 'int4 int4', proallargtypes => '{int4,int4}', + proargmodes => '{i,i}', + proargnames => '{cost_delay,cost_limit}', + prosrc => 'enable_data_checksums' }, + # collation management functions { oid => '3445', descr => 'import collations from operating system', proname => 'pg_import_system_collations', procost => '100', diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index 72e3352398..c4893551a3 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -323,6 +323,8 @@ typedef enum BackendType B_ARCHIVER, B_STATS_COLLECTOR, B_LOGGER, + B_DATACHECKSUMSWORKER_LAUNCHER, + B_DATACHECKSUMSWORKER_WORKER, } BackendType; extern BackendType MyBackendType; diff --git a/src/include/pgstat.h b/src/include/pgstat.h index 807a9c1edf..98d883f4f9 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -852,6 +852,8 @@ typedef enum WAIT_EVENT_BTREE_PAGE, WAIT_EVENT_CHECKPOINT_DONE, WAIT_EVENT_CHECKPOINT_START, + WAIT_EVENT_CHECKSUM_ENABLE_STARTCONDITION, + WAIT_EVENT_CHECKSUM_ENABLE_FINISHCONDITION, WAIT_EVENT_EXECUTE_GATHER, WAIT_EVENT_HASH_BATCH_ALLOCATE, WAIT_EVENT_HASH_BATCH_ELECT, diff --git a/src/include/postmaster/datachecksumsworker.h b/src/include/postmaster/datachecksumsworker.h new file mode 100644 index 0000000000..3ed9f193f1 --- /dev/null +++ 
b/src/include/postmaster/datachecksumsworker.h @@ -0,0 +1,42 @@ +/*------------------------------------------------------------------------- + * + * datachecksumsworker.h + * header file for the data checksums background worker + * + * + * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/postmaster/datachecksumsworker.h + * + *------------------------------------------------------------------------- + */ +#ifndef DATACHECKSUMSWORKER_H +#define DATACHECKSUMSWORKER_H + +typedef enum DataChecksumOperation +{ + ENABLE_CHECKSUMS = 0, + RESET_STATE +} DataChecksumOperation; + +/* Shared memory */ +extern Size DatachecksumsWorkerShmemSize(void); +extern void DatachecksumsWorkerShmemInit(void); + +/* Status functions */ +bool DataChecksumsWorkerStarted(void); + +/* Start the background processes for enabling checksums */ +void StartDatachecksumsWorkerLauncher(DataChecksumOperation op, + int cost_delay, int cost_limit); + +/* Shutdown the background processes, if any */ +void ShutdownDatachecksumsWorkerIfRunning(void); + +/* Background worker entrypoints */ +void DatachecksumsWorkerLauncherMain(Datum arg); +void DatachecksumsWorkerMain(Datum arg); +void ResetDataChecksumsStateInDatabase(Datum arg); + +#endif /* DATACHECKSUMSWORKER_H */ diff --git a/src/include/storage/bufpage.h b/src/include/storage/bufpage.h index 51b8f994ac..828ba1a437 100644 --- a/src/include/storage/bufpage.h +++ b/src/include/storage/bufpage.h @@ -198,6 +198,8 @@ typedef PageHeaderData *PageHeader; */ #define PG_PAGE_LAYOUT_VERSION 4 #define PG_DATA_CHECKSUM_VERSION 1 +#define PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION 2 + /* ---------------------------------------------------------------- * page support macros diff --git a/src/include/storage/checksum.h b/src/include/storage/checksum.h index 6e77744cbc..f6ae955f58 100644 --- a/src/include/storage/checksum.h +++ b/src/include/storage/checksum.h @@
-15,6 +15,14 @@ #include "storage/block.h" +typedef enum ChecksumType +{ + DATA_CHECKSUMS_OFF = 0, + DATA_CHECKSUMS_ON, + DATA_CHECKSUMS_INPROGRESS_ON, + DATA_CHECKSUMS_INPROGRESS_OFF +} ChecksumType; + /* * Compute the checksum for a Postgres page. The page must be aligned on a * 4-byte boundary. diff --git a/src/include/storage/procsignal.h b/src/include/storage/procsignal.h index 5cb39697f3..05f85861e3 100644 --- a/src/include/storage/procsignal.h +++ b/src/include/storage/procsignal.h @@ -48,12 +48,9 @@ typedef enum typedef enum { - /* - * XXX. PROCSIGNAL_BARRIER_PLACEHOLDER should be replaced when the first - * real user of the ProcSignalBarrier mechanism is added. It's just here - * for now because we can't have an empty enum. - */ - PROCSIGNAL_BARRIER_PLACEHOLDER = 0 + PROCSIGNAL_BARRIER_CHECKSUM_OFF = 0, + PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_ON, + PROCSIGNAL_BARRIER_CHECKSUM_ON } ProcSignalBarrierType; /* diff --git a/src/test/Makefile b/src/test/Makefile index efb206aa75..6469ac94a4 100644 --- a/src/test/Makefile +++ b/src/test/Makefile @@ -12,7 +12,8 @@ subdir = src/test top_builddir = ../.. 
include $(top_builddir)/src/Makefile.global -SUBDIRS = perl regress isolation modules authentication recovery subscription +SUBDIRS = perl regress isolation modules authentication recovery subscription \ + checksum # Test suites that are not safe by default but can be run if selected # by the user via the whitespace-separated list in variable diff --git a/src/test/checksum/.gitignore b/src/test/checksum/.gitignore new file mode 100644 index 0000000000..871e943d50 --- /dev/null +++ b/src/test/checksum/.gitignore @@ -0,0 +1,2 @@ +# Generated by test suite +/tmp_check/ diff --git a/src/test/checksum/Makefile b/src/test/checksum/Makefile new file mode 100644 index 0000000000..558a8135f1 --- /dev/null +++ b/src/test/checksum/Makefile @@ -0,0 +1,23 @@ +#------------------------------------------------------------------------- +# +# Makefile for src/test/checksum +# +# Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group +# Portions Copyright (c) 1994, Regents of the University of California +# +# src/test/checksum/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/test/checksum +top_builddir = ../../.. +include $(top_builddir)/src/Makefile.global + +check: + $(prove_check) + +installcheck: + $(prove_installcheck) + +clean distclean maintainer-clean: + rm -rf tmp_check diff --git a/src/test/checksum/README b/src/test/checksum/README new file mode 100644 index 0000000000..0f0317060b --- /dev/null +++ b/src/test/checksum/README @@ -0,0 +1,22 @@ +src/test/checksum/README + +Regression tests for data checksums +=================================== + +This directory contains a test suite for enabling data checksums +in a running cluster. + +Running the tests +================= + + make check + +or + + make installcheck + +NOTE: This creates a temporary installation (in the case of "check"), +with multiple nodes, be they master or standby(s) for the purpose of +the tests. 
+ +NOTE: This requires the --enable-tap-tests argument to configure. diff --git a/src/test/checksum/t/001_basic.pl b/src/test/checksum/t/001_basic.pl new file mode 100644 index 0000000000..9dbb660937 --- /dev/null +++ b/src/test/checksum/t/001_basic.pl @@ -0,0 +1,86 @@ +# Test suite for testing enabling data checksums in an online cluster +use strict; +use warnings; +use PostgresNode; +use TestLib; +use Test::More tests => 10; + +# Initialize node with checksums disabled. +my $node = get_new_node('main'); +$node->init(); +$node->start(); + +# Create some content to have un-checksummed data in the cluster +$node->safe_psql('postgres', + "CREATE TABLE t AS SELECT generate_series(1,10000) AS a;"); + +# Ensure that checksums are turned off +my $result = $node->safe_psql('postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';"); +is($result, 'off', 'ensure checksums are disabled'); + +# No relation in pg_class should have relhaschecksums at this point +$result = $node->safe_psql('postgres', + "SELECT count(*) FROM pg_catalog.pg_class WHERE relhaschecksums;"); +is($result, '0', 'ensure no entries in pg_class has checksums recorded'); + +# Enable data checksums +$node->safe_psql('postgres', "SELECT pg_enable_data_checksums();"); + +# Wait for checksums to become enabled +$result = $node->poll_query_until('postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';", + 'on'); +is($result, 1, 'ensure checksums are enabled'); + +# Check that relations with storage have been marked with relhaschecksums in +# pg_class +$result = $node->safe_psql('postgres', + "SELECT count(*) FROM pg_catalog.pg_class WHERE NOT relhaschecksums " . 
+ "AND relkind IN ('r', 'i', 'S', 't', 'm');"); +is($result, '0', 'ensure all relations are correctly flagged in the catalog'); + +# Run a dummy query just to make sure we read back some data +$result = $node->safe_psql('postgres', "SELECT count(*) FROM t"); +is($result, '10000', 'ensure checksummed pages can be read back'); + +# Enable data checksums again which should be a no-op.. +$node->safe_psql('postgres', "SELECT pg_enable_data_checksums();"); +# ..and make sure we still can process data fine +$node->safe_psql('postgres', "UPDATE t SET a = a + 1;"); +$result = $node->safe_psql('postgres', "SELECT count(*) FROM t"); +is($result, '10000', 'ensure checksummed pages can be read back'); + +# Disable checksums again +$node->safe_psql('postgres', "SELECT pg_disable_data_checksums();"); + +# Wait for checksums to be disabled +$result = $node->poll_query_until('postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';", + 'off'); +is($result, 1, 'ensure checksums are disabled'); + +# Test reading again +$result = $node->safe_psql('postgres', "SELECT count(*) FROM t"); +is($result, '10000', 'ensure checksummed pages can be read back'); + +# Disable checksums when already disabled, which is also a no-op so we mainly +# want to run this to make sure the backend isn't crashing or erroring out +$node->safe_psql('postgres', "SELECT pg_disable_data_checksums();"); + +# Re-enable checksums and make sure that the relhaschecksums flags in the +# catalog aren't tricking processing into skipping previously checksummed +# relations +$node->safe_psql('postgres', "UPDATE t SET a = a + 1;"); + +$node->safe_psql('postgres', "SELECT pg_enable_data_checksums();"); +$result = $node->poll_query_until('postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';", + 'on'); +is($result, 1, 'ensure checksums are enabled'); + +# Run a dummy query just to make sure we read back some data +$result = $node->safe_psql('postgres', "SELECT 
count(*) FROM t"); +is($result, '10000', 'ensure checksummed pages can be read back'); + +$node->stop; diff --git a/src/test/checksum/t/002_restarts.pl b/src/test/checksum/t/002_restarts.pl new file mode 100644 index 0000000000..d908b95561 --- /dev/null +++ b/src/test/checksum/t/002_restarts.pl @@ -0,0 +1,97 @@ +# Test suite for testing enabling data checksums in an online cluster with +# restarting the processing +use strict; +use warnings; +use PostgresNode; +use TestLib; +use Test::More; +use IPC::Run qw(pump finish timer); + +# If we don't have IO::Pty, forget it, because IPC::Run depends on that +# to support pty connections +eval { require IO::Pty; }; +if ($@) +{ + plan skip_all => 'IO::Pty is needed to run this test'; +} + +# Initialize node with checksums disabled. +my $node = get_new_node('main'); +$node->init(); +$node->start(); + +# Create some content to have un-checksummed data in the cluster +$node->safe_psql('postgres', + "CREATE TABLE t AS SELECT generate_series(1,10000) AS a;"); + +# Ensure that checksums are disabled +my $result = $node->safe_psql('postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';"); +is($result, 'off', 'ensure checksums are disabled'); + +# Create a barrier for checksumming to block on, in this case a pre-existing +# temporary table which is kept open while processing is started. We can +# accomplish this by setting up an interactive psql process which keeps the +# temporary table created as we enable checksums in another psql process. +my $in = ''; +my $out = ''; +my $timer = timer(5); + +my $h = $node->interactive_psql('postgres', \$in, \$out, $timer); + +$out = ''; +$timer->start(5); + +$in .= "CREATE TEMPORARY TABLE tt (a integer);\n"; +pump $h until ($out =~ /CREATE TABLE/ || $timer->is_expired); + +# In another session, make sure we can see the blocking temp table but start +# processing anyways and check that we are blocked with a proper wait event. 
+$result = $node->safe_psql('postgres', + "SELECT relpersistence FROM pg_catalog.pg_class WHERE relname = 'tt';"); +is($result, 't', 'ensure we can see the temporary table'); + +$node->safe_psql('postgres', "SELECT pg_enable_data_checksums();"); + +$result = $node->poll_query_until('postgres', + "SELECT count(*) FROM pg_catalog.pg_class WHERE NOT relhaschecksums " . + "AND relkind IN ('r', 'i', 'S', 't', 'm');", + '1'); +is($result, 1, 'ensure there is a single table left'); + +$result = $node->safe_psql('postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';"); +is($result, 'inprogress', "ensure checksums aren't enabled yet"); + +$result = $node->safe_psql('postgres', + "SELECT wait_event FROM pg_stat_activity WHERE backend_type = 'datachecksumsworker worker';"); +is($result, 'ChecksumEnableFinishCondition', 'test for correct wait event'); + +$result = $node->safe_psql('postgres', + "SELECT count(*) FROM pg_catalog.pg_class WHERE NOT relhaschecksums " . + "AND relkind IN ('r', 'i', 'S', 't', 'm');"); +is($result, '1', 'doublecheck that there is a single table left before restarting'); + +$node->stop; +$node->start; + +$result = $node->safe_psql('postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';"); +is($result, 'inprogress', "ensure checksums aren't enabled yet"); + +$result = $node->safe_psql('postgres', + "SELECT count(*) FROM pg_catalog.pg_class WHERE NOT relhaschecksums " . 
+ "AND relkind IN ('r', 'i', 'S', 't', 'm');"); +is($result, '0', 'no temporary tables this time around'); + +$node->safe_psql('postgres', "SELECT pg_enable_data_checksums();"); + +$result = $node->poll_query_until('postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';", + 'on'); +is($result, 1, 'ensure checksums are turned on'); + +$result = $node->safe_psql('postgres', "SELECT count(*) FROM t"); +is($result, '10000', 'ensure checksummed pages can be read back'); + +done_testing(); diff --git a/src/test/checksum/t/003_standby_checksum.pl b/src/test/checksum/t/003_standby_checksum.pl new file mode 100644 index 0000000000..a5ebe6cd04 --- /dev/null +++ b/src/test/checksum/t/003_standby_checksum.pl @@ -0,0 +1,96 @@ +# Test suite for testing enabling data checksums in an online cluster with +# streaming replication +use strict; +use warnings; +use PostgresNode; +use TestLib; +use Test::More tests => 10; + +# Initialize primary node +my $node_primary = get_new_node('primary'); +$node_primary->init(allows_streaming => 1); +$node_primary->start; +my $backup_name = 'my_backup'; + +# Take backup +$node_primary->backup($backup_name); + +# Create streaming standby linking to primary +my $node_standby_1 = get_new_node('standby_1'); +$node_standby_1->init_from_backup($node_primary, $backup_name, + has_streaming => 1); +$node_standby_1->start; + +# Create some content on primary to have un-checksummed data in the cluster +$node_primary->safe_psql('postgres', + "CREATE TABLE t AS SELECT generate_series(1,10000) AS a;"); + +# Wait for standbys to catch up +$node_primary->wait_for_catchup($node_standby_1, 'replay', + $node_primary->lsn('insert')); + +# Check that checksums are turned off +my $result = $node_primary->safe_psql('postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';"); +is($result, "off", 'ensure checksums are turned off on primary'); + +$result = $node_standby_1->safe_psql('postgres', + "SELECT 
setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';"); +is($result, "off", 'ensure checksums are turned off on standby_1'); + +# Enable checksums for the cluster +$node_primary->safe_psql('postgres', "SELECT pg_enable_data_checksums();"); + +# Ensure that the primary switches to "inprogress" +$result = $node_primary->poll_query_until('postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';", + "inprogress"); +is($result, 1, 'ensure checksums are in progress on primary'); + +# Wait for checksum enable to be replayed +$node_primary->wait_for_catchup($node_standby_1, 'replay'); + +# Ensure that the standby has switched to "inprogress" or on +# Normally it would be "inprogress", but it is theoretically possible for the primary +# to complete the checksum enabling *and* have the standby replay that record before +# we reach the check below. +$result = $node_standby_1->safe_psql('postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';"); +cmp_ok($result, '~~', ["inprogress", "on"], 'ensure checksums are on or in progress on standby_1'); + +# Insert some more data which should be checksummed on INSERT +$node_primary->safe_psql('postgres', + "INSERT INTO t VALUES (generate_series(1,10000));"); + +# Wait for checksums enabled on the primary +$result = $node_primary->poll_query_until('postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';", + 'on'); +is($result, 1, 'ensure checksums are enabled on the primary'); + +# Wait for checksums enabled on the standby +$result = $node_standby_1->poll_query_until('postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';", + 'on'); +is($result, 1, 'ensure checksums are enabled on the standby'); + +$result = $node_primary->safe_psql('postgres', "SELECT count(a) FROM t"); +is ($result, '20000', 'ensure we can safely read all data with checksums'); + +# Disable checksums and ensure it's propagated 
to standby and that we can +# still read all data +$node_primary->safe_psql('postgres', "SELECT pg_disable_data_checksums();"); +$result = $node_primary->safe_psql('postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';"); +is($result, 'off', 'ensure data checksums are disabled on the primary'); + +# Wait for checksum disable to be replayed +$node_primary->wait_for_catchup($node_standby_1, 'replay'); + +# Ensure that the standby has switched to off +$result = $node_standby_1->safe_psql('postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';"); +is($result, "off", 'ensure checksums are off on standby_1'); + +$result = $node_primary->safe_psql('postgres', "SELECT count(a) FROM t"); +is ($result, "20000", 'ensure we can safely read all data without checksums'); -- 2.21.1 (Apple Git-122.3)