From 8485fc23d54cc1e359a71801845ea255584905d5 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Wed, 14 Jun 2023 11:31:29 -0400 Subject: [PATCH v1 7/8] Prototype patch for incremental and differential backup. We don't differentiate between incremental and differential backups; the term "incremental" as used herein means "either incremental or differential". This adds a new background process, the WAL summarizer, whose behavior is governed by new GUCs wal_summarize_mb and wal_summarize_keep_time. This writes out WAL summary files to $PGDATA/pg_wal/summaries. Each summary file contains information for a certain range of LSNs on a certain TLI. For each relation, it stores a "limit block" which is 0 if a relation is created or destroyed within a certain range of WAL records, or otherwise the shortest length to which the relation was truncated during that range of WAL records, or otherwise InvalidBlockNumber. In addition, it stores any blocks which have been modified during that range of WAL records, but excluding blocks which were removed by truncation after they were modified and which were never modified thereafter. In other words, it tells us which blocks need to be copied in case of an incremental backup covering that range of WAL records. To take an incremental backup, you use the new replication command UPLOAD_MANIFEST to upload the manifest for the prior backup. This prior backup could either be a full backup or another incremental backup. You then use BASE_BACKUP with the INCREMENTAL option to take the backup. pg_basebackup now has an --incremental=PATH_TO_MANIFEST option to trigger this behavior. An incremental backup is like a regular full backup except that some relation files are replaced with files with names like INCREMENTAL.${ORIGINAL_NAME}, and the backup_label file contains additional lines identifying it as an incremental backup. 
The new pg_combinebackup tool can be used to reconstruct a data directory from a full backup and a series of incremental backups. XXX. It would be nice if we could do something about incremental JSON parsing. XXX. This needs a lot of work on documentation and tests. Patch by me. Thanks to Dilip Kumar and Andres Freund for some helpful design discussions. --- doc/src/sgml/monitoring.sgml | 17 + src/backend/access/transam/xlog.c | 97 +- src/backend/access/transam/xlogbackup.c | 10 + src/backend/access/transam/xlogrecovery.c | 10 +- src/backend/backup/Makefile | 5 +- src/backend/backup/basebackup.c | 340 +++- src/backend/backup/basebackup_incremental.c | 867 ++++++++++ src/backend/backup/meson.build | 3 + src/backend/backup/walsummary.c | 356 +++++ src/backend/backup/walsummaryfuncs.c | 169 ++ src/backend/postmaster/Makefile | 1 + src/backend/postmaster/auxprocess.c | 8 + src/backend/postmaster/meson.build | 1 + src/backend/postmaster/postmaster.c | 53 + src/backend/postmaster/walsummarizer.c | 1414 +++++++++++++++++ src/backend/replication/repl_gram.y | 14 +- src/backend/replication/repl_scanner.l | 2 + src/backend/replication/walsender.c | 162 +- src/backend/storage/ipc/ipci.c | 3 + src/backend/storage/lmgr/lwlocknames.txt | 1 + src/backend/utils/activity/pgstat_io.c | 4 +- src/backend/utils/activity/wait_event.c | 15 + src/backend/utils/init/miscinit.c | 3 + src/backend/utils/misc/guc_tables.c | 29 + src/backend/utils/misc/postgresql.conf.sample | 5 + src/bin/Makefile | 1 + src/bin/initdb/initdb.c | 1 + src/bin/meson.build | 1 + src/bin/pg_basebackup/bbstreamer_file.c | 1 + src/bin/pg_basebackup/pg_basebackup.c | 108 +- src/bin/pg_basebackup/t/010_pg_basebackup.pl | 4 +- src/bin/pg_combinebackup/.gitignore | 1 + src/bin/pg_combinebackup/Makefile | 46 + src/bin/pg_combinebackup/backup_label.c | 281 ++++ src/bin/pg_combinebackup/backup_label.h | 29 + src/bin/pg_combinebackup/copy_file.c | 169 ++ src/bin/pg_combinebackup/copy_file.h | 19 + 
src/bin/pg_combinebackup/load_manifest.c | 245 +++ src/bin/pg_combinebackup/load_manifest.h | 67 + src/bin/pg_combinebackup/meson.build | 29 + src/bin/pg_combinebackup/pg_combinebackup.c | 1268 +++++++++++++++ src/bin/pg_combinebackup/reconstruct.c | 618 +++++++ src/bin/pg_combinebackup/reconstruct.h | 32 + src/bin/pg_combinebackup/write_manifest.c | 293 ++++ src/bin/pg_combinebackup/write_manifest.h | 33 + src/bin/pg_resetwal/pg_resetwal.c | 36 + src/common/Makefile | 1 + src/common/blkreftable.c | 1309 +++++++++++++++ src/common/meson.build | 1 + src/include/access/xlog.h | 1 + src/include/access/xlogbackup.h | 2 + src/include/backup/basebackup.h | 5 +- src/include/backup/basebackup_incremental.h | 56 + src/include/backup/walsummary.h | 49 + src/include/catalog/pg_proc.dat | 19 + src/include/common/blkreftable.h | 120 ++ src/include/miscadmin.h | 3 + src/include/nodes/replnodes.h | 9 + src/include/postmaster/walsummarizer.h | 31 + src/include/storage/proc.h | 9 +- src/include/utils/guc_tables.h | 1 + src/include/utils/wait_event.h | 7 +- src/test/recovery/t/001_stream_rep.pl | 2 + src/test/recovery/t/019_replslot_limit.pl | 3 + .../t/035_standby_logical_decoding.pl | 1 + src/tools/pgindent/typedefs.list | 24 + 66 files changed, 8454 insertions(+), 70 deletions(-) create mode 100644 src/backend/backup/basebackup_incremental.c create mode 100644 src/backend/backup/walsummary.c create mode 100644 src/backend/backup/walsummaryfuncs.c create mode 100644 src/backend/postmaster/walsummarizer.c create mode 100644 src/bin/pg_combinebackup/.gitignore create mode 100644 src/bin/pg_combinebackup/Makefile create mode 100644 src/bin/pg_combinebackup/backup_label.c create mode 100644 src/bin/pg_combinebackup/backup_label.h create mode 100644 src/bin/pg_combinebackup/copy_file.c create mode 100644 src/bin/pg_combinebackup/copy_file.h create mode 100644 src/bin/pg_combinebackup/load_manifest.c create mode 100644 src/bin/pg_combinebackup/load_manifest.h create mode 100644 
src/bin/pg_combinebackup/meson.build create mode 100644 src/bin/pg_combinebackup/pg_combinebackup.c create mode 100644 src/bin/pg_combinebackup/reconstruct.c create mode 100644 src/bin/pg_combinebackup/reconstruct.h create mode 100644 src/bin/pg_combinebackup/write_manifest.c create mode 100644 src/bin/pg_combinebackup/write_manifest.h create mode 100644 src/common/blkreftable.c create mode 100644 src/include/backup/basebackup_incremental.h create mode 100644 src/include/backup/walsummary.h create mode 100644 src/include/common/blkreftable.h create mode 100644 src/include/postmaster/walsummarizer.h diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml index 5cfdc70c03..97809a73f6 100644 --- a/doc/src/sgml/monitoring.sgml +++ b/doc/src/sgml/monitoring.sgml @@ -1161,6 +1161,10 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser WalSenderMain Waiting in main loop of WAL sender process. + + WalSummarizeWAL + Waiting in WAL summarizer process for new WAL to be written. + WalWriterMain Waiting in main loop of WAL writer process. @@ -1591,6 +1595,14 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser Waiting for a read from a timeline history file during a walsender timeline command. + + WalSummaryRead + Waiting to read from a WAL summary file. + + + WalSummaryWrite + Waiting to write to a WAL summary file. + WALSync Waiting for a WAL file to reach durable storage. @@ -2357,6 +2369,11 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser Waiting to acquire an exclusive lock to truncate off any empty pages at the end of a table vacuumed. + + WalSummarizerError + Waiting to retry after recovering from an error in the + WAL summarizer process. 
+ diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 664d4ba598..6c66d5118b 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -77,6 +77,7 @@ #include "port/pg_iovec.h" #include "postmaster/bgwriter.h" #include "postmaster/startup.h" +#include "postmaster/walsummarizer.h" #include "postmaster/walwriter.h" #include "replication/logical.h" #include "replication/origin.h" @@ -3477,6 +3478,43 @@ XLogGetLastRemovedSegno(void) return lastRemovedSegNo; } +/* + * Return the oldest WAL segment on the given TLI that still exists in + * XLOGDIR, or 0 if none. + */ +XLogSegNo +XLogGetOldestSegno(TimeLineID tli) +{ + DIR *xldir; + struct dirent *xlde; + XLogSegNo oldest_segno = 0; + + xldir = AllocateDir(XLOGDIR); + while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL) + { + TimeLineID file_tli; + XLogSegNo file_segno; + + /* Ignore files that are not XLOG segments */ + if (!IsXLogFileName(xlde->d_name)) + continue; + + /* Parse filename to get TLI and segno. */ + XLogFromFileName(xlde->d_name, &file_tli, &file_segno, + wal_segment_size); + + /* Ignore anything that's not from the TLI of interest. */ + if (tli != file_tli) + continue; + + /* If it's the oldest so far, update oldest_segno. */ + if (oldest_segno == 0 || file_segno < oldest_segno) + oldest_segno = file_segno; + } + + FreeDir(xldir); + return oldest_segno; +} /* * Update the last removed segno pointer in shared memory, to reflect that the @@ -3756,8 +3794,8 @@ RemoveXlogFile(const struct dirent *segment_de, } /* - * Verify whether pg_wal and pg_wal/archive_status exist. - * If the latter does not exist, recreate it. + * Verify whether pg_wal, pg_wal/archive_status, and pg_wal/summaries exist. + * If the latter do not exist, recreate them. 
* * It is not the goal of this function to verify the contents of these * directories, but to help in cases where someone has performed a cluster @@ -3800,6 +3838,26 @@ ValidateXLOGDirectoryStructure(void) (errmsg("could not create missing directory \"%s\": %m", path))); } + + /* Check for summaries */ + snprintf(path, MAXPGPATH, XLOGDIR "/summaries"); + if (stat(path, &stat_buf) == 0) + { + /* Check for weird cases where it exists but isn't a directory */ + if (!S_ISDIR(stat_buf.st_mode)) + ereport(FATAL, + (errmsg("required WAL directory \"%s\" does not exist", + path))); + } + else + { + ereport(LOG, + (errmsg("creating missing WAL directory \"%s\"", path))); + if (MakePGDirectory(path) < 0) + ereport(FATAL, + (errmsg("could not create missing directory \"%s\": %m", + path))); + } } /* @@ -5123,9 +5181,9 @@ StartupXLOG(void) #endif /* - * Verify that pg_wal and pg_wal/archive_status exist. In cases where - * someone has performed a copy for PITR, these directories may have been - * excluded and need to be re-created. + * Verify that pg_wal, pg_wal/archive_status, and pg_wal/summaries exist. + * In cases where someone has performed a copy for PITR, these directories + * may have been excluded and need to be re-created. */ ValidateXLOGDirectoryStructure(); @@ -6802,6 +6860,17 @@ CreateCheckPoint(int flags) */ END_CRIT_SECTION(); + /* + * If there hasn't been much system activity in a while, the WAL + * summarizer may be sleeping for relatively long periods, which could + * delay an incremental backup that has started concurrently. In the hopes + * of avoiding that, poke the WAL summarizer here. + * + * Possibly this should instead be done at some earlier point in this + * function, but it's not clear that it matters much. + */ + SetWalSummarizerLatch(); + /* * Let smgr do post-checkpoint cleanup (eg, deleting old files). 
*/ @@ -7476,6 +7545,20 @@ KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo) } } + /* + * If WAL summarization is in use, don't remove WAL that has yet to be + * summarized. + */ + keep = GetOldestUnsummarizedLSN(NULL, NULL); + if (keep != InvalidXLogRecPtr) + { + XLogSegNo unsummarized_segno; + + XLByteToSeg(keep, unsummarized_segno, wal_segment_size); + if (unsummarized_segno < segno) + segno = unsummarized_segno; + } + /* but, keep at least wal_keep_size if that's set */ if (wal_keep_size_mb > 0) { @@ -8462,8 +8545,8 @@ do_pg_backup_start(const char *backupidstr, bool fast, List **tablespaces, /* * Try to parse the directory name as an unsigned integer. * - * Tablespace directories should be positive integers that can - * be represented in 32 bits, with no leading zeroes or trailing + * Tablespace directories should be positive integers that can be + * represented in 32 bits, with no leading zeroes or trailing * garbage. If we come across a name that doesn't meet those * criteria, skip it. 
*/ diff --git a/src/backend/access/transam/xlogbackup.c b/src/backend/access/transam/xlogbackup.c index 23461c9d2c..3ad6b679d5 100644 --- a/src/backend/access/transam/xlogbackup.c +++ b/src/backend/access/transam/xlogbackup.c @@ -77,6 +77,16 @@ build_backup_content(BackupState *state, bool ishistoryfile) appendStringInfo(result, "STOP TIMELINE: %u\n", state->stoptli); } + /* either both istartpoint and istarttli should be set, or neither */ + Assert(XLogRecPtrIsInvalid(state->istartpoint) == (state->istarttli == 0)); + if (!XLogRecPtrIsInvalid(state->istartpoint)) + { + appendStringInfo(result, "INCREMENTAL FROM LSN: %X/%X\n", + LSN_FORMAT_ARGS(state->istartpoint)); + appendStringInfo(result, "INCREMENTAL FROM TLI: %u\n", + state->istarttli); + } + data = result->data; pfree(result); diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c index 4ff4430006..89ddec5bf9 100644 --- a/src/backend/access/transam/xlogrecovery.c +++ b/src/backend/access/transam/xlogrecovery.c @@ -1284,6 +1284,12 @@ read_backup_label(XLogRecPtr *checkPointLoc, TimeLineID *backupLabelTLI, tli_from_file, BACKUP_LABEL_FILE))); } + if (fscanf(lfp, "INCREMENTAL FROM LSN: %X/%X\n", &hi, &lo) > 0) + ereport(FATAL, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("this is an incremental backup, not a data directory"), + errhint("Use pg_combinebackup to reconstruct a valid data directory."))); + if (ferror(lfp) || FreeFile(lfp)) ereport(FATAL, (errcode_for_file_access(), @@ -1340,7 +1346,7 @@ read_tablespace_map(List **tablespaces) { if (!was_backslash && (ch == '\n' || ch == '\r')) { - char *endp; + char *endp; if (i == 0) continue; /* \r immediately followed by \n */ @@ -1363,7 +1369,7 @@ read_tablespace_map(List **tablespaces) ti = palloc0(sizeof(tablespaceinfo)); errno = 0; ti->oid = strtoul(str, &endp, 10); - if (*endp != '\0' || errno == EINVAL || errno == ERANGE) + if (*endp != '\0' || errno == EINVAL || errno == ERANGE) ereport(FATAL, 
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("invalid data in file \"%s\"", TABLESPACE_MAP))); diff --git a/src/backend/backup/Makefile b/src/backend/backup/Makefile index b21bd8ff43..751e6d3d5e 100644 --- a/src/backend/backup/Makefile +++ b/src/backend/backup/Makefile @@ -19,12 +19,15 @@ OBJS = \ basebackup.o \ basebackup_copy.o \ basebackup_gzip.o \ + basebackup_incremental.o \ basebackup_lz4.o \ basebackup_zstd.o \ basebackup_progress.o \ basebackup_server.o \ basebackup_sink.o \ basebackup_target.o \ - basebackup_throttle.o + basebackup_throttle.o \ + walsummary.o \ + walsummaryfuncs.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/backup/basebackup.c b/src/backend/backup/basebackup.c index 64ab54fe06..8aea2a4a76 100644 --- a/src/backend/backup/basebackup.c +++ b/src/backend/backup/basebackup.c @@ -20,8 +20,10 @@ #include "access/xlogbackup.h" #include "backup/backup_manifest.h" #include "backup/basebackup.h" +#include "backup/basebackup_incremental.h" #include "backup/basebackup_sink.h" #include "backup/basebackup_target.h" +#include "catalog/pg_tablespace_d.h" #include "commands/defrem.h" #include "common/compression.h" #include "common/file_perm.h" @@ -64,6 +66,7 @@ typedef struct bool fastcheckpoint; bool nowait; bool includewal; + bool incremental; uint32 maxrate; bool sendtblspcmapfile; bool send_to_client; @@ -75,22 +78,37 @@ typedef struct pg_checksum_type manifest_checksum_type; } basebackup_options; +typedef struct +{ + const char *filename; + pg_checksum_context *checksum_ctx; + bbsink *sink; + size_t bytes_sent; +} FileChunkContext; + static int64 sendTablespace(bbsink *sink, char *path, Oid spcoid, bool sizeonly, - struct backup_manifest_info *manifest); + struct backup_manifest_info *manifest, + IncrementalBackupInfo *ib); static int64 sendDir(bbsink *sink, const char *path, int basepathlen, bool sizeonly, List *tablespaces, bool sendtblspclinks, - backup_manifest_info *manifest, Oid spcoid); + 
backup_manifest_info *manifest, Oid spcoid, + IncrementalBackupInfo *ib); static bool sendFile(bbsink *sink, const char *readfilename, const char *tarfilename, struct stat *statbuf, bool missing_ok, Oid dboid, Oid spcoid, RelFileNumber relfilenumber, unsigned segno, - backup_manifest_info *manifest); + backup_manifest_info *manifest, + unsigned num_incremental_blocks, + BlockNumber *incremental_blocks, + unsigned truncation_block_length); static off_t read_file_data_into_buffer(bbsink *sink, const char *readfilename, int fd, off_t offset, size_t length, BlockNumber blkno, bool verify_checksum, int *checksum_failures); +static void push_to_sink(bbsink *sink, pg_checksum_context *checksum_ctx, + size_t *bytes_done, void *data, size_t length); static bool verify_page_checksum(Page page, XLogRecPtr start_lsn, BlockNumber blkno, uint16 *expected_checksum); @@ -102,7 +120,8 @@ static int64 _tarWriteHeader(bbsink *sink, const char *filename, bool sizeonly); static void _tarWritePadding(bbsink *sink, int len); static void convert_link_to_directory(const char *pathbuf, struct stat *statbuf); -static void perform_base_backup(basebackup_options *opt, bbsink *sink); +static void perform_base_backup(basebackup_options *opt, bbsink *sink, + IncrementalBackupInfo *ib); static void parse_basebackup_options(List *options, basebackup_options *opt); static int compareWalFileNames(const ListCell *a, const ListCell *b); static int basebackup_read_file(int fd, char *buf, size_t nbytes, off_t offset, @@ -220,7 +239,8 @@ static const struct exclude_list_item excludeFiles[] = * clobbered by longjmp" from stupider versions of gcc. 
*/ static void -perform_base_backup(basebackup_options *opt, bbsink *sink) +perform_base_backup(basebackup_options *opt, bbsink *sink, + IncrementalBackupInfo *ib) { bbsink_state state; XLogRecPtr endptr; @@ -270,6 +290,10 @@ perform_base_backup(basebackup_options *opt, bbsink *sink) ListCell *lc; tablespaceinfo *newti; + /* If this is an incremental backup, execute preparatory steps. */ + if (ib != NULL) + PrepareForIncrementalBackup(ib, backup_state); + /* Add a node for the base directory at the end */ newti = palloc0(sizeof(tablespaceinfo)); newti->size = -1; @@ -289,10 +313,10 @@ perform_base_backup(basebackup_options *opt, bbsink *sink) if (tmp->path == NULL) tmp->size = sendDir(sink, ".", 1, true, state.tablespaces, - true, NULL, InvalidOid); + true, NULL, InvalidOid, NULL); else tmp->size = sendTablespace(sink, tmp->path, tmp->oid, true, - NULL); + NULL, NULL); state.bytes_total += tmp->size; } state.bytes_total_is_valid = true; @@ -330,7 +354,7 @@ perform_base_backup(basebackup_options *opt, bbsink *sink) /* Then the bulk of the files... */ sendDir(sink, ".", 1, false, state.tablespaces, - sendtblspclinks, &manifest, InvalidOid); + sendtblspclinks, &manifest, InvalidOid, ib); /* ... and pg_control after everything else. 
*/ if (lstat(XLOG_CONTROL_FILE, &statbuf) != 0) @@ -340,7 +364,7 @@ perform_base_backup(basebackup_options *opt, bbsink *sink) XLOG_CONTROL_FILE))); sendFile(sink, XLOG_CONTROL_FILE, XLOG_CONTROL_FILE, &statbuf, false, InvalidOid, InvalidOid, - InvalidRelFileNumber, 0, &manifest); + InvalidRelFileNumber, 0, &manifest, 0, NULL, 0); } else { @@ -348,7 +372,7 @@ perform_base_backup(basebackup_options *opt, bbsink *sink) bbsink_begin_archive(sink, archive_name); - sendTablespace(sink, ti->path, ti->oid, false, &manifest); + sendTablespace(sink, ti->path, ti->oid, false, &manifest, ib); } /* @@ -610,7 +634,7 @@ perform_base_backup(basebackup_options *opt, bbsink *sink) sendFile(sink, pathbuf, pathbuf, &statbuf, false, InvalidOid, InvalidOid, InvalidRelFileNumber, 0, - &manifest); + &manifest, 0, NULL, 0); /* unconditionally mark file as archived */ StatusFilePath(pathbuf, fname, ".done"); @@ -686,6 +710,7 @@ parse_basebackup_options(List *options, basebackup_options *opt) bool o_checkpoint = false; bool o_nowait = false; bool o_wal = false; + bool o_incremental = false; bool o_maxrate = false; bool o_tablespace_map = false; bool o_noverify_checksums = false; @@ -764,6 +789,15 @@ parse_basebackup_options(List *options, basebackup_options *opt) opt->includewal = defGetBoolean(defel); o_wal = true; } + else if (strcmp(defel->defname, "incremental") == 0) + { + if (o_incremental) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("duplicate option \"%s\"", defel->defname))); + opt->incremental = defGetBoolean(defel); + o_incremental = true; + } else if (strcmp(defel->defname, "max_rate") == 0) { int64 maxrate; @@ -956,7 +990,7 @@ parse_basebackup_options(List *options, basebackup_options *opt) * the filesystem, bypassing the buffer cache. 
*/ void -SendBaseBackup(BaseBackupCmd *cmd) +SendBaseBackup(BaseBackupCmd *cmd, IncrementalBackupInfo *ib) { basebackup_options opt; bbsink *sink; @@ -980,6 +1014,20 @@ SendBaseBackup(BaseBackupCmd *cmd) set_ps_display(activitymsg); } + /* + * If we're asked to perform an incremental backup and the user has not + * supplied a manifest, that's an ERROR. + * + * If we're asked to perform a full backup and the user did supply a + * manifest, just ignore it. + */ + if (!opt.incremental) + ib = NULL; + else if (ib == NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("must UPLOAD_MANIFEST before performing an incremental BASE_BACKUP"))); + /* * If the target is specifically 'client' then set up to stream the backup * to the client; otherwise, it's being sent someplace else and should not @@ -1011,7 +1059,7 @@ SendBaseBackup(BaseBackupCmd *cmd) */ PG_TRY(); { - perform_base_backup(&opt, sink); + perform_base_backup(&opt, sink, ib); } PG_FINALLY(); { @@ -1086,7 +1134,7 @@ sendFileWithContent(bbsink *sink, const char *filename, const char *content, */ static int64 sendTablespace(bbsink *sink, char *path, Oid spcoid, bool sizeonly, - backup_manifest_info *manifest) + backup_manifest_info *manifest, IncrementalBackupInfo *ib) { int64 size; char pathbuf[MAXPGPATH]; @@ -1120,7 +1168,7 @@ sendTablespace(bbsink *sink, char *path, Oid spcoid, bool sizeonly, /* Send all the files in the tablespace version directory */ size += sendDir(sink, pathbuf, strlen(path), sizeonly, NIL, true, manifest, - spcoid); + spcoid, ib); return size; } @@ -1140,7 +1188,7 @@ sendTablespace(bbsink *sink, char *path, Oid spcoid, bool sizeonly, static int64 sendDir(bbsink *sink, const char *path, int basepathlen, bool sizeonly, List *tablespaces, bool sendtblspclinks, backup_manifest_info *manifest, - Oid spcoid) + Oid spcoid, IncrementalBackupInfo *ib) { DIR *dir; struct dirent *de; @@ -1148,7 +1196,8 @@ sendDir(bbsink *sink, const char *path, int basepathlen, bool 
sizeonly, struct stat statbuf; int64 size = 0; const char *lastDir; /* Split last dir from parent path. */ - bool isRelationDir = false; /* Does directory contain relations? */ + bool isRelationDir = false; /* Does directory contain relations? */ + bool isGlobalDir = false; Oid dboid = InvalidOid; /* @@ -1182,14 +1231,17 @@ sendDir(bbsink *sink, const char *path, int basepathlen, bool sizeonly, } } else if (strcmp(path, "./global") == 0) + { isRelationDir = true; + isGlobalDir = true; + } dir = AllocateDir(path); while ((de = ReadDir(dir, path)) != NULL) { int excludeIdx; bool excludeFound; - RelFileNumber relfilenumber = InvalidRelFileNumber; + RelFileNumber relfilenumber = InvalidRelFileNumber; ForkNumber relForkNum = InvalidForkNumber; unsigned segno = 0; bool isRelationFile = false; @@ -1256,9 +1308,8 @@ sendDir(bbsink *sink, const char *path, int basepathlen, bool sizeonly, char initForkFile[MAXPGPATH]; /* - * If any other type of fork, check if there is an init fork - * with the same RelFileNumber. If so, the file can be - * excluded. + * If any other type of fork, check if there is an init fork with + * the same RelFileNumber. If so, the file can be excluded. */ snprintf(initForkFile, sizeof(initForkFile), "%s/%u_init", path, relfilenumber); @@ -1332,11 +1383,13 @@ sendDir(bbsink *sink, const char *path, int basepathlen, bool sizeonly, &statbuf, sizeonly); /* - * Also send archive_status directory (by hackishly reusing - * statbuf from above ...). + * Also send archive_status and summaries directories (by + * hackishly reusing statbuf from above ...). 
*/ size += _tarWriteHeader(sink, "./pg_wal/archive_status", NULL, &statbuf, sizeonly); + size += _tarWriteHeader(sink, "./pg_wal/summaries", NULL, + &statbuf, sizeonly); continue; /* don't recurse into pg_wal */ } @@ -1405,27 +1458,79 @@ sendDir(bbsink *sink, const char *path, int basepathlen, bool sizeonly, if (!skip_this_dir) size += sendDir(sink, pathbuf, basepathlen, sizeonly, tablespaces, - sendtblspclinks, manifest, spcoid); + sendtblspclinks, manifest, spcoid, ib); } else if (S_ISREG(statbuf.st_mode)) { bool sent = false; + unsigned num_blocks_required = 0; + unsigned truncation_block_length = 0; + BlockNumber relative_block_numbers[RELSEG_SIZE]; + char tarfilenamebuf[MAXPGPATH * 2]; + char *tarfilename = pathbuf + basepathlen + 1; + FileBackupMethod method = BACK_UP_FILE_FULLY; + + if (ib != NULL && isRelationFile) + { + Oid relspcoid; + char *lookup_path; + + if (OidIsValid(spcoid)) + { + relspcoid = spcoid; + lookup_path = psprintf("pg_tblspc/%u/%s", spcoid, + pathbuf + basepathlen + 1); + } + else + { + if (isGlobalDir) + relspcoid = GLOBALTABLESPACE_OID; + else + relspcoid = DEFAULTTABLESPACE_OID; + lookup_path = pstrdup(pathbuf + basepathlen + 1); + } - if (!sizeonly) - sent = sendFile(sink, pathbuf, pathbuf + basepathlen + 1, &statbuf, - true, dboid, spcoid, - relfilenumber, segno, manifest); + method = GetFileBackupMethod(ib, lookup_path, dboid, relspcoid, + relfilenumber, relForkNum, + segno, statbuf.st_size, + &num_blocks_required, + relative_block_numbers, + &truncation_block_length); + if (method == BACK_UP_FILE_INCREMENTALLY) + { + statbuf.st_size = + GetIncrementalFileSize(num_blocks_required); + snprintf(tarfilenamebuf, sizeof(tarfilenamebuf), + "%s/INCREMENTAL.%s", + path + basepathlen + 1, + de->d_name); + tarfilename = tarfilenamebuf; + } + + pfree(lookup_path); + } - if (sent || sizeonly) + if (method != DO_NOT_BACK_UP_FILE) { - /* Add size. 
*/ - size += statbuf.st_size; + if (!sizeonly) + sent = sendFile(sink, pathbuf, tarfilename, &statbuf, + true, dboid, spcoid, + relfilenumber, segno, manifest, + num_blocks_required, + method == BACK_UP_FILE_INCREMENTALLY ? relative_block_numbers : NULL, + truncation_block_length); + + if (sent || sizeonly) + { + /* Add size. */ + size += statbuf.st_size; - /* Pad to a multiple of the tar block size. */ - size += tarPaddingBytesRequired(statbuf.st_size); + /* Pad to a multiple of the tar block size. */ + size += tarPaddingBytesRequired(statbuf.st_size); - /* Size of the header for the file. */ - size += TAR_BLOCK_SIZE; + /* Size of the header for the file. */ + size += TAR_BLOCK_SIZE; + } } } else @@ -1444,6 +1549,12 @@ sendDir(bbsink *sink, const char *path, int basepathlen, bool sizeonly, * If dboid is anything other than InvalidOid then any checksum failures * detected will get reported to the cumulative stats system. * + * If the file is to be set incrementally, then num_incremental_blocks + * should be the number of blocks to be sent, and incremental_blocks + * an array of block numbers relative to the start of the current segment. + * If the whole file is to be sent, then incremental_blocks should be NULL, + * and num_incremental_blocks can have any value, as it will be ignored. + * * Returns true if the file was successfully sent, false if 'missing_ok', * and the file did not exist. 
*/ @@ -1451,7 +1562,8 @@ static bool sendFile(bbsink *sink, const char *readfilename, const char *tarfilename, struct stat *statbuf, bool missing_ok, Oid dboid, Oid spcoid, RelFileNumber relfilenumber, unsigned segno, - backup_manifest_info *manifest) + backup_manifest_info *manifest, unsigned num_incremental_blocks, + BlockNumber *incremental_blocks, unsigned truncation_block_length) { int fd; BlockNumber blkno = 0; @@ -1460,6 +1572,7 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename, pgoff_t bytes_done = 0; bool verify_checksum = false; pg_checksum_context checksum_ctx; + int ibindex = 0; if (pg_checksum_init(&checksum_ctx, manifest->checksum_type) < 0) elog(ERROR, "could not initialize checksum of file \"%s\"", @@ -1492,22 +1605,111 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename, RelFileNumberIsValid(relfilenumber)) verify_checksum = true; + /* + * If we're sending an incremental file, write the file header. + */ + if (incremental_blocks != NULL) + { + unsigned magic = INCREMENTAL_MAGIC; + size_t header_bytes_done = 0; + + /* Emit header data. */ + push_to_sink(sink, &checksum_ctx, &header_bytes_done, + &magic, sizeof(magic)); + push_to_sink(sink, &checksum_ctx, &header_bytes_done, + &num_incremental_blocks, sizeof(num_incremental_blocks)); + push_to_sink(sink, &checksum_ctx, &header_bytes_done, + &truncation_block_length, sizeof(truncation_block_length)); + push_to_sink(sink, &checksum_ctx, &header_bytes_done, + incremental_blocks, + sizeof(BlockNumber) * num_incremental_blocks); + + /* Flush out any data still in the buffer so it's again empty. */ + if (header_bytes_done > 0) + { + bbsink_archive_contents(sink, header_bytes_done); + if (pg_checksum_update(&checksum_ctx, + (uint8 *) sink->bbs_buffer, + header_bytes_done) < 0) + elog(ERROR, "could not update checksum of base backup"); + } + + /* Update our notion of file position. 
*/ + bytes_done += sizeof(magic); + bytes_done += sizeof(num_incremental_blocks); + bytes_done += sizeof(truncation_block_length); + bytes_done += sizeof(BlockNumber) * num_incremental_blocks; + } + /* * Loop until we read the amount of data the caller told us to expect. The * file could be longer, if it was extended while we were sending it, but * for a base backup we can ignore such extended data. It will be restored * from WAL. */ - while (bytes_done < statbuf->st_size) + while (1) { - size_t remaining = statbuf->st_size - bytes_done; + /* + * Determine whether we've read all the data that we need, and if not, + * read some more. + */ + if (incremental_blocks == NULL) + { + size_t remaining = statbuf->st_size - bytes_done; + + /* + * If we've read the required number of bytes, then it's time to + * stop. + */ + if (bytes_done >= statbuf->st_size) + break; + + /* + * Read as many bytes as will fit in the buffer, or however many + * are left to read, whichever is less. + */ + cnt = read_file_data_into_buffer(sink, readfilename, fd, + bytes_done, remaining, + blkno + segno * RELSEG_SIZE, + verify_checksum, + &checksum_failures); + } + else + { + BlockNumber relative_blkno; + + /* + * If we've read all the blocks, then it's time to stop. + */ + if (ibindex >= num_incremental_blocks) + break; + + /* + * Read just one block, whichever one is the next that we're + * supposed to include. + */ + relative_blkno = incremental_blocks[ibindex++]; + cnt = read_file_data_into_buffer(sink, readfilename, fd, + relative_blkno * BLCKSZ, + BLCKSZ, + relative_blkno + segno * RELSEG_SIZE, + verify_checksum, + &checksum_failures); - /* Try to read some more data. */ - cnt = read_file_data_into_buffer(sink, readfilename, fd, bytes_done, - remaining, - blkno + segno * RELSEG_SIZE, - verify_checksum, - &checksum_failures); + /* + * If we get a partial read, that must mean that the relation is + * being truncated. 
Ultimately, it should be truncated to a + * multiple of BLCKSZ, since this path should only be reached for + * relation files, but we might transiently observe an + * intermediate value. + * + * It should be fine to treat this just as if the entire block had + * been truncated away - i.e. fill this and all later blocks with + * zeroes. WAL replay will fix things up. + */ + if (cnt < BLCKSZ) + break; + } /* * If the amount of data we were able to read was not a multiple of @@ -1690,6 +1892,56 @@ read_file_data_into_buffer(bbsink *sink, const char *readfilename, int fd, return cnt; } +/* + * Push data into a bbsink. + * + * It's better, when possible, to read data directly into the bbsink's buffer, + * rather than using this function to copy it into the buffer; this function is + * for cases where that approach is not practical. + * + * bytes_done should point to a count of the number of bytes that are + * currently used in the bbsink's buffer. Upon return, the bytes identified by + * data and length will have been copied into the bbsink's buffer, flushing + * as required, and *bytes_done will have been updated accordingly. If the + * buffer was flushed, the previous contents will also have been fed to + * checksum_ctx. + * + * Note that after one or more calls to this function it is the caller's + * responsibility to perform any required final flush. + */ +static void +push_to_sink(bbsink *sink, pg_checksum_context *checksum_ctx, + size_t *bytes_done, void *data, size_t length) +{ + while (length > 0) + { + size_t bytes_to_copy; + + /* + * We use < here rather than <= so that if the data exactly fills the + * remaining buffer space, we trigger a flush now. + */ + if (length < sink->bbs_buffer_length - *bytes_done) + { + /* Append remaining data to buffer. */ + memcpy(sink->bbs_buffer + *bytes_done, data, length); + *bytes_done += length; + return; + } + + /* Copy until buffer is full and flush it. 
*/ + bytes_to_copy = sink->bbs_buffer_length - *bytes_done; + memcpy(sink->bbs_buffer + *bytes_done, data, bytes_to_copy); + data = ((char *) data) + bytes_to_copy; + length -= bytes_to_copy; + bbsink_archive_contents(sink, sink->bbs_buffer_length); + if (pg_checksum_update(checksum_ctx, (uint8 *) sink->bbs_buffer, + sink->bbs_buffer_length) < 0) + elog(ERROR, "could not update checksum"); + *bytes_done = 0; + } +} + /* * Try to verify the checksum for the provided page, if it seems appropriate * to do so. diff --git a/src/backend/backup/basebackup_incremental.c b/src/backend/backup/basebackup_incremental.c new file mode 100644 index 0000000000..b70eeb0282 --- /dev/null +++ b/src/backend/backup/basebackup_incremental.c @@ -0,0 +1,867 @@ +/*------------------------------------------------------------------------- + * + * basebackup_incremental.c + * code for incremental backup support + * + * This code isn't actually in charge of taking an incremental backup; + * the actual construction of the incremental backup happens in + * basebackup.c. Here, we're concerned with providing the necessary + * supports for that operation. In particular, we need to parse the + * backup manifest supplied by the user taking the incremental backup + * and extract the required information from it. 
+ * + * Portions Copyright (c) 2010-2022, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/backup/basebackup_incremental.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/timeline.h" +#include "access/xlog.h" +#include "access/xlogrecovery.h" +#include "backup/basebackup_incremental.h" +#include "backup/walsummary.h" +#include "common/blkreftable.h" +#include "common/parse_manifest.h" +#include "common/hashfn.h" +#include "postmaster/walsummarizer.h" + +#define BLOCKS_PER_READ 512 + +typedef struct +{ + TimeLineID tli; + XLogRecPtr start_lsn; + XLogRecPtr end_lsn; +} backup_wal_range; + +typedef struct +{ + uint32 status; + const char *path; + size_t size; +} backup_file_entry; + +static uint32 hash_string_pointer(const char *s); +#define SH_PREFIX backup_file +#define SH_ELEMENT_TYPE backup_file_entry +#define SH_KEY_TYPE const char * +#define SH_KEY path +#define SH_HASH_KEY(tb, key) hash_string_pointer(key) +#define SH_EQUAL(tb, a, b) (strcmp(a, b) == 0) +#define SH_SCOPE static inline +#define SH_DECLARE +#define SH_DEFINE +#include "lib/simplehash.h" + +struct IncrementalBackupInfo +{ + /* Memory context for this object and its subsidiary objects. */ + MemoryContext mcxt; + + /* Temporary buffer for storing the manifest while parsing it. */ + StringInfoData buf; + + /* WAL ranges extracted from the backup manifest. */ + List *manifest_wal_ranges; + + /* + * Files extracted from the backup manifest. + * + * We don't really need this information, because we use WAL summaries to + * figure what's changed. It would be unsafe to just rely on the list of + * files that existed before, because it's possible for a file to be + * removed and a new one created with the same name and different + * contents. In such cases, the whole file must still be sent. We can tell + * from the WAL summaries whether that happened, but not from the file + * list. 
+ * + * Nonetheless, this data is useful for sanity checking. If a file that we + * think we shouldn't need to send is not present in the manifest for the + * prior backup, something has gone terribly wrong. We retain the file + * names and sizes, but not the checksums or last modified times, for + * which we have no use. + * + * One significant downside of storing this data is that it consumes + * memory. If that turns out to be a problem, we might have to decide not + * to retain this information, or to make it optional. + */ + backup_file_hash *manifest_files; + + /* + * Block-reference table for the incremental backup. + * + * It's possible that storing the entire block-reference table in memory + * will be a problem for some users. The in-memory format that we're using + * here is pretty efficient, converging to little more than 1 bit per + * block for relation forks with large numbers of modified blocks. It's + * possible, however, that if you try to perform an incremental backup of + * a database with a sufficiently large number of relations on a + * sufficiently small machine, you could run out of memory here. If that + * turns out to be a problem in practice, we'll need to be more clever. + */ + BlockRefTable *brtab; +}; + +static void manifest_process_file(JsonManifestParseContext *, + char *pathname, + size_t size, + pg_checksum_type checksum_type, + int checksum_length, + uint8 *checksum_payload); +static void manifest_process_wal_range(JsonManifestParseContext *, + TimeLineID tli, + XLogRecPtr start_lsn, + XLogRecPtr end_lsn); +static void manifest_report_error(JsonManifestParseContext *ib, + const char *fmt,...) + pg_attribute_printf(2, 3) pg_attribute_noreturn(); + +/* + * Create a new object for storing information extracted from the manifest + * supplied when creating an incremental backup. 
+ */ +IncrementalBackupInfo * +CreateIncrementalBackupInfo(MemoryContext mcxt) +{ + IncrementalBackupInfo *ib; + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(mcxt); + + ib = palloc0(sizeof(IncrementalBackupInfo)); + ib->mcxt = mcxt; + initStringInfo(&ib->buf); + + /* + * It's hard to guess how many files a "typical" installation will have in + * the data directory, but a fresh initdb creates almost 1000 files as of + * this writing, so it seems to make sense for our estimate to + * substantially higher. + */ + ib->manifest_files = backup_file_create(mcxt, 10000, NULL); + + MemoryContextSwitchTo(oldcontext); + + return ib; +} + +/* + * Before taking an incremental backup, the caller must supply the backup + * manifest from a prior backup. Each chunk of manifest data recieved + * from the client should be passed to this function. + */ +void +AppendIncrementalManifestData(IncrementalBackupInfo *ib, const char *data, + int len) +{ + MemoryContext oldcontext; + + /* Switch to our memory context. */ + oldcontext = MemoryContextSwitchTo(ib->mcxt); + + /* + * XXX. Our json parser is at present incapable of parsing json blobs + * incrementally, so we have to accumulate the entire backup manifest + * before we can do anything with it. This should really be fixed, since + * some users might have very large numbers of files in the data + * directory. + */ + appendBinaryStringInfo(&ib->buf, data, len); + + /* Switch back to previous memory context. */ + MemoryContextSwitchTo(oldcontext); +} + +/* + * Finalize an IncrementalBackupInfo object after all manifest data has + * been supplied via calls to AppendIncrementalManifestData. + */ +void +FinalizeIncrementalManifest(IncrementalBackupInfo *ib) +{ + JsonManifestParseContext context; + MemoryContext oldcontext; + + /* Switch to our memory context. */ + oldcontext = MemoryContextSwitchTo(ib->mcxt); + + /* Parse the manifest. 
*/ + context.private_data = ib; + context.perfile_cb = manifest_process_file; + context.perwalrange_cb = manifest_process_wal_range; + context.error_cb = manifest_report_error; + json_parse_manifest(&context, ib->buf.data, ib->buf.len); + + /* Done with the buffer, so release memory. */ + pfree(ib->buf.data); + ib->buf.data = NULL; + + /* Switch back to previous memory context. */ + MemoryContextSwitchTo(oldcontext); +} + +/* + * Prepare to take an incremental backup. + * + * Before this function is called, AppendIncrementalManifestData and + * FinalizeIncrementalManifest should have already been called to pass all + * the manifest data to this object. + * + * This function performs sanity checks on the data extracted from the + * manifest and figures out for which WAL ranges we need summaries, and + * whether those summaries are available. Then, it reads and combines the + * data from those summary files. It also updates the backup_state with the + * reference TLI and LSN for the prior backup. + */ +void +PrepareForIncrementalBackup(IncrementalBackupInfo *ib, + BackupState *backup_state) +{ + MemoryContext oldcontext; + List *expectedTLEs; + List *all_wslist, + *required_wslist = NIL; + ListCell *lc; + TimeLineHistoryEntry **tlep; + int num_wal_ranges; + int i; + bool found_backup_start_tli = false; + TimeLineID earliest_wal_range_tli = 0; + XLogRecPtr earliest_wal_range_start_lsn; + TimeLineID latest_wal_range_tli = 0; + XLogRecPtr summarized_lsn; + + Assert(ib->buf.data == NULL); + + /* Switch to our memory context. */ + oldcontext = MemoryContextSwitchTo(ib->mcxt); + + /* + * Match up the TLIs that appear in the WAL ranges of the backup manifest + * with those that appear in this server's timeline history. We expect + * every backup_wal_range to match to a TimeLineHistoryEntry; if it does + * not, that's an error. 
+ * + * This loop also decides which of the WAL ranges is the manifest is most + * ancient and which one is the newest, according to the timeline history + * of this server, and stores TLIs of those WAL ranges into + * earliest_wal_range_tli and latest_wal_range_tli. It also updates + * earliest_wal_range_start_lsn to the start LSN of the WAL range for + * earliest_wal_range_tli. + * + * Note that the return value of readTimeLineHistory puts the latest + * timeline at the beginning of the list, not the end. Hence, the earliest + * TLI is the one that occurs nearest the end of the list returned by + * readTimeLineHistory, and the latest TLI is the one that occurs closest + * to the beginning. + */ + expectedTLEs = readTimeLineHistory(backup_state->starttli); + num_wal_ranges = list_length(ib->manifest_wal_ranges); + tlep = palloc0(num_wal_ranges * sizeof(TimeLineHistoryEntry *)); + for (i = 0; i < num_wal_ranges; ++i) + { + backup_wal_range *range = list_nth(ib->manifest_wal_ranges, i); + bool saw_earliest_wal_range_tli = false; + bool saw_latest_wal_range_tli = false; + + /* Search this server's history for this WAL range's TLI. */ + foreach(lc, expectedTLEs) + { + TimeLineHistoryEntry *tle = lfirst(lc); + + if (tle->tli == range->tli) + { + tlep[i] = tle; + break; + } + + if (tle->tli == earliest_wal_range_tli) + saw_earliest_wal_range_tli = true; + if (tle->tli == latest_wal_range_tli) + saw_latest_wal_range_tli = true; + } + + /* + * An incremental backup can only be taken relative to a backup that + * represents a previous state of this server. If the backup requires + * WAL from a timeline that's not in our history, that definitely + * isn't the case. 
+ */ + if (tlep[i] == NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("timeline %u found in manifest, but not in this server's history", + range->tli))); + + /* + * If we found this TLI in the server's history before encountering + * the latest TLI seen so far in the server's history, then this TLI + * is the latest one seen so far. + * + * If on the other hand we saw the earliest TLI seen so far before + * finding this TLI, this TLI is earlier than the earliest one seen so + * far. And if this is the first TLI for which we've searched, it's + * also the earliest one seen so far. + * + * On the first loop iteration, both things should necessarily be + * true. + */ + if (!saw_latest_wal_range_tli) + latest_wal_range_tli = range->tli; + if (earliest_wal_range_tli == 0 || saw_earliest_wal_range_tli) + { + earliest_wal_range_tli = range->tli; + earliest_wal_range_start_lsn = range->start_lsn; + } + } + + /* + * Propagate information about the prior backup into the backup_label that + * will be generated for this backup. + */ + backup_state->istartpoint = earliest_wal_range_start_lsn; + backup_state->istarttli = earliest_wal_range_tli; + + /* + * Sanity check start and end LSNs for the WAL ranges in the manifest. + * + * Commonly, there won't be any timeline switches during the prior backup + * at all, but if there are, they should happen at the same LSNs that this + * server switched timelines. + * + * Whether there are any timeline switches during the prior backup or not, + * the prior backup shouldn't require any WAL from a timeline prior to the + * start of that timeline. It also shouldn't require any WAL from later + * than the start of this backup. + * + * If any of these sanity checks fail, one possible explanation is that + * the user has generated WAL on the same timeline with the same LSNs more + * than once. 
For instance, if two standbys running on timeline 1 were + * both promoted and (due to a broken archiving setup) both selected new + * timeline ID 2, then it's possible that one of these checks might trip. + * + * Note that there are lots of ways for the user to do something very bad + * without tripping any of these checks, and they are not intended to be + * comprehensive. It's pretty hard to see how we could be certain of + * anything here. However, if there's a problem staring us right in the + * face, it's best to report it, so we do. + */ + for (i = 0; i < num_wal_ranges; ++i) + { + backup_wal_range *range = list_nth(ib->manifest_wal_ranges, i); + + if (range->tli == earliest_wal_range_tli) + { + if (range->start_lsn < tlep[i]->begin) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("manifest requires WAL from initial timeline %u starting at %X/%X, but that timeline begins at %X/%X", + range->tli, + LSN_FORMAT_ARGS(range->start_lsn), + LSN_FORMAT_ARGS(tlep[i]->begin)))); + } + else + { + if (range->start_lsn != tlep[i]->begin) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("manifest requires WAL from continuation timeline %u starting at %X/%X, but that timeline begins at %X/%X", + range->tli, + LSN_FORMAT_ARGS(range->start_lsn), + LSN_FORMAT_ARGS(tlep[i]->begin)))); + } + + if (range->tli == latest_wal_range_tli) + { + if (range->end_lsn > backup_state->startpoint) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("manifest requires WAL from final timeline %u ending at %X/%X, but this backup starts at %X/%X", + range->tli, + LSN_FORMAT_ARGS(range->end_lsn), + LSN_FORMAT_ARGS(backup_state->startpoint)))); + } + else + { + if (range->end_lsn != tlep[i]->end) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("manifest requires WAL from non-final timeline %u ending at %X/%X, but this server switched timelines at %X/%X", + range->tli, + 
LSN_FORMAT_ARGS(range->end_lsn), + LSN_FORMAT_ARGS(tlep[i]->end)))); + } + + } + + /* + * Wait for WAL summarization to catch up to the backup start LSN (but + * time out if it doesn't do so quickly enough). + */ + /* XXX make timeout configurable */ + summarized_lsn = WaitForWalSummarization(backup_state->startpoint, 60000); + if (summarized_lsn < backup_state->startpoint) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("timeout waiting for WAL summarization"), + errdetail("This backup requires WAL to be summarized up to %X/%X, but summarizer has only reached %X/%X.", + LSN_FORMAT_ARGS(backup_state->startpoint), + LSN_FORMAT_ARGS(summarized_lsn)))); + + /* + * Retrieve a list of all WAL summaries on any timeline that overlap with + * the LSN range of interest. We could instead call GetWalSummaries() once + * per timeline in the loop that follows, but that would involve reading + * the directory multiple times. It should be mildly faster - and perhaps + * a bit safer - to do it just once. + */ + all_wslist = GetWalSummaries(0, earliest_wal_range_start_lsn, + backup_state->startpoint); + + /* + * We need WAL summaries for everything that happened during the prior + * backup and everything that happened afterward up until the point where + * the current backup started. + */ + foreach(lc, expectedTLEs) + { + TimeLineHistoryEntry *tle = lfirst(lc); + XLogRecPtr tli_start_lsn = tle->begin; + XLogRecPtr tli_end_lsn = tle->end; + XLogRecPtr tli_missing_lsn = InvalidXLogRecPtr; + List *tli_wslist; + + /* + * Working through the history of this server from the current + * timeline backwards, we skip everything until we find the timeline + * where this backup started. Most of the time, this means we won't + * skip anything at all, as it's unlikely that the timeline has + * changed since the beginning of the backup moments ago. 
+ */ + if (tle->tli == backup_state->starttli) + { + found_backup_start_tli = true; + tli_end_lsn = backup_state->startpoint; + } + else if (!found_backup_start_tli) + continue; + + /* + * Find the summaries that overlap the LSN range of interest for this + * timeline. If this is the earliest timeline involved, the range of + * interest begins with the start LSN of the prior backup; otherwise, + * it begins at the LSN at which this timeline came into existence. If + * this is the latest TLI involved, the range of interest ends at the + * start LSN of the current backup; otherwise, it ends at the point + * where we switched from this timeline to the next one. + */ + if (tle->tli == earliest_wal_range_tli) + tli_start_lsn = earliest_wal_range_start_lsn; + tli_wslist = FilterWalSummaries(all_wslist, tle->tli, + tli_start_lsn, tli_end_lsn); + + /* + * There is no guarantee that the WAL summaries we found cover the + * entire range of LSNs for which summaries are required, or indeed + * that we found any WAL summaries at all. Check whether we have a + * problem of that sort. + */ + if (!WalSummariesAreComplete(tli_wslist, tli_start_lsn, tli_end_lsn, + &tli_missing_lsn)) + { + if (XLogRecPtrIsInvalid(tli_missing_lsn)) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("WAL summaries are required on timeline %u from %X/%X to %X/%X, but no summaries for that timeline and LSN range exist", + tle->tli, + LSN_FORMAT_ARGS(tli_start_lsn), + LSN_FORMAT_ARGS(tli_end_lsn)))); + else + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("WAL summaries are required on timeline %u from %X/%X to %X/%X, but the summaries for that timeline and LSN range are incomplete", + tle->tli, + LSN_FORMAT_ARGS(tli_start_lsn), + LSN_FORMAT_ARGS(tli_end_lsn)), + errdetail("The first unsummarized LSN is this range is %X/%X.", + LSN_FORMAT_ARGS(tli_missing_lsn)))); + } + + /* + * Remember that we need to read these summaries. 
+ * + * Technically, it's possible that this could read more files than + * required, since tli_wslist in theory could contain redundant + * summaries. For instance, if we have a summary from 0/10000000 to + * 0/20000000 and also one from 0/00000000 to 0/30000000, then the + * latter subsumes the former and the former could be ignored. + * + * We ignore this possibility because the WAL summarizer only tries to + * generate summaries that do not overlap. If somehow they exist, + * we'll do a bit of extra work but the results should still be + * correct. + */ + required_wslist = list_concat(required_wslist, tli_wslist); + + /* + * Timelines earlier than the one in which the prior backup began are + * not relevant. + */ + if (tle->tli == earliest_wal_range_tli) + break; + } + + /* + * Read all of the required block reference table files and merge all of + * the data into a single in-memory block reference table. + * + * See the comments for struct IncrementalBackupInfo for some thoughts on + * memory usage. 
+ */ + ib->brtab = CreateEmptyBlockRefTable(); + foreach(lc, required_wslist) + { + WalSummaryFile *ws = lfirst(lc); + WalSummaryIO wsio; + BlockRefTableReader *reader; + RelFileLocator rlocator; + ForkNumber forknum; + BlockNumber limit_block; + BlockNumber blocks[BLOCKS_PER_READ]; + + wsio.file = OpenWalSummaryFile(ws, false); + wsio.filepos = 0; + ereport(DEBUG1, + (errmsg_internal("reading WAL summary file \"%s\"", + FilePathName(wsio.file)))); + reader = CreateBlockRefTableReader(ReadWalSummary, &wsio, + FilePathName(wsio.file), + ReportWalSummaryError, NULL); + while (BlockRefTableReaderNextRelation(reader, &rlocator, &forknum, + &limit_block)) + { + BlockRefTableSetLimitBlock(ib->brtab, &rlocator, + forknum, limit_block); + + while (1) + { + unsigned nblocks; + unsigned i; + + nblocks = BlockRefTableReaderGetBlocks(reader, blocks, + BLOCKS_PER_READ); + if (nblocks == 0) + break; + + for (i = 0; i < nblocks; ++i) + BlockRefTableMarkBlockModified(ib->brtab, &rlocator, + forknum, blocks[i]); + } + } + DestroyBlockRefTableReader(reader); + FileClose(wsio.file); + } + + /* Switch back to previous memory context. */ + MemoryContextSwitchTo(oldcontext); +} + +/* + * Get the pathname that should be used when a file is sent incrementally. + * + * The result is a palloc'd string. + */ +char * +GetIncrementalFilePath(Oid dboid, Oid spcoid, RelFileNumber relfilenumber, + ForkNumber forknum, unsigned segno) +{ + char *path; + char *lastslash; + char *ipath; + + path = GetRelationPath(dboid, spcoid, relfilenumber, InvalidBackendId, + forknum); + + lastslash = strrchr(path, '/'); + Assert(lastslash != NULL); + *lastslash = '\0'; + + if (segno > 0) + ipath = psprintf("%s/INCREMENTAL.%s.%u", path, lastslash + 1, segno); + else + ipath = psprintf("%s/INCREMENTAL.%s", path, lastslash + 1); + + pfree(path); + + return ipath; +} + +/* + * How should we back up a particular file as part of an incremental backup? 
+ * + * If the return value is BACK_UP_FILE_FULLY, caller should back up the whole + * file just as if this were not an incremental backup. + * + * If the return value is BACK_UP_FILE_INCREMENTALLY, caller should include + * an incremental file in the backup instead of the entire file. On return, + * *num_blocks_required will be set to the number of blocks that need to be + * sent, and the actual block numbers will have been stored in + * relative_block_numbers, which should be an array of at least RELSEG_SIZE. + * In addition, *truncation_block_length will be set to the value that should + * be included in the incremental file. + * + * If the return value is DO_NOT_BACK_UP_FILE, the caller should not include + * the file in the backup at all. + */ +FileBackupMethod +GetFileBackupMethod(IncrementalBackupInfo *ib, char *path, + Oid dboid, Oid spcoid, + RelFileNumber relfilenumber, ForkNumber forknum, + unsigned segno, size_t size, + unsigned *num_blocks_required, + BlockNumber *relative_block_numbers, + unsigned *truncation_block_length) +{ + BlockNumber absolute_block_numbers[RELSEG_SIZE]; + BlockNumber limit_block; + BlockNumber start_blkno; + BlockNumber stop_blkno; + RelFileLocator rlocator; + BlockRefTableEntry *brtentry; + unsigned i; + unsigned nblocks; + + /* Should only be called after PrepareForIncrementalBackup. */ + Assert(ib->buf.data == NULL); + + /* + * dboid could be InvalidOid if shared rel, but spcoid and relfilenumber + * should have legal values. + */ + Assert(OidIsValid(spcoid)); + Assert(RelFileNumberIsValid(relfilenumber)); + + /* + * If the file size is too large or not a multiple of BLCKSZ, then + * something weird is happening, so give up and send the whole file. + */ + if ((size % BLCKSZ) != 0 || size / BLCKSZ > RELSEG_SIZE) + return BACK_UP_FILE_FULLY; + + /* + * The free-space map fork is not properly WAL-logged, so we need to + * backup the entire file every time. 
+ */ + if (forknum == FSM_FORKNUM) + return BACK_UP_FILE_FULLY; + + /* + * Check whether this file is part of the prior backup. If it isn't, back + * up the whole file. + */ + if (backup_file_lookup(ib->manifest_files, path) == NULL) + { + char *ipath; + + ipath = GetIncrementalFilePath(dboid, spcoid, relfilenumber, + forknum, segno); + if (backup_file_lookup(ib->manifest_files, ipath) == NULL) + return BACK_UP_FILE_FULLY; + } + + /* Look up the block reference table entry. */ + rlocator.spcOid = spcoid; + rlocator.dbOid = dboid; + rlocator.relNumber = relfilenumber; + brtentry = BlockRefTableGetEntry(ib->brtab, &rlocator, forknum, + &limit_block); + + /* + * If there is no entry, then there have been no WAL-logged changes to the + * relation since the predecessor backup was taken, so we can back it up + * incrementally and need not include any modified blocks. + * + * However, if the file is zero-length, we should do a full backup, + * because an incremental file is always more than zero length, and it's + * silly to take an incremental backup when a full backup would be + * smaller. + */ + if (brtentry == NULL) + { + *num_blocks_required = 0; + *truncation_block_length = size / BLCKSZ; + if (size == 0) + return BACK_UP_FILE_FULLY; + return BACK_UP_FILE_INCREMENTALLY; + } + + /* + * If the limit_block is less than or equal to the point where this + * segment starts, send the whole file. + */ + if (limit_block <= segno * RELSEG_SIZE) + return BACK_UP_FILE_FULLY; + + /* + * Get relevant entries from the block reference table entry. + * + * We shouldn't overflow computing the start or stop block numbers, but if + * it manages to happen somehow, detect it and throw an error. 
+ */ + start_blkno = segno * RELSEG_SIZE; + stop_blkno = start_blkno + (size / BLCKSZ); + if (start_blkno / RELSEG_SIZE != segno || stop_blkno < start_blkno) + ereport(ERROR, + errcode(ERRCODE_INTERNAL_ERROR), + errmsg_internal("overflow computing block number bounds for segment %u with size %lu", + segno, size)); + nblocks = BlockRefTableEntryGetBlocks(brtentry, start_blkno, stop_blkno, + absolute_block_numbers, RELSEG_SIZE); + Assert(nblocks <= RELSEG_SIZE); + + /* + * If we're going to have to send nearly all of the blocks, then just send + * the whole file, because that won't require much extra storage or + * transfer and will speed up and simplify backup restoration. It's not + * clear what threshold is most appropriate here and perhaps it ought to + * be configurable, but for now we're just going to say that if we'd need + * to send 90% of the blocks anyway, give up and send the whole file. + * + * NB: If you change the threshold here, at least make sure to back up the + * file fully when every single block must be sent, because there's + * nothing good about sending an incremental file in that case. + */ + if (nblocks * BLCKSZ > size * 0.9) + return BACK_UP_FILE_FULLY; + + /* + * Looks like we can send an incremental file. + * + * Return the relevant details to the caller, transposing absolute block + * numbers to relative block numbers. + * + * The truncation block length is the minimum length of the reconstructed + * file. Any block numbers below this threshold that are not present in + * the backup need to be fetched from the prior backup. At or above this + * threshold, blocks should only be included in the result if they are + * present in the backup. (This may require inserting zero blocks if the + * blocks included in the backup are non-consecutive.) 
+ */ + for (i = 0; i < nblocks; ++i) + relative_block_numbers[i] = absolute_block_numbers[i] - start_blkno; + *num_blocks_required = nblocks; + *truncation_block_length = + Min(size / BLCKSZ, limit_block - segno * RELSEG_SIZE); + return BACK_UP_FILE_INCREMENTALLY; +} + +/* + * Compute the size for an incremental file containing a given number of blocks. + */ +extern size_t +GetIncrementalFileSize(unsigned num_blocks_required) +{ + size_t result; + + /* Make sure we're not going to overflow. */ + Assert(num_blocks_required <= RELSEG_SIZE); + + /* + * Three four byte quantities (magic number, truncation block length, + * block count) followed by block numbers followed by block contents. + */ + result = 3 * sizeof(uint32); + result += (BLCKSZ + sizeof(BlockNumber)) * num_blocks_required; + + return result; +} + +/* + * Helper function for filemap hash table. + */ +static uint32 +hash_string_pointer(const char *s) +{ + unsigned char *ss = (unsigned char *) s; + + return hash_bytes(ss, strlen(s)); +} + +/* + * This callback is invoked for each file mentioned in the backup manifest. + * + * We store the path to each file and the size of each file for sanity-checking + * purposes. For further details, see comments for IncrementalBackupInfo. + */ +static void +manifest_process_file(JsonManifestParseContext *context, + char *pathname, size_t size, + pg_checksum_type checksum_type, + int checksum_length, + uint8 *checksum_payload) +{ + IncrementalBackupInfo *ib = context->private_data; + backup_file_entry *entry; + bool found; + + entry = backup_file_insert(ib->manifest_files, pathname, &found); + if (!found) + { + entry->path = MemoryContextStrdup(ib->manifest_files->ctx, + pathname); + entry->size = size; + } +} + +/* + * This callback is invoked for each WAL range mentioned in the backup + * manifest. + * + * We're just interested in learning the oldest LSN and the corresponding TLI + * that appear in any WAL range. 
+ */ +static void +manifest_process_wal_range(JsonManifestParseContext *context, + TimeLineID tli, XLogRecPtr start_lsn, + XLogRecPtr end_lsn) +{ + IncrementalBackupInfo *ib = context->private_data; + backup_wal_range *range = palloc(sizeof(backup_wal_range)); + + range->tli = tli; + range->start_lsn = start_lsn; + range->end_lsn = end_lsn; + ib->manifest_wal_ranges = lappend(ib->manifest_wal_ranges, range); +} + +/* + * This callback is invoked if an error occurs while parsing the backup + * manifest. + */ +static void +manifest_report_error(JsonManifestParseContext *context, const char *fmt,...) +{ + StringInfoData errbuf; + + initStringInfo(&errbuf); + + for (;;) + { + va_list ap; + int needed; + + va_start(ap, fmt); + needed = appendStringInfoVA(&errbuf, fmt, ap); + va_end(ap); + if (needed == 0) + break; + enlargeStringInfo(&errbuf, needed); + } + + ereport(ERROR, + errmsg_internal("%s", errbuf.data)); +} diff --git a/src/backend/backup/meson.build b/src/backend/backup/meson.build index 11a79bbf80..1cace3b2fe 100644 --- a/src/backend/backup/meson.build +++ b/src/backend/backup/meson.build @@ -5,6 +5,7 @@ backend_sources += files( 'basebackup.c', 'basebackup_copy.c', 'basebackup_gzip.c', + 'basebackup_incremental.c', 'basebackup_lz4.c', 'basebackup_progress.c', 'basebackup_server.c', @@ -12,4 +13,6 @@ backend_sources += files( 'basebackup_target.c', 'basebackup_throttle.c', 'basebackup_zstd.c', + 'walsummary.o', + 'walsummaryfuncs.o' ) diff --git a/src/backend/backup/walsummary.c b/src/backend/backup/walsummary.c new file mode 100644 index 0000000000..ebf4ea038d --- /dev/null +++ b/src/backend/backup/walsummary.c @@ -0,0 +1,356 @@ +/*------------------------------------------------------------------------- + * + * walsummary.c + * Functions for accessing and managing WAL summary data. 
+ * + * Portions Copyright (c) 2010-2022, PostgreSQL Global Development Group + * + * src/backend/backup/walsummary.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include +#include + +#include "access/xlog_internal.h" +#include "backup/walsummary.h" +#include "utils/wait_event.h" + +static bool IsWalSummaryFilename(char *filename); +static int ListComparatorForWalSummaryFiles(const ListCell *a, + const ListCell *b); + +/* + * Get a list of WAL summaries. + * + * If tli != 0, only WAL summaries with the indicated TLI will be included. + * + * If start_lsn != InvalidXLogRecPtr, only summaries that end before the + * indicated LSN will be included. + * + * If end_lsn != InvalidXLogRecPtr, only summaries that start before the + * indicated LSN will be included. + * + * The intent is that you can call GetWalSummaries(tli, start_lsn, end_lsn) + * to get all WAL summaries on the indicated timeline that overlap the + * specified LSN range. + */ +List * +GetWalSummaries(TimeLineID tli, XLogRecPtr start_lsn, XLogRecPtr end_lsn) +{ + DIR *sdir; + struct dirent *dent; + List *result = NIL; + + sdir = AllocateDir(XLOGDIR "/summaries"); + while ((dent = ReadDir(sdir, XLOGDIR "/summaries")) != NULL) + { + WalSummaryFile *ws; + uint32 tmp[5]; + TimeLineID file_tli; + XLogRecPtr file_start_lsn; + XLogRecPtr file_end_lsn; + + /* Decode filename, or skip if it's not in the expected format. */ + if (!IsWalSummaryFilename(dent->d_name)) + continue; + sscanf(dent->d_name, "%08X%08X%08X%08X%08X", + &tmp[0], &tmp[1], &tmp[2], &tmp[3], &tmp[4]); + file_tli = tmp[0]; + file_start_lsn = ((uint64) tmp[1]) << 32 | tmp[2]; + file_end_lsn = ((uint64) tmp[3]) << 32 | tmp[4]; + + /* Skip if it doesn't match the filter criteria. 
+		 */
+		if (tli != 0 && tli != file_tli)
+			continue;
+		if (!XLogRecPtrIsInvalid(start_lsn) && start_lsn > file_end_lsn)
+			continue;
+		if (!XLogRecPtrIsInvalid(end_lsn) && end_lsn < file_start_lsn)
+			continue;
+
+		/* Add it to the list. */
+		ws = palloc(sizeof(WalSummaryFile));
+		ws->tli = file_tli;
+		ws->start_lsn = file_start_lsn;
+		ws->end_lsn = file_end_lsn;
+		result = lappend(result, ws);
+	}
+	FreeDir(sdir);
+
+	return result;
+}
+
+/*
+ * Build a new list of WAL summaries based on an existing list, but filtering
+ * out summaries that don't match the search parameters.
+ *
+ * If tli != 0, only WAL summaries with the indicated TLI will be included.
+ *
+ * If start_lsn != InvalidXLogRecPtr, only summaries that end after the
+ * indicated LSN will be included.
+ *
+ * If end_lsn != InvalidXLogRecPtr, only summaries that start before the
+ * indicated LSN will be included.
+ */
+List *
+FilterWalSummaries(List *wslist, TimeLineID tli,
+				   XLogRecPtr start_lsn, XLogRecPtr end_lsn)
+{
+	List	   *result = NIL;
+	ListCell   *lc;
+
+	/* Loop over input. */
+	foreach(lc, wslist)
+	{
+		WalSummaryFile *ws = lfirst(lc);
+
+		/* Skip if it doesn't match the filter criteria. */
+		if (tli != 0 && tli != ws->tli)
+			continue;
+		if (!XLogRecPtrIsInvalid(start_lsn) && start_lsn > ws->end_lsn)
+			continue;
+		if (!XLogRecPtrIsInvalid(end_lsn) && end_lsn < ws->start_lsn)
+			continue;
+
+		/* Add it to the result list. */
+		result = lappend(result, ws);
+	}
+
+	return result;
+}
+
+/*
+ * Check whether the supplied list of WalSummaryFile objects covers the
+ * whole range of LSNs from start_lsn to end_lsn.  This function ignores
+ * timelines, so the caller should probably filter using the appropriate
+ * timeline before calling this.
+ *
+ * If the whole range of LSNs is covered, returns true, otherwise false.
+ * If false is returned, *missing_lsn is set either to InvalidXLogRecPtr + * if there are no WAL summary files in the input list, or to the first LSN + * in the range that is not covered by a WAL summary file in the input list. + */ +bool +WalSummariesAreComplete(List *wslist, XLogRecPtr start_lsn, + XLogRecPtr end_lsn, XLogRecPtr *missing_lsn) +{ + XLogRecPtr current_lsn = start_lsn; + ListCell *lc; + + /* Special case for empty list. */ + if (wslist == NIL) + { + *missing_lsn = InvalidXLogRecPtr; + return false; + } + + /* Make a private copy of the list and sort it by start LSN. */ + wslist = list_copy(wslist); + list_sort(wslist, ListComparatorForWalSummaryFiles); + + /* + * Consider summary files in order of increasing start_lsn, advancing the + * known-summarized range from start_lsn toward end_lsn. + * + * Normally, the summary files should cover non-overlapping WAL ranges, + * but this algorithm is intended to be correct even in case of overlap. + */ + foreach(lc, wslist) + { + WalSummaryFile *ws = lfirst(lc); + + if (ws->start_lsn > current_lsn) + { + /* We found a gap. */ + break; + } + if (ws->end_lsn > current_lsn) + { + /* + * Next summary extends beyond end of previous summary, so extend + * the end of the range known to be summarized. + */ + current_lsn = ws->end_lsn; + + /* + * If the range we know to be summarized has reached the required + * end LSN, we have proved completeness. + */ + if (current_lsn >= end_lsn) + return true; + } + } + + /* + * We either ran out of summary files without reaching the end LSN, or we + * hit a gap in the sequence that resulted in us bailing out of the loop + * above. + */ + *missing_lsn = current_lsn; + return false; +} + +/* + * Open a WAL summary file. + * + * This will throw an error in case of trouble. As an exception, if + * missing_ok = true and the trouble is specifically that the file does + * not exist, it will not throw an error and will return a value less than 0. 
+ */
+File
+OpenWalSummaryFile(WalSummaryFile *ws, bool missing_ok)
+{
+	char		path[MAXPGPATH];
+	File		file;
+
+	snprintf(path, MAXPGPATH,
+			 XLOGDIR "/summaries/%08X%08X%08X%08X%08X.summary",
+			 ws->tli,
+			 LSN_FORMAT_ARGS(ws->start_lsn),
+			 LSN_FORMAT_ARGS(ws->end_lsn));
+
+	file = PathNameOpenFile(path, O_RDONLY);
+
+	/*
+	 * A nonexistent file is reported via ENOENT; only that failure may be
+	 * suppressed, and only when the caller said missing_ok.
+	 */
+	if (file < 0 && (errno != ENOENT || !missing_ok))
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not open file \"%s\": %m", path)));
+
+	return file;
+}
+
+/*
+ * Remove a WAL summary file if the last modification time precedes the
+ * cutoff time.
+ */
+void
+RemoveWalSummaryIfOlderThan(WalSummaryFile *ws, time_t cutoff_time)
+{
+	char		path[MAXPGPATH];
+	struct stat statbuf;
+
+	snprintf(path, MAXPGPATH,
+			 XLOGDIR "/summaries/%08X%08X%08X%08X%08X.summary",
+			 ws->tli,
+			 LSN_FORMAT_ARGS(ws->start_lsn),
+			 LSN_FORMAT_ARGS(ws->end_lsn));
+
+	if (lstat(path, &statbuf) != 0)
+	{
+		/* If someone else already removed it, that's fine. */
+		if (errno == ENOENT)
+			return;
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not stat file \"%s\": %m", path)));
+	}
+	if (statbuf.st_mtime >= cutoff_time)
+		return;
+	if (unlink(path) != 0)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not remove file \"%s\": %m", path)));
+	ereport(DEBUG2,
+			(errmsg_internal("removing file \"%s\"", path)));
+}
+
+/*
+ * Test whether a filename looks like a WAL summary file.
+ */
+static bool
+IsWalSummaryFilename(char *filename)
+{
+	return strspn(filename, "0123456789ABCDEF") == 40 &&
+		strcmp(filename + 40, ".summary") == 0;
+}
+
+/*
+ * Data read callback for use with CreateBlockRefTableReader.
+ */
+int
+ReadWalSummary(void *wal_summary_io, void *data, int length)
+{
+	WalSummaryIO *io = wal_summary_io;
+	int			nbytes;
+
+	nbytes = FileRead(io->file, data, length, io->filepos,
+					  WAIT_EVENT_WAL_SUMMARY_READ);
+	if (nbytes < 0)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not read file \"%s\": %m",
+						FilePathName(io->file))));
+
+	io->filepos += nbytes;
+	return nbytes;
+}
+
+/*
+ * Data write callback for use with WriteBlockRefTable.
+ */
+int
+WriteWalSummary(void *wal_summary_io, void *data, int length)
+{
+	WalSummaryIO *io = wal_summary_io;
+	int			nbytes;
+
+	nbytes = FileWrite(io->file, data, length, io->filepos,
+					   WAIT_EVENT_WAL_SUMMARY_WRITE);
+	if (nbytes < 0)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not write file \"%s\": %m",
+						FilePathName(io->file))));
+	if (nbytes != length)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not write file \"%s\": wrote only %d of %d bytes at offset %u",
+						FilePathName(io->file), nbytes,
+						length, (unsigned) io->filepos),
+				 errhint("Check free disk space.")));
+
+	io->filepos += nbytes;
+	return nbytes;
+}
+
+/*
+ * Error-reporting callback for use with CreateBlockRefTableReader.
+ */
+void
+ReportWalSummaryError(void *callback_arg, char *fmt,...)
+{
+	StringInfoData buf;
+	va_list		ap;
+	int			needed;
+
+	initStringInfo(&buf);
+	for (;;)
+	{
+		va_start(ap, fmt);
+		needed = appendStringInfoVA(&buf, fmt, ap);
+		va_end(ap);
+		if (needed == 0)
+			break;
+		enlargeStringInfo(&buf, needed);
+	}
+	ereport(ERROR,
+			errcode(ERRCODE_DATA_CORRUPTED),
+			errmsg_internal("%s", buf.data));
+}
+
+/*
+ * Comparator to sort a List of WalSummaryFile objects by start_lsn.
+ */ +static int +ListComparatorForWalSummaryFiles(const ListCell *a, const ListCell *b) +{ + WalSummaryFile *ws1 = lfirst(a); + WalSummaryFile *ws2 = lfirst(b); + + if (ws1->start_lsn < ws2->start_lsn) + return -1; + if (ws1->start_lsn > ws2->start_lsn) + return 1; + return 0; +} diff --git a/src/backend/backup/walsummaryfuncs.c b/src/backend/backup/walsummaryfuncs.c new file mode 100644 index 0000000000..2e77d38b4a --- /dev/null +++ b/src/backend/backup/walsummaryfuncs.c @@ -0,0 +1,169 @@ +/*------------------------------------------------------------------------- + * + * walsummaryfuncs.c + * SQL-callable functions for accessing WAL summary data. + * + * Portions Copyright (c) 2010-2022, PostgreSQL Global Development Group + * + * src/backend/backup/walsummaryfuncs.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "backup/walsummary.h" +#include "common/blkreftable.h" +#include "funcapi.h" +#include "miscadmin.h" +#include "utils/fmgrprotos.h" +#include "utils/pg_lsn.h" + +#define NUM_WS_ATTS 3 +#define NUM_SUMMARY_ATTS 6 +#define MAX_BLOCKS_PER_CALL 256 + +/* + * List the WAL summary files available in pg_wal/summaries. 
+ */
+Datum
+pg_available_wal_summaries(PG_FUNCTION_ARGS)
+{
+	ReturnSetInfo *rsi;
+	List	   *wslist;
+	ListCell   *lc;
+	Datum		values[NUM_WS_ATTS];
+	bool		nulls[NUM_WS_ATTS];
+
+	InitMaterializedSRF(fcinfo, 0);
+	rsi = (ReturnSetInfo *) fcinfo->resultinfo;
+
+	memset(nulls, 0, sizeof(nulls));
+
+	wslist = GetWalSummaries(0, InvalidXLogRecPtr, InvalidXLogRecPtr);
+	foreach(lc, wslist)
+	{
+		WalSummaryFile *ws = (WalSummaryFile *) lfirst(lc);
+		HeapTuple	tuple;
+
+		CHECK_FOR_INTERRUPTS();
+
+		values[0] = Int64GetDatum((int64) ws->tli);
+		values[1] = LSNGetDatum(ws->start_lsn);
+		values[2] = LSNGetDatum(ws->end_lsn);
+
+		tuple = heap_form_tuple(rsi->setDesc, values, nulls);
+		tuplestore_puttuple(rsi->setResult, tuple);
+	}
+
+	return (Datum) 0;
+}
+
+/*
+ * List the contents of a WAL summary file identified by TLI, start LSN,
+ * and end LSN.
+ */
+Datum
+pg_wal_summary_contents(PG_FUNCTION_ARGS)
+{
+	ReturnSetInfo *rsi;
+	Datum		values[NUM_SUMMARY_ATTS];
+	bool		nulls[NUM_SUMMARY_ATTS];
+	WalSummaryFile ws;
+	WalSummaryIO io;
+	BlockRefTableReader *reader;
+	int64		raw_tli;
+	RelFileLocator rlocator;
+	ForkNumber	forknum;
+	BlockNumber limit_block;
+
+	InitMaterializedSRF(fcinfo, 0);
+	rsi = (ReturnSetInfo *) fcinfo->resultinfo;
+	memset(nulls, 0, sizeof(nulls));
+
+	/*
+	 * Since the timeline could at least in theory be more than 2^31, and
+	 * since we don't have unsigned types at the SQL level, it is passed as a
+	 * 64-bit integer.  Test whether it's out of range.
+	 */
+	raw_tli = PG_GETARG_INT64(0);
+	if (raw_tli < 1 || raw_tli > PG_INT32_MAX)
+		ereport(ERROR,
+				errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				errmsg("invalid timeline %lld", (long long) raw_tli));
+
+	/* Prepare to read the specified WAL summary file.
+	 */
+	ws.tli = (TimeLineID) raw_tli;
+	ws.start_lsn = PG_GETARG_LSN(1);
+	ws.end_lsn = PG_GETARG_LSN(2);
+	io.filepos = 0;
+	io.file = OpenWalSummaryFile(&ws, false);
+	reader = CreateBlockRefTableReader(ReadWalSummary, &io,
+									   FilePathName(io.file),
+									   ReportWalSummaryError, NULL);
+
+	/* Loop over relation forks. */
+	while (BlockRefTableReaderNextRelation(reader, &rlocator, &forknum,
+										   &limit_block))
+	{
+		BlockNumber blocks[MAX_BLOCKS_PER_CALL];
+		HeapTuple	tuple;
+
+		CHECK_FOR_INTERRUPTS();
+
+		values[0] = ObjectIdGetDatum(rlocator.relNumber);
+		values[1] = ObjectIdGetDatum(rlocator.spcOid);
+		values[2] = ObjectIdGetDatum(rlocator.dbOid);
+		values[3] = Int16GetDatum((int16) forknum);
+
+		/* Loop over blocks within the current relation fork. */
+		while (true)
+		{
+			unsigned	nblocks;
+			unsigned	i;
+
+			CHECK_FOR_INTERRUPTS();
+
+			nblocks = BlockRefTableReaderGetBlocks(reader, blocks,
+												   MAX_BLOCKS_PER_CALL);
+			if (nblocks == 0)
+				break;
+
+			/*
+			 * For each block that we specifically know to have been modified,
+			 * emit a row with that block number and limit_block = false.
+			 */
+			values[5] = BoolGetDatum(false);
+			for (i = 0; i < nblocks; ++i)
+			{
+				values[4] = Int64GetDatum((int64) blocks[i]);
+
+				tuple = heap_form_tuple(rsi->setDesc, values, nulls);
+				tuplestore_puttuple(rsi->setResult, tuple);
+			}
+
+			/*
+			 * If the limit block is not InvalidBlockNumber, emit an extra row
+			 * with that block number and limit_block = true.
+			 *
+			 * There is no point in doing this when the limit_block is
+			 * InvalidBlockNumber, because no block with that number or any
+			 * higher number can ever exist.
+ */ + if (BlockNumberIsValid(limit_block)) + { + values[4] = Int64GetDatum((int64) limit_block); + values[5] = BoolGetDatum(true); + + tuple = heap_form_tuple(rsi->setDesc, values, nulls); + tuplestore_puttuple(rsi->setResult, tuple); + } + } + } + + /* Cleanup */ + DestroyBlockRefTableReader(reader); + FileClose(io.file); + + return (Datum) 0; +} diff --git a/src/backend/postmaster/Makefile b/src/backend/postmaster/Makefile index 047448b34e..367a46c617 100644 --- a/src/backend/postmaster/Makefile +++ b/src/backend/postmaster/Makefile @@ -24,6 +24,7 @@ OBJS = \ postmaster.o \ startup.o \ syslogger.o \ + walsummarizer.o \ walwriter.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/postmaster/auxprocess.c b/src/backend/postmaster/auxprocess.c index cae6feb356..0c15c1777d 100644 --- a/src/backend/postmaster/auxprocess.c +++ b/src/backend/postmaster/auxprocess.c @@ -21,6 +21,7 @@ #include "postmaster/auxprocess.h" #include "postmaster/bgwriter.h" #include "postmaster/startup.h" +#include "postmaster/walsummarizer.h" #include "postmaster/walwriter.h" #include "replication/walreceiver.h" #include "storage/bufmgr.h" @@ -80,6 +81,9 @@ AuxiliaryProcessMain(AuxProcType auxtype) case WalReceiverProcess: MyBackendType = B_WAL_RECEIVER; break; + case WalSummarizerProcess: + MyBackendType = B_WAL_SUMMARIZER; + break; default: elog(PANIC, "unrecognized process type: %d", (int) MyAuxProcType); MyBackendType = B_INVALID; @@ -161,6 +165,10 @@ AuxiliaryProcessMain(AuxProcType auxtype) WalReceiverMain(); proc_exit(1); + case WalSummarizerProcess: + WalSummarizerMain(); + proc_exit(1); + default: elog(PANIC, "unrecognized process type: %d", (int) MyAuxProcType); proc_exit(1); diff --git a/src/backend/postmaster/meson.build b/src/backend/postmaster/meson.build index cda921fd10..a30eb6692f 100644 --- a/src/backend/postmaster/meson.build +++ b/src/backend/postmaster/meson.build @@ -12,5 +12,6 @@ backend_sources += files( 'postmaster.c', 'startup.c', 'syslogger.c', + 
'walsummarizer.c', 'walwriter.c', ) diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index 4c49393fc5..c85ac19f4a 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -114,6 +114,7 @@ #include "postmaster/pgarch.h" #include "postmaster/postmaster.h" #include "postmaster/syslogger.h" +#include "postmaster/walsummarizer.h" #include "replication/logicallauncher.h" #include "replication/walsender.h" #include "storage/fd.h" @@ -251,6 +252,7 @@ static pid_t StartupPID = 0, CheckpointerPID = 0, WalWriterPID = 0, WalReceiverPID = 0, + WalSummarizerPID = 0, AutoVacPID = 0, PgArchPID = 0, SysLoggerPID = 0; @@ -442,6 +444,7 @@ static bool CreateOptsFile(int argc, char *argv[], char *fullprogname); static pid_t StartChildProcess(AuxProcType type); static void StartAutovacuumWorker(void); static void MaybeStartWalReceiver(void); +static void MaybeStartWalSummarizer(void); static void InitPostmasterDeathWatchHandle(void); /* @@ -562,6 +565,7 @@ static void ShmemBackendArrayRemove(Backend *bn); #define StartCheckpointer() StartChildProcess(CheckpointerProcess) #define StartWalWriter() StartChildProcess(WalWriterProcess) #define StartWalReceiver() StartChildProcess(WalReceiverProcess) +#define StartWalSummarizer() StartChildProcess(WalSummarizerProcess) /* Macros to check exit status of a child process */ #define EXIT_STATUS_0(st) ((st) == 0) @@ -1845,6 +1849,9 @@ ServerLoop(void) if (WalReceiverRequested) MaybeStartWalReceiver(); + /* If we need to start a WAL summarizer, try to do that now */ + MaybeStartWalSummarizer(); + /* Get other worker processes running, if needed */ if (StartWorkerNeeded || HaveCrashedWorker) maybe_start_bgworkers(); @@ -2736,6 +2743,8 @@ process_pm_reload_request(void) signal_child(WalWriterPID, SIGHUP); if (WalReceiverPID != 0) signal_child(WalReceiverPID, SIGHUP); + if (WalSummarizerPID != 0) + signal_child(WalSummarizerPID, SIGHUP); if (AutoVacPID != 0) 
signal_child(AutoVacPID, SIGHUP); if (PgArchPID != 0) @@ -3089,6 +3098,7 @@ process_pm_child_exit(void) BgWriterPID = StartBackgroundWriter(); if (WalWriterPID == 0) WalWriterPID = StartWalWriter(); + MaybeStartWalSummarizer(); /* * Likewise, start other special children as needed. In a restart @@ -3207,6 +3217,20 @@ process_pm_child_exit(void) continue; } + /* + * Was it the wal summarizer? Normal exit can be ignored; we'll start + * a new one at the next iteration of the postmaster's main loop, if + * necessary. Any other exit condition is treated as a crash. + */ + if (pid == WalSummarizerPID) + { + WalSummarizerPID = 0; + if (!EXIT_STATUS_0(exitstatus)) + HandleChildCrash(pid, exitstatus, + _("WAL summarizer process")); + continue; + } + /* * Was it the autovacuum launcher? Normal exit can be ignored; we'll * start a new one at the next iteration of the postmaster's main @@ -3602,6 +3626,12 @@ HandleChildCrash(int pid, int exitstatus, const char *procname) else if (WalReceiverPID != 0 && take_action) sigquit_child(WalReceiverPID); + /* Take care of the walsummarizer too */ + if (pid == WalSummarizerPID) + WalSummarizerPID = 0; + else if (WalSummarizerPID != 0 && take_action) + sigquit_child(WalSummarizerPID); + /* Take care of the autovacuum launcher too */ if (pid == AutoVacPID) AutoVacPID = 0; @@ -3752,6 +3782,8 @@ PostmasterStateMachine(void) signal_child(StartupPID, SIGTERM); if (WalReceiverPID != 0) signal_child(WalReceiverPID, SIGTERM); + if (WalSummarizerPID != 0) + signal_child(WalSummarizerPID, SIGTERM); /* checkpointer, archiver, stats, and syslogger may continue for now */ /* Now transition to PM_WAIT_BACKENDS state to wait for them to die */ @@ -3778,6 +3810,7 @@ PostmasterStateMachine(void) if (CountChildren(BACKEND_TYPE_ALL - BACKEND_TYPE_WALSND) == 0 && StartupPID == 0 && WalReceiverPID == 0 && + WalSummarizerPID == 0 && BgWriterPID == 0 && (CheckpointerPID == 0 || (!FatalError && Shutdown < ImmediateShutdown)) && @@ -3875,6 +3908,7 @@ 
PostmasterStateMachine(void) /* These other guys should be dead already */ Assert(StartupPID == 0); Assert(WalReceiverPID == 0); + Assert(WalSummarizerPID == 0); Assert(BgWriterPID == 0); Assert(CheckpointerPID == 0); Assert(WalWriterPID == 0); @@ -4096,6 +4130,8 @@ TerminateChildren(int signal) signal_child(WalWriterPID, signal); if (WalReceiverPID != 0) signal_child(WalReceiverPID, signal); + if (WalSummarizerPID != 0) + signal_child(WalSummarizerPID, signal); if (AutoVacPID != 0) signal_child(AutoVacPID, signal); if (PgArchPID != 0) @@ -5402,6 +5438,10 @@ StartChildProcess(AuxProcType type) ereport(LOG, (errmsg("could not fork WAL receiver process: %m"))); break; + case WalSummarizerProcess: + ereport(LOG, + (errmsg("could not fork WAL summarizer process: %m"))); + break; default: ereport(LOG, (errmsg("could not fork process: %m"))); @@ -5538,6 +5578,19 @@ MaybeStartWalReceiver(void) } } +/* + * MaybeStartWalSummarizer + * Start the WAL summarizer process, if not running and our state allows. + */ +static void +MaybeStartWalSummarizer(void) +{ + if (wal_summarize_mb != 0 && WalSummarizerPID == 0 && + (pmState == PM_RUN || pmState == PM_HOT_STANDBY) && + Shutdown <= SmartShutdown) + WalSummarizerPID = StartWalSummarizer(); +} + /* * Create the opts file diff --git a/src/backend/postmaster/walsummarizer.c b/src/backend/postmaster/walsummarizer.c new file mode 100644 index 0000000000..926b6c6ae4 --- /dev/null +++ b/src/backend/postmaster/walsummarizer.c @@ -0,0 +1,1414 @@ +/*------------------------------------------------------------------------- + * + * walsummarizer.c + * + * Background process to perform WAL summarization, if it is enabled. + * It continuously scans the write-ahead log and periodically emits a + * summary file which indicates which blocks in which relation forks + * were modified by WAL records in the LSN range covered by the summary + * file. 
See walsummary.c and blkreftable.c for more details on the + * naming and contents of WAL summary files. + * + * If configured to do, this background process will also remove WAL + * summary files when the file timestamp is older than a configurable + * threshold (but only if the WAL has been removed first). + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/postmaster/walsummarizer.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/timeline.h" +#include "access/xlog.h" +#include "access/xlog_internal.h" +#include "access/xlogrecovery.h" +#include "access/xlogutils.h" +#include "backup/walsummary.h" +#include "catalog/storage_xlog.h" +#include "common/blkreftable.h" +#include "libpq/pqsignal.h" +#include "miscadmin.h" +#include "postmaster/bgwriter.h" +#include "postmaster/interrupt.h" +#include "postmaster/walsummarizer.h" +#include "replication/walreceiver.h" +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/lwlock.h" +#include "storage/latch.h" +#include "storage/proc.h" +#include "storage/procsignal.h" +#include "storage/shmem.h" +#include "storage/spin.h" +#include "utils/guc.h" +#include "utils/memutils.h" +#include "utils/wait_event.h" + +/* + * Data in shared memory related to WAL summarization. + */ +typedef struct +{ + /* + * These fields are protected by WALSummarizerLock. + * + * Until we've discovered what summary files already exist on disk and + * stored that information in shared memory, initialized is false and the + * other fields here contain no meaningful information. After that has + * been done, initialized is true. + * + * summarized_tli and summarized_lsn indicate the last LSN and TLI at + * which the next summary file will start. Normally, these are the LSN + * and TLI at which the last file ended; in such case, lsn_is_exact is + * true. 
If, however, the LSN is just an approximation, then lsn_is_exact + * is false. This can happen if, for example, there are no existing WAL + * summary files at startup. In that case, we have to derive the position + * at which to start summarizing from the WAL files that exist on disk, + * and so the LSN might point to the start of the next file even though + * that might happen to be in the middle of a WAL record. + * + * summarizer_pgprocno is the pgprocno value for the summarizer process, + * if one is running, or else INVALID_PGPROCNO. + * + * pending_lsn is used by the summarizer to advertise the ending LSN of + * a record it has recently read. It shouldn't ever be less than + * summarized_lsn, but might be greater, because the summarizer buffers + * data for a range of LSNs in memory before writing out a new file. + * + * switch_requested can be set to true to notify the summarizer that a + * new WAL summary file should be written as soon as possible, without + * trying to read more WAL first. + */ + bool initialized; + TimeLineID summarized_tli; + XLogRecPtr summarized_lsn; + bool lsn_is_exact; + int summarizer_pgprocno; + XLogRecPtr pending_lsn; + bool switch_requested; + + /* + * This field handles its own synchronizaton. + */ + ConditionVariable summary_file_cv; +} WalSummarizerData; + +/* + * Private data for our xlogreader's page read callback. + */ +typedef struct +{ + TimeLineID tli; + bool historic; + XLogRecPtr read_upto; + bool end_of_wal; + bool waited; + XLogRecPtr redo_pointer; + bool redo_pointer_reached; + XLogRecPtr redo_pointer_refresh_lsn; +} SummarizerReadLocalXLogPrivate; + +/* Pointer to shared memory state. */ +static WalSummarizerData *WalSummarizerCtl; + +/* + * When we reach end of WAL and need to read more, we sleep for a number of + * milliseconds that is a integer multiple of MS_PER_SLEEP_QUANTUM. This is + * the multiplier. It should vary between 1 and MAX_SLEEP_QUANTA, depending + * on system activity. 
See summarizer_wait_for_wal() for how we adjust this. + */ +static long sleep_quanta = 1; + +/* + * The sleep time will always be a multiple of 200ms and will not exceed + * one minute (300 * 200 = 60 * 1000). + */ +#define MAX_SLEEP_QUANTA 300 +#define MS_PER_SLEEP_QUANTUM 200 + +/* + * This is a count of the number of pages of WAL that we've read since the + * last time we waited for more WAL to appear. + */ +static long pages_read_since_last_sleep = 0; + +/* + * Most recent RedoRecPtr value observed by MaybeRemoveOldWalSummaries. + */ +static XLogRecPtr redo_pointer_at_last_summary_removal = InvalidXLogRecPtr; + +/* + * GUC parameters + */ +int wal_summarize_mb = 256; +int wal_summarize_keep_time = 7 * 24 * 60; + +static XLogRecPtr GetLatestLSN(TimeLineID *tli); +static void HandleWalSummarizerInterrupts(void); +static XLogRecPtr SummarizeWAL(TimeLineID tli, bool historic, + XLogRecPtr start_lsn, bool exact, + XLogRecPtr cutoff_lsn, XLogRecPtr maximum_lsn); +static void SummarizeSmgrRecord(XLogReaderState *xlogreader, + BlockRefTable *brtab); +static void SummarizeXactRecord(XLogReaderState *xlogreader, + BlockRefTable *brtab); +static int summarizer_read_local_xlog_page(XLogReaderState *state, + XLogRecPtr targetPagePtr, + int reqLen, + XLogRecPtr targetRecPtr, + char *cur_page); +static void summarizer_wait_for_wal(void); +static void MaybeRemoveOldWalSummaries(void); + +/* + * Amount of shared memory required for this module. + */ +Size +WalSummarizerShmemSize(void) +{ + return sizeof(WalSummarizerData); +} + +/* + * Create or attach to shared memory segment for this module. + */ +void +WalSummarizerShmemInit(void) +{ + bool found; + + WalSummarizerCtl = (WalSummarizerData *) + ShmemInitStruct("Wal Summarizer Ctl", WalSummarizerShmemSize(), + &found); + + if (!found) + { + /* + * First time through, so initialize. 
+ * + * We're just filling in dummy values here -- the real initialization + * will happen when GetOldestUnsummarizedLSN() is called for the first + * time. + */ + WalSummarizerCtl->initialized = false; + WalSummarizerCtl->summarized_tli = 0; + WalSummarizerCtl->summarized_lsn = InvalidXLogRecPtr; + WalSummarizerCtl->lsn_is_exact = false; + WalSummarizerCtl->summarizer_pgprocno = INVALID_PGPROCNO; + WalSummarizerCtl->pending_lsn = InvalidXLogRecPtr; + WalSummarizerCtl->switch_requested = false; + ConditionVariableInit(&WalSummarizerCtl->summary_file_cv); + } +} + +/* + * Entry point for walsummarizer process. + */ +void +WalSummarizerMain(void) +{ + sigjmp_buf local_sigjmp_buf; + MemoryContext context; + + /* + * Within this function, 'current_lsn' and 'current_tli' refer to the + * point from which the next WAL summary file should start. 'exact' is + * true if 'current_lsn' is known to be the start of a WAL recod or WAL + * segment, and false if it might be in the middle of a record someplace. + * + * 'switch_lsn' and 'switch_tli', if set, are the LSN at which we need to + * switch to a new timeline and the timeline to which we need to switch. + * If not set, we either haven't figured out the answers yet or we're + * already on the latest timeline. + */ + XLogRecPtr current_lsn; + TimeLineID current_tli; + bool exact; + XLogRecPtr switch_lsn = InvalidXLogRecPtr; + TimeLineID switch_tli = 0; + + ereport(DEBUG1, + (errmsg_internal("WAL summarizer started"))); + + /* + * Properly accept or ignore signals the postmaster might send us + * + * We have no particular use for SIGINT at the moment, but seems + * reasonable to treat like SIGTERM. 
+ */ + pqsignal(SIGHUP, SignalHandlerForConfigReload); + pqsignal(SIGINT, SignalHandlerForShutdownRequest); + pqsignal(SIGTERM, SignalHandlerForShutdownRequest); + /* SIGQUIT handler was already set up by InitPostmasterChild */ + pqsignal(SIGALRM, SIG_IGN); + pqsignal(SIGPIPE, SIG_IGN); + pqsignal(SIGUSR1, procsignal_sigusr1_handler); + pqsignal(SIGUSR2, SIG_IGN); /* not used */ + + /* Advertise ourselves. */ + LWLockAcquire(WALSummarizerLock, LW_EXCLUSIVE); + WalSummarizerCtl->summarizer_pgprocno = MyProc->pgprocno; + LWLockRelease(WALSummarizerLock); + + /* Create and switch to a memory context that we can reset on error. */ + context = AllocSetContextCreate(TopMemoryContext, + "Wal Summarizer", + ALLOCSET_DEFAULT_SIZES); + MemoryContextSwitchTo(context); + + /* + * Reset some signals that are accepted by postmaster but not here + */ + pqsignal(SIGCHLD, SIG_DFL); + + /* + * If an exception is encountered, processing resumes here. + */ + if (sigsetjmp(local_sigjmp_buf, 1) != 0) + { + /* Since not using PG_TRY, must reset error stack by hand */ + error_context_stack = NULL; + + /* Prevent interrupts while cleaning up */ + HOLD_INTERRUPTS(); + + /* Report the error to the server log */ + EmitErrorReport(); + + /* Release resources we might have acquired. */ + LWLockReleaseAll(); + ConditionVariableCancelSleep(); + pgstat_report_wait_end(); + ReleaseAuxProcessResources(false); + AtEOXact_Files(false); + AtEOXact_HashTables(false); + + /* + * Now return to normal top-level context and clear ErrorContext for + * next time. + */ + MemoryContextSwitchTo(context); + FlushErrorState(); + + /* Flush any leaked data in the top-level context */ + MemoryContextResetAndDeleteChildren(context); + + /* Now we can allow interrupts again */ + RESUME_INTERRUPTS(); + + /* + * Sleep for 10 seconds before attempting to resume operations in + * order to avoid excessing logging. + * + * Many of the likely error conditions are things that will repeat + * every time. 
For example, if the WAL can't be read or the summary + * can't be written, only administrator action will cure the problem. + * So a really fast retry time doesn't seem to be especially + * beneficial, and it will clutter the logs. + */ + (void) WaitLatch(MyLatch, + WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + 10000, + WAIT_EVENT_WAL_SUMMARIZER_ERROR); + } + + /* We can now handle ereport(ERROR) */ + PG_exception_stack = &local_sigjmp_buf; + + /* + * Unblock signals (they were blocked when the postmaster forked us) + */ + sigprocmask(SIG_SETMASK, &UnBlockSig, NULL); + + /* + * Fetch information about previous progress from shared memory. + * + * If we discover that WAL summarization is not enabled, just exit. + */ + current_lsn = GetOldestUnsummarizedLSN(¤t_tli, &exact); + if (XLogRecPtrIsInvalid(current_lsn)) + proc_exit(0); + + /* + * Loop forever + */ + for (;;) + { + XLogRecPtr latest_lsn; + TimeLineID latest_tli; + XLogRecPtr cutoff_lsn; + XLogRecPtr end_of_summary_lsn; + + /* Flush any leaked data in the top-level context */ + MemoryContextResetAndDeleteChildren(context); + + /* Process any signals received recently. */ + HandleWalSummarizerInterrupts(); + + /* If it's time to remove any old WAL summaries, do that now. */ + MaybeRemoveOldWalSummaries(); + + /* Find the LSN and TLI up to which we can safely summarize. */ + latest_lsn = GetLatestLSN(&latest_tli); + + /* + * If we're summarizing a historic timeline and we haven't yet + * computed the point at which to switch to the next timeline, do that + * now. + * + * Note that if this is a standby, what was previously the current + * timeline could become historic at any time. + * + * We could try to make this more efficient by caching the results of + * readTimeLineHistory when latest_tli has not changed, but since we + * only have to do this once per timeline switch, we probably wouldn't + * save any significant amount of work in practice. 
+ */ + if (current_tli != latest_tli && XLogRecPtrIsInvalid(switch_lsn)) + { + List *tles = readTimeLineHistory(latest_tli); + + switch_lsn = tliSwitchPoint(current_tli, tles, &switch_tli); + elog(DEBUG2, + "switch point from TLI %u to TLI %u is at %X/%X", + current_tli, switch_tli, LSN_FORMAT_ARGS(switch_lsn)); + } + + /* + * wal_summarize_mb sets a soft limit on the amont of WAL covered + * by a single summary file. If we read a WAL record that ends after + * the cutoff LSN computed here, we'll stop the summary. In most cases, + * it will actually stop earlier than that, but this is here as a + * backstop. + */ + cutoff_lsn = current_lsn + wal_summarize_mb * 1024 * 1024; + if (!XLogRecPtrIsInvalid(switch_lsn) && cutoff_lsn > switch_lsn) + cutoff_lsn = switch_lsn; + elog(DEBUG2, + "WAL summarization cutoff is TLI %d @ %X/%X, flush position is %X/%X", + current_tli, LSN_FORMAT_ARGS(cutoff_lsn), LSN_FORMAT_ARGS(latest_lsn)); + + /* Summarize WAL. */ + end_of_summary_lsn = SummarizeWAL(current_tli, + current_tli != latest_tli, + current_lsn, exact, + cutoff_lsn, latest_lsn); + Assert(!XLogRecPtrIsInvalid(end_of_summary_lsn)); + Assert(end_of_summary_lsn >= current_lsn); + + /* + * Update state for next loop iteration. + * + * Next summary file should start from exactly where this one ended. + * Timeline remains unchanged unless a switch LSN was computed and we + * have reached it. + */ + current_lsn = end_of_summary_lsn; + exact = true; + if (!XLogRecPtrIsInvalid(switch_lsn) && cutoff_lsn >= switch_lsn) + { + current_tli = switch_tli; + switch_lsn = InvalidXLogRecPtr; + switch_tli = 0; + } + + /* Update state in shared memory. 
 */ + LWLockAcquire(WALSummarizerLock, LW_EXCLUSIVE); + Assert(WalSummarizerCtl->pending_lsn <= end_of_summary_lsn); + WalSummarizerCtl->summarized_lsn = end_of_summary_lsn; + WalSummarizerCtl->summarized_tli = current_tli; + WalSummarizerCtl->lsn_is_exact = true; + WalSummarizerCtl->pending_lsn = end_of_summary_lsn; + WalSummarizerCtl->switch_requested = false; + LWLockRelease(WALSummarizerLock); + + /* Wake up anyone waiting for more summary files to be written. */ + ConditionVariableBroadcast(&WalSummarizerCtl->summary_file_cv); + } +} + +/* + * Get the oldest LSN in this server's timeline history that has not yet been + * summarized. + * + * If tli != NULL, it will be set to the TLI for the LSN that is returned. + * + * If lsn_is_exact != NULL, it will be set to true if the returned LSN is + * necessarily the start of a WAL record and false if it's just the beginning + * of a WAL segment. + */ +XLogRecPtr +GetOldestUnsummarizedLSN(TimeLineID *tli, bool *lsn_is_exact) +{ + TimeLineID latest_tli; + LWLockMode mode = LW_SHARED; + int n; + List *tles; + XLogRecPtr unsummarized_lsn; + TimeLineID unsummarized_tli = 0; + bool should_make_exact = false; + List *existing_summaries; + ListCell *lc; + + /* If not summarizing WAL, do nothing. */ + if (wal_summarize_mb == 0) + return InvalidXLogRecPtr; + + /* + * Initially, we acquire the lock in shared mode and try to fetch the + * required information. If the data structure hasn't been initialized, we + * reacquire the lock in exclusive mode so that we can initialize it. + * However, if someone else does that first before we get the lock, then + * we can just return the requested information after all. 
+ */ + while (true) + { + LWLockAcquire(WALSummarizerLock, mode); + + if (WalSummarizerCtl->initialized) + { + unsummarized_lsn = WalSummarizerCtl->summarized_lsn; + if (tli != NULL) + *tli = WalSummarizerCtl->summarized_tli; + if (lsn_is_exact != NULL) + *lsn_is_exact = WalSummarizerCtl->lsn_is_exact; + LWLockRelease(WALSummarizerLock); + return unsummarized_lsn; + } + + if (mode == LW_EXCLUSIVE) + break; + + LWLockRelease(WALSummarizerLock); + mode = LW_EXCLUSIVE; + } + + /* + * The data structure needs to be initialized, and we are the first to + * obtain the lock in exclusive mode, so it's our job to do that + * initialization. + * + * So, find the oldest timeline on which WAL still exists, and the + * earliest segment for which it exists. + */ + (void) GetLatestLSN(&latest_tli); + tles = readTimeLineHistory(latest_tli); + for (n = list_length(tles) - 1; n >= 0; --n) + { + TimeLineHistoryEntry *tle = list_nth(tles, n); + XLogSegNo oldest_segno; + + oldest_segno = XLogGetOldestSegno(tle->tli); + if (oldest_segno != 0) + { + /* Compute oldest LSN that still exists on disk. */ + XLogSegNoOffsetToRecPtr(oldest_segno, 0, wal_segment_size, + unsummarized_lsn); + + unsummarized_tli = tle->tli; + break; + } + } + + /* It really should not be possible for us to find no WAL. */ + if (unsummarized_tli == 0) + ereport(ERROR, + errcode(ERRCODE_INTERNAL_ERROR), + errmsg_internal("no WAL found on timeline %d", latest_tli)); + + /* + * Don't try to summarize anything older than the end LSN of the newest + * summary file that exists for this timeline. + */ + existing_summaries = + GetWalSummaries(unsummarized_tli, + InvalidXLogRecPtr, InvalidXLogRecPtr); + foreach(lc, existing_summaries) + { + WalSummaryFile *ws = lfirst(lc); + + if (ws->end_lsn > unsummarized_lsn) + { + unsummarized_lsn = ws->end_lsn; + should_make_exact = true; + } + } + + /* Update shared memory with the discovered values. 
 */ + WalSummarizerCtl->initialized = true; + WalSummarizerCtl->summarized_lsn = unsummarized_lsn; + WalSummarizerCtl->summarized_tli = unsummarized_tli; + WalSummarizerCtl->lsn_is_exact = should_make_exact; + WalSummarizerCtl->pending_lsn = unsummarized_lsn; + + /* Also return the values to the caller as required. */ + if (tli != NULL) + *tli = WalSummarizerCtl->summarized_tli; + if (lsn_is_exact != NULL) + *lsn_is_exact = WalSummarizerCtl->lsn_is_exact; + LWLockRelease(WALSummarizerLock); + + return unsummarized_lsn; +} + +/* + * Attempt to set the WAL summarizer's latch. + * + * This might not work, because there's no guarantee that the WAL summarizer + * process was successfully started, and it also might have started but + * subsequently terminated. So, under normal circumstances, this will get the + * latch set, but there's no guarantee. + */ +void +SetWalSummarizerLatch(void) +{ + int pgprocno; + + LWLockAcquire(WALSummarizerLock, LW_EXCLUSIVE); + pgprocno = WalSummarizerCtl->summarizer_pgprocno; + LWLockRelease(WALSummarizerLock); + + if (pgprocno != INVALID_PGPROCNO) + SetLatch(&ProcGlobal->allProcs[pgprocno].procLatch); +} + +/* + * Wait until WAL summarization reaches the given LSN, but not longer than + * the given timeout. + * + * The return value is the first still-unsummarized LSN. If it's greater than + * or equal to the passed LSN, then that LSN was reached. If not, we timed out. + */ +XLogRecPtr +WaitForWalSummarization(XLogRecPtr lsn, long timeout) +{ + TimestampTz start_time = GetCurrentTimestamp(); + TimestampTz deadline = TimestampTzPlusMilliseconds(start_time, timeout); + XLogRecPtr summarized_lsn; + + Assert(!XLogRecPtrIsInvalid(lsn)); + Assert(timeout > 0); + + while (1) + { + TimestampTz now; + long remaining_timeout; + + /* + * If the LSN summarized on disk has reached the target value, stop. + * If it hasn't, but the in-memory value has reached the target value, + * request that a file be written as soon as possible. 
+ */ + LWLockAcquire(WALSummarizerLock, LW_EXCLUSIVE); + summarized_lsn = WalSummarizerCtl->summarized_lsn; + if (summarized_lsn < lsn && + WalSummarizerCtl->pending_lsn >= lsn) + WalSummarizerCtl->switch_requested = true; + LWLockRelease(WALSummarizerLock); + if (summarized_lsn >= lsn) + break; + + /* Timeout reached? If yes, stop. */ + now = GetCurrentTimestamp(); + remaining_timeout = TimestampDifferenceMilliseconds(now, deadline); + if (remaining_timeout <= 0) + break; + + /* + * Limit the sleep to 1 second, because we may need to request a + * switch. + */ + if (remaining_timeout > 1000) + remaining_timeout = 1000; + + /* Wait and see. */ + ConditionVariableTimedSleep(&WalSummarizerCtl->summary_file_cv, + remaining_timeout, + WAIT_EVENT_WAL_SUMMARY_READY); + } + + return summarized_lsn; +} + +/* + * Get the latest LSN that is eligible to be summarized, and set *tli to the + * corresponding timeline. + */ +static XLogRecPtr +GetLatestLSN(TimeLineID *tli) +{ + if (!RecoveryInProgress()) + { + /* Don't summarize WAL before it's flushed. */ + return GetFlushRecPtr(tli); + } + else + { + XLogRecPtr flush_lsn; + TimeLineID flush_tli; + XLogRecPtr replay_lsn; + TimeLineID replay_tli; + + /* + * What we really want to know is how much WAL has been flushed to + * disk, but the only flush position available is the one provided by + * the walreceiver, which may not be running, because this could be + * crash recovery or recovery via restore_command. So use either the + * WAL receiver's flush position or the replay position, whichever is + * further ahead, on the theory that if the WAL has been replayed then + * it must also have been flushed to disk. + */ + flush_lsn = GetWalRcvFlushRecPtr(NULL, &flush_tli); + replay_lsn = GetXLogReplayRecPtr(&replay_tli); + if (flush_lsn > replay_lsn) + { + *tli = flush_tli; + return flush_lsn; + } + else + { + *tli = replay_tli; + return replay_lsn; + } + } +} + +/* + * Interrupt handler for main loop of WAL summarizer process. 
+ */ +static void +HandleWalSummarizerInterrupts(void) +{ + if (ProcSignalBarrierPending) + ProcessProcSignalBarrier(); + + if (ConfigReloadPending) + { + ConfigReloadPending = false; + ProcessConfigFile(PGC_SIGHUP); + } + + if (ShutdownRequestPending || wal_summarize_mb == 0) + { + ereport(DEBUG1, + errmsg_internal("WAL summarizer shutting down")); + proc_exit(0); + } + + /* Perform logging of memory contexts of this process */ + if (LogMemoryContextPending) + ProcessLogMemoryContextInterrupt(); +} + +/* + * Summarize a range of WAL records on a single timeline. + * + * 'tli' is the timeline to be summarized. 'historic' should be false if the + * timeline in question is the latest one and true otherwise. + * + * 'start_lsn' is the point at which we should start summarizing. If this + * value comes from the end LSN of the previous record as returned by the + * xlogreader machinery, 'exact' should be true; otherwise, 'exact' should + * be false, and this function will search forward for the start of a valid + * WAL record. + * + * 'cutoff_lsn' is the point at which we should stop summarizing. The first + * record that ends at or after cutoff_lsn will be the last one included + * in the summary. + * + * 'maximum_lsn' identifies the point beyond which we can't count on being + * able to read any more WAL. It should be the switch point when reading a + * historic timeline, or the most-recently-measured end of WAL when reading + * the current timeline. + * + * The return value is the LSN at which the WAL summary actually ends. Most + * often, a summary file ends because we notice that a checkpoint has + * occurred and reach the redo pointer of that checkpoint, but sometimes + * we stop for other reasons, such as a timeline switch, or reading a record + * that ends after the cutoff_lsn. 
+ */ +static XLogRecPtr +SummarizeWAL(TimeLineID tli, bool historic, + XLogRecPtr start_lsn, bool exact, + XLogRecPtr cutoff_lsn, XLogRecPtr maximum_lsn) +{ + SummarizerReadLocalXLogPrivate *private_data; + XLogReaderState *xlogreader; + XLogRecPtr summary_start_lsn; + XLogRecPtr summary_end_lsn = cutoff_lsn; + char temp_path[MAXPGPATH]; + char final_path[MAXPGPATH]; + WalSummaryIO io; + BlockRefTable *brtab = CreateEmptyBlockRefTable(); + + /* Initialize private data for xlogreader. */ + private_data = (SummarizerReadLocalXLogPrivate *) + palloc0(sizeof(SummarizerReadLocalXLogPrivate)); + private_data->tli = tli; + private_data->historic = historic; + private_data->read_upto = maximum_lsn; + private_data->redo_pointer = GetRedoRecPtr(); + private_data->redo_pointer_refresh_lsn = start_lsn; + private_data->redo_pointer_reached = + (start_lsn >= private_data->redo_pointer); + + /* Create xlogreader. */ + xlogreader = XLogReaderAllocate(wal_segment_size, NULL, + XL_ROUTINE(.page_read = &summarizer_read_local_xlog_page, + .segment_open = &wal_segment_open, + .segment_close = &wal_segment_close), + private_data); + if (xlogreader == NULL) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"), + errdetail("Failed while allocating a WAL reading processor."))); + + /* + * When exact = false, we're starting from an arbitrary point in the WAL + * and must search forward for the start of the next record. + * + * When exact = true, start_lsn should be either the LSN where a record + * begins, or the LSN of a page where the page header is immediately + * followed by the start of a new record. XLogBeginRead should tolerate + * either case. + * + * We need to allow for both cases because the behavior of xlogreader + * varies. 
When a record spans two or more xlog pages, the ending LSN + * reported by xlogreader will be the starting LSN of the following + * record, but when an xlog page boundary falls between two records, the + * end LSN for the first will be reported as the first byte of the + * following page. We can't know until we read that page how large the + * header will be, but we'll have to skip over it to find the next record. + */ + if (exact) + { + /* + * Even if start_lsn is the beginning of a page rather than the + * beginning of the first record on that page, we should still use it + * as the start LSN for the summary file. That's because we detect + * missing summary files by looking for cases where the end LSN of one + * file is less than the start LSN of the next file. When only a page + * header is skipped, nothing has been missed. + */ + XLogBeginRead(xlogreader, start_lsn); + summary_start_lsn = start_lsn; + } + else + { + summary_start_lsn = XLogFindNextRecord(xlogreader, start_lsn); + if (XLogRecPtrIsInvalid(summary_start_lsn)) + { + /* + * If we hit end-of-WAL while trying to find the next valid + * record, we must be on a historic timeline that has no valid + * records that begin after start_lsn and before end of WAL. + */ + if (private_data->end_of_wal) + { + ereport(DEBUG1, + errmsg_internal("could not read WAL from timeline %d at %X/%X: end of WAL at %X/%X", + tli, + LSN_FORMAT_ARGS(start_lsn), + LSN_FORMAT_ARGS(private_data->read_upto))); + + /* + * The timeline ends at or after start_lsn, without containing + * any records. Thus, we must make sure the main loop does not + * iterate. If start_lsn is the end of the timeline, then we + * won't actually emit an empty summary file, but otherwise, + * we must, to capture the fact that the LSN range in question + * contains no interesting WAL records. 
+ */ + summary_start_lsn = start_lsn; + summary_end_lsn = private_data->read_upto; + cutoff_lsn = xlogreader->EndRecPtr; + } + else + ereport(ERROR, + (errmsg("could not find a valid record after %X/%X", + LSN_FORMAT_ARGS(start_lsn)))); + } + + /* We shouldn't go backward. */ + Assert(summary_start_lsn >= start_lsn); + } + + /* + * Main loop: read xlog records one by one. + */ + while (xlogreader->EndRecPtr < cutoff_lsn) + { + int block_id; + char *errormsg; + XLogRecord *record; + bool switch_requested; + + /* We shouldn't go backward. */ + Assert(summary_start_lsn <= xlogreader->EndRecPtr); + + /* + * This flag tracks whether the read of a particular record had to + * wait for more WAL to arrive, so reset it before reading the next + * record. + */ + private_data->waited = false; + + /* Now read the next record. */ + record = XLogReadRecord(xlogreader, &errormsg); + if (record == NULL) + { + SummarizerReadLocalXLogPrivate *private_data; + + private_data = (SummarizerReadLocalXLogPrivate *) + xlogreader->private_data; + if (private_data->end_of_wal) + { + /* + * This timeline must be historic and must end before we were + * able to read a complete record. + */ + ereport(DEBUG1, + errmsg_internal("could not read WAL from timeline %d at %X/%X: end of WAL at %X/%X", + tli, + LSN_FORMAT_ARGS(xlogreader->EndRecPtr), + LSN_FORMAT_ARGS(private_data->read_upto))); + /* Summary ends at end of WAL. */ + summary_end_lsn = private_data->read_upto; + break; + } + if (errormsg) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read WAL at %X/%X: %s", + LSN_FORMAT_ARGS(xlogreader->EndRecPtr), errormsg))); + else + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read WAL at %X/%X", + LSN_FORMAT_ARGS(xlogreader->EndRecPtr)))); + } + + /* We shouldn't go backward. */ + Assert(summary_start_lsn <= xlogreader->EndRecPtr); + + if (xlogreader->ReadRecPtr >= cutoff_lsn) + { + /* + * Woops! 
We've read a record that *starts* after the cutoff LSN, + * contrary to our goal of reading only until we hit the first + * record that ends at or after the cutoff LSN. Pretend we didn't + * read it after all by bailing out of this loop right here, + * before we do anything with this record. + * + * This can happen because the last record before the cutoff LSN + * might be continued across multiple pages, and then we might + * come to a page with XLP_FIRST_IS_OVERWRITE_CONTRECORD set. In + * that case, the record that was continued across multiple pages + * is incomplete and will be disregarded, and the read will + * restart from the beginning of the page that is flagged + * XLP_FIRST_IS_OVERWRITE_CONTRECORD. + * + * If this case occurs, we can fairly say that the current summary + * file ends at the cutoff LSN exactly. The first record on the + * page marked XLP_FIRST_IS_OVERWRITE_CONTRECORD will be + * discovered when generating the next summary file. + */ + summary_end_lsn = cutoff_lsn; + break; + } + + /* + * We attempt, on a best effort basis only, to make WAL summary file + * boundaries line up with checkpoint cycles. So, if the last redo + * pointer we've seen was in the future, and this record starts at + * that redo pointer, stop before processing and let it be included in + * the next summary file. + * + * Note that in the case of a checkpoint triggered by a backup, the + * redo pointer is likely to be pointing to the first record on a + * page. Before reading the record, xlogreader->EndRecPtr will have + * pointed to the start of the page, which precedes the redo LSN. But + * after reading the next record, we'll advance over the page header + * and realize that the next record starts at the redo LSN exactly, + * making this the first point at which we can realize that it's time + * to stop. 
+ */ + if (!private_data->redo_pointer_reached && + xlogreader->ReadRecPtr >= private_data->redo_pointer) + { + summary_end_lsn = xlogreader->ReadRecPtr; + break; + } + + /* Special handling for particular types of WAL records. */ + switch (XLogRecGetRmid(xlogreader)) + { + case RM_SMGR_ID: + SummarizeSmgrRecord(xlogreader, brtab); + break; + case RM_XACT_ID: + SummarizeXactRecord(xlogreader, brtab); + break; + default: + break; + } + + /* Feed block references from xlog record to block reference table. */ + for (block_id = 0; block_id <= XLogRecMaxBlockId(xlogreader); + block_id++) + { + RelFileLocator rlocator; + ForkNumber forknum; + BlockNumber blocknum; + + if (!XLogRecGetBlockTagExtended(xlogreader, block_id, &rlocator, + &forknum, &blocknum, NULL)) + continue; + + BlockRefTableMarkBlockModified(brtab, &rlocator, forknum, + blocknum); + } + + /* Update our notion of where this summary file ends. */ + summary_end_lsn = xlogreader->EndRecPtr; + + /* + * Also update shared memory, and handle any request for a + * WAL summary file switch. + */ + LWLockAcquire(WALSummarizerLock, LW_EXCLUSIVE); + Assert(summary_end_lsn >= WalSummarizerCtl->pending_lsn); + Assert(summary_end_lsn >= WalSummarizerCtl->summarized_lsn); + WalSummarizerCtl->pending_lsn = summary_end_lsn; + switch_requested = WalSummarizerCtl->switch_requested; + LWLockRelease(WALSummarizerLock); + if (switch_requested) + break; + + /* + * Periodically update our notion of the redo pointer, because it + * might be changing concurrently. There's no interlocking here: we + * might race past the new redo pointer before we learn about it. + * That's OK; we only use the redo pointer as a heuristic for where to + * stop summarizing. + * + * It would be nice if we could just fetch the updated redo pointer on + * every pass through this loop, but that seems a bit too expensive: + * GetRedoRecPtr acquires a heavily-contended spinlock. 
So, instead, + * just fetch the updated value if we've just had to sleep, or if + * we've read more than a segment's worth of WAL without sleeping. + */ + if (private_data->waited || xlogreader->EndRecPtr > + private_data->redo_pointer_refresh_lsn + wal_segment_size) + { + private_data->redo_pointer = GetRedoRecPtr(); + private_data->redo_pointer_refresh_lsn = xlogreader->EndRecPtr; + private_data->redo_pointer_reached = + (xlogreader->EndRecPtr >= private_data->redo_pointer); + } + + /* + * Recheck whether we've just caught up with the redo pointer, and + * if so, stop. This has the same purpose as the earlier check for + * the same condition above, but there we've just read a record and + * might decide against including it in the current summary file, + * whereas here we've already included it and might decide against + * reading the next one. Note that we may have just refreshed our + * notion of the redo pointer, so it's smart to check here before we + * do any more work. + */ + if (!private_data->redo_pointer_reached && + xlogreader->EndRecPtr >= private_data->redo_pointer) + break; + } + + /* Destroy xlogreader. */ + pfree(xlogreader->private_data); + XLogReaderFree(xlogreader); + + /* + * If a timeline switch occurs, we may fail to make any progress at all + * before exiting the loop above. If that happens, we don't write a WAL + * summary file at all. + */ + if (summary_end_lsn > summary_start_lsn) + { + /* Generate temporary and final path name. */ + snprintf(temp_path, MAXPGPATH, + XLOGDIR "/summaries/temp.summary"); + snprintf(final_path, MAXPGPATH, + XLOGDIR "/summaries/%08X%08X%08X%08X%08X.summary", + tli, + LSN_FORMAT_ARGS(summary_start_lsn), + LSN_FORMAT_ARGS(summary_end_lsn)); + + /* Open the temporary file for writing. 
 */ + io.filepos = 0; + io.file = PathNameOpenFile(temp_path, O_WRONLY | O_CREAT | O_TRUNC); + if (io.file < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create file \"%s\": %m", temp_path))); + + /* Write the data. */ + WriteBlockRefTable(brtab, WriteWalSummary, &io); + + /* Close temporary file and shut down xlogreader. */ + FileClose(io.file); + + /* Tell the user what we did. */ + ereport(LOG, + errmsg("summarized WAL on TLI %d from %X/%X to %X/%X", + tli, + LSN_FORMAT_ARGS(summary_start_lsn), + LSN_FORMAT_ARGS(summary_end_lsn))); + + /* Durably rename the new summary into place. */ + durable_rename(temp_path, final_path, ERROR); + } + + return summary_end_lsn; +} + +/* + * Special handling for WAL records with RM_SMGR_ID. + */ +static void +SummarizeSmgrRecord(XLogReaderState *xlogreader, BlockRefTable *brtab) +{ + uint8 info = XLogRecGetInfo(xlogreader) & ~XLR_INFO_MASK; + + if (info == XLOG_SMGR_CREATE) + { + xl_smgr_create *xlrec; + + /* + * If a new relation fork is created on disk, there is no point + * tracking anything about which blocks have been modified, because + * the whole thing will be new. Hence, set the limit block for this + * fork to 0. + * + * Ignore the FSM fork, which is not fully WAL-logged. + */ + xlrec = (xl_smgr_create *) XLogRecGetData(xlogreader); + + if (xlrec->forkNum != FSM_FORKNUM) + BlockRefTableSetLimitBlock(brtab, &xlrec->rlocator, + xlrec->forkNum, 0); + } + else if (info == XLOG_SMGR_TRUNCATE) + { + xl_smgr_truncate *xlrec; + + xlrec = (xl_smgr_truncate *) XLogRecGetData(xlogreader); + + /* + * If a relation fork is truncated on disk, there is no point in + * tracking anything about block modifications beyond the truncation + * point. + * + * We ignore SMGR_TRUNCATE_FSM here because the FSM isn't fully + * WAL-logged and thus we can't track modified blocks for it anyway. 
+ */ + if ((xlrec->flags & SMGR_TRUNCATE_HEAP) != 0) + BlockRefTableSetLimitBlock(brtab, &xlrec->rlocator, + MAIN_FORKNUM, xlrec->blkno); + if ((xlrec->flags & SMGR_TRUNCATE_VM) != 0) + BlockRefTableSetLimitBlock(brtab, &xlrec->rlocator, + VISIBILITYMAP_FORKNUM, xlrec->blkno); + } +} + +/* + * Special handling for WAL records with RM_XACT_ID. + */ +static void +SummarizeXactRecord(XLogReaderState *xlogreader, BlockRefTable *brtab) +{ + uint8 info = XLogRecGetInfo(xlogreader) & ~XLR_INFO_MASK; + uint8 xact_info = info & XLOG_XACT_OPMASK; + + if (xact_info == XLOG_XACT_COMMIT || + xact_info == XLOG_XACT_COMMIT_PREPARED) + { + xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(xlogreader); + xl_xact_parsed_commit parsed; + int i; + + ParseCommitRecord(XLogRecGetInfo(xlogreader), xlrec, &parsed); + for (i = 0; i < parsed.nrels; ++i) + { + ForkNumber forknum; + + for (forknum = 0; forknum <= MAX_FORKNUM; ++forknum) + if (forknum != FSM_FORKNUM) + BlockRefTableSetLimitBlock(brtab, &parsed.xlocators[i], + forknum, 0); + } + } + else if (xact_info == XLOG_XACT_ABORT || + xact_info == XLOG_XACT_ABORT_PREPARED) + { + xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(xlogreader); + xl_xact_parsed_abort parsed; + int i; + + ParseAbortRecord(XLogRecGetInfo(xlogreader), xlrec, &parsed); + for (i = 0; i < parsed.nrels; ++i) + { + ForkNumber forknum; + + for (forknum = 0; forknum <= MAX_FORKNUM; ++forknum) + if (forknum != FSM_FORKNUM) + BlockRefTableSetLimitBlock(brtab, &parsed.xlocators[i], + forknum, 0); + } + } +} + +/* + * Similar to read_local_xlog_page, but limited to read from one particular + * timeline. If the end of WAL is reached, it will wait for more if reading + * from the current timeline, or give up if reading from a historic timeline. + * In the latter case, it will also set private_data->end_of_wal = true. 
+ * + * Caller must set private_data->tli to the TLI of interest, + * private_data->read_upto to the lowest LSN that is not known to be safe + * to read on that timeline, and private_data->historic to true if and only + * if the timeline is not the current timeline. This function will update + * private_data->read_upto and private_data->historic if more WAL appears + * on the current timeline or if the current timeline becomes historic. + */ +static int +summarizer_read_local_xlog_page(XLogReaderState *state, + XLogRecPtr targetPagePtr, int reqLen, + XLogRecPtr targetRecPtr, char *cur_page) +{ + int count; + WALReadError errinfo; + SummarizerReadLocalXLogPrivate *private_data; + + private_data = (SummarizerReadLocalXLogPrivate *) + state->private_data; + + while (true) + { + if (targetPagePtr + XLOG_BLCKSZ <= private_data->read_upto) + { + /* + * more than one block available; read only that block, have + * caller come back if they need more. + */ + count = XLOG_BLCKSZ; + break; + } + else if (targetPagePtr + reqLen > private_data->read_upto) + { + /* We don't seem to have enough data. */ + if (private_data->historic) + { + /* + * This is a historic timeline, so there will never be any + * more data than we have currently. + */ + private_data->end_of_wal = true; + return -1; + } + else + { + XLogRecPtr latest_lsn; + TimeLineID latest_tli; + + /* + * This is - or at least was up until very recently - the + * current timeline, so more data might show up. Delay here + * so we don't tight-loop. + */ + HandleWalSummarizerInterrupts(); + summarizer_wait_for_wal(); + private_data->waited = true; + + /* Recheck end-of-WAL. */ + latest_lsn = GetLatestLSN(&latest_tli); + if (private_data->tli == latest_tli) + { + /* Still the current timeline, update max LSN. 
*/ + Assert(latest_lsn >= private_data->read_upto); + private_data->read_upto = latest_lsn; + } + else + { + List *tles = readTimeLineHistory(latest_tli); + XLogRecPtr switchpoint; + + /* + * The timeline we're scanning is no longer the latest + * one. Figure out when it ended and allow reads up to + * exactly that point. + */ + private_data->historic = true; + switchpoint = tliSwitchPoint(private_data->tli, tles, + NULL); + Assert(switchpoint >= private_data->read_upto); + private_data->read_upto = switchpoint; + } + + /* Go around and try again. */ + } + } + else + { + /* enough bytes available to satisfy the request */ + count = private_data->read_upto - targetPagePtr; + break; + } + } + + /* + * Even though we just determined how much of the page can be validly read + * as 'count', read the whole page anyway. It's guaranteed to be + * zero-padded up to the page boundary if it's incomplete. + */ + if (!WALRead(state, cur_page, targetPagePtr, XLOG_BLCKSZ, + private_data->tli, &errinfo)) + WALReadRaiseError(&errinfo); + + /* Track that we read a page, for sleep time calculation. */ + ++pages_read_since_last_sleep; + + /* number of valid bytes in the buffer */ + return count; +} + +/* + * Sleep for long enough that we believe it's likely that more WAL will + * be available afterwards. + */ +static void +summarizer_wait_for_wal(void) +{ + if (pages_read_since_last_sleep == 0) + { + /* + * No pages were read since the last sleep, so double the sleep time, + * but not beyond the maximum allowable value. + */ + sleep_quanta = Min(sleep_quanta * 2, MAX_SLEEP_QUANTA); + } + else if (pages_read_since_last_sleep > 1) + { + /* + * Multiple pages were read since the last sleep, so reduce the sleep + * time. + * + * A large burst of activity should be able to quickly reduce the + * sleep time to the minimum, but we don't want a handful of extra WAL + * records to provoke a strong reaction. 
We choose to reduce the sleep + * time by 1 quantum for each page read beyond the first, which is a + * fairly arbitrary way of trying to be reactive without + * overreacting. + */ + if (pages_read_since_last_sleep > sleep_quanta - 1) + sleep_quanta = 1; + else + sleep_quanta -= pages_read_since_last_sleep; + } + + /* OK, now sleep. */ + (void) WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + sleep_quanta * MS_PER_SLEEP_QUANTUM, + WAIT_EVENT_WAL_SUMMARIZER_WAL); + ResetLatch(MyLatch); + + /* Reset count of pages read. */ + pages_read_since_last_sleep = 0; +} + +/* + * Remove old WAL summary files, if enabled and due; we attempt this at most + * once per checkpoint cycle, by comparing against the most recent RedoRecPtr + * value observed on a previous call. + */ +static void +MaybeRemoveOldWalSummaries(void) +{ + XLogRecPtr redo_pointer = GetRedoRecPtr(); + List *wslist; + time_t cutoff_time; + + /* If WAL summary removal is disabled, don't do anything. */ + if (wal_summarize_keep_time == 0) + return; + + /* + * If the redo pointer has not advanced, don't do anything. + * + * This has the effect that we only try to remove old WAL summary files + * once per checkpoint cycle. + */ + if (redo_pointer == redo_pointer_at_last_summary_removal) + return; + redo_pointer_at_last_summary_removal = redo_pointer; + + /* + * Files should only be removed if the last modification time precedes the + * cutoff time we compute here. + */ + cutoff_time = time(NULL) - 60 * wal_summarize_keep_time; + + /* Get all the summaries that currently exist. */ + wslist = GetWalSummaries(0, InvalidXLogRecPtr, InvalidXLogRecPtr); + + /* Loop until all summaries have been considered for removal. */ + while (wslist != NIL) + { + ListCell *lc; + XLogSegNo oldest_segno; + XLogRecPtr oldest_lsn = InvalidXLogRecPtr; + TimeLineID selected_tli; + + CHECK_FOR_INTERRUPTS(); + + /* + * Pick a timeline for which some summary files still exist on disk, + * and find the oldest LSN that still exists on disk for that + * timeline. 
+ */ + selected_tli = ((WalSummaryFile *) linitial(wslist))->tli; + oldest_segno = XLogGetOldestSegno(selected_tli); + if (oldest_segno != 0) + XLogSegNoOffsetToRecPtr(oldest_segno, 0, wal_segment_size, + oldest_lsn); + + + /* Consider each WAL file on the selected timeline in turn. */ + foreach(lc, wslist) + { + WalSummaryFile *ws = lfirst(lc); + + CHECK_FOR_INTERRUPTS(); + + /* If it's not on this timeline, it's not time to consider it. */ + if (selected_tli != ws->tli) + continue; + + /* + * If the WAL doesn't exist any more, we can remove it if the file + * modification time is old enough. + */ + if (XLogRecPtrIsInvalid(oldest_lsn) || ws->end_lsn <= oldest_lsn) + RemoveWalSummaryIfOlderThan(ws, cutoff_time); + + /* + * Whether we removed the file or not, we need not consider it + * again. + */ + wslist = foreach_delete_current(wslist, lc); + pfree(ws); + } + } +} diff --git a/src/backend/replication/repl_gram.y b/src/backend/replication/repl_gram.y index 0c874e33cf..a5d118ed68 100644 --- a/src/backend/replication/repl_gram.y +++ b/src/backend/replication/repl_gram.y @@ -76,11 +76,12 @@ Node *replication_parse_result; %token K_EXPORT_SNAPSHOT %token K_NOEXPORT_SNAPSHOT %token K_USE_SNAPSHOT +%token K_UPLOAD_MANIFEST %type command %type base_backup start_replication start_logical_replication create_replication_slot drop_replication_slot identify_system - read_replication_slot timeline_history show + read_replication_slot timeline_history show upload_manifest %type generic_option_list %type generic_option %type opt_timeline @@ -114,6 +115,7 @@ command: | read_replication_slot | timeline_history | show + | upload_manifest ; /* @@ -307,6 +309,15 @@ timeline_history: } ; +/* UPLOAD_MANIFEST doesn't currently accept any arguments */ +upload_manifest: + K_UPLOAD_MANIFEST + { + UploadManifestCmd *cmd = makeNode(UploadManifestCmd); + + $$ = (Node *) cmd; + } + opt_physical: K_PHYSICAL | /* EMPTY */ @@ -411,6 +422,7 @@ ident_or_keyword: | K_EXPORT_SNAPSHOT { $$ = 
"export_snapshot"; } + | K_NOEXPORT_SNAPSHOT { $$ = "noexport_snapshot"; } + | K_USE_SNAPSHOT { $$ = "use_snapshot"; } + | K_UPLOAD_MANIFEST { $$ = "upload_manifest"; } ; %% diff --git a/src/backend/replication/repl_scanner.l b/src/backend/replication/repl_scanner.l index cb467ca46f..fa2bf4ee0a 100644 --- a/src/backend/replication/repl_scanner.l +++ b/src/backend/replication/repl_scanner.l @@ -136,6 +136,7 @@ EXPORT_SNAPSHOT { return K_EXPORT_SNAPSHOT; } NOEXPORT_SNAPSHOT { return K_NOEXPORT_SNAPSHOT; } USE_SNAPSHOT { return K_USE_SNAPSHOT; } WAIT { return K_WAIT; } +UPLOAD_MANIFEST { return K_UPLOAD_MANIFEST; } {space}+ { /* do nothing */ } @@ -303,6 +304,7 @@ replication_scanner_is_replication_command(void) case K_DROP_REPLICATION_SLOT: case K_READ_REPLICATION_SLOT: case K_TIMELINE_HISTORY: + case K_UPLOAD_MANIFEST: case K_SHOW: /* Yes; push back the first token so we can parse later. */ repl_pushed_back_token = first_token; diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index d3a136b6f5..39eb293e5f 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -58,6 +58,7 @@ #include "access/xlogrecovery.h" #include "access/xlogutils.h" #include "backup/basebackup.h" +#include "backup/basebackup_incremental.h" #include "catalog/pg_authid.h" #include "catalog/pg_type.h" #include "commands/dbcommands.h" @@ -137,6 +138,17 @@ bool wake_wal_senders = false; */ static XLogReaderState *xlogreader = NULL; +/* + * If the UPLOAD_MANIFEST command is used to provide a backup manifest in + * preparation for an incremental backup, uploaded_manifest will point + * to an object containing information about its contents, and + * uploaded_manifest_mcxt will point to the memory context that contains + * that object and all of its subordinate data. Otherwise, both values will + * be NULL. 
+ */ +static IncrementalBackupInfo *uploaded_manifest = NULL; +static MemoryContext uploaded_manifest_mcxt = NULL; + /* * These variables keep track of the state of the timeline we're currently * sending. sendTimeLine identifies the timeline. If sendTimeLineIsHistoric, @@ -233,6 +245,9 @@ static void XLogSendLogical(void); static void WalSndDone(WalSndSendDataCallback send_data); static XLogRecPtr GetStandbyFlushRecPtr(TimeLineID *tli); static void IdentifySystem(void); +static void UploadManifest(void); +static bool HandleUploadManifestPacket(StringInfo buf, off_t *offset, + IncrementalBackupInfo *ib); static void ReadReplicationSlot(ReadReplicationSlotCmd *cmd); static void CreateReplicationSlot(CreateReplicationSlotCmd *cmd); static void DropReplicationSlot(DropReplicationSlotCmd *cmd); @@ -660,6 +675,143 @@ SendTimeLineHistory(TimeLineHistoryCmd *cmd) pq_endmessage(&buf); } +/* + * Handle UPLOAD_MANIFEST command. + */ +static void +UploadManifest(void) +{ + MemoryContext mcxt; + IncrementalBackupInfo *ib; + off_t offset = 0; + StringInfoData buf; + + /* + * parsing the manifest will use the cryptohash stuff, which requires a + * resource owner + */ + Assert(CurrentResourceOwner == NULL); + CurrentResourceOwner = ResourceOwnerCreate(NULL, "base backup"); + + /* Prepare to read manifest data into a temporary context. */ + mcxt = AllocSetContextCreate(CurrentMemoryContext, + "incremental backup information", + ALLOCSET_DEFAULT_SIZES); + ib = CreateIncrementalBackupInfo(mcxt); + + /* Send a CopyInResponse message */ + pq_beginmessage(&buf, 'G'); + pq_sendbyte(&buf, 0); + pq_sendint16(&buf, 0); + pq_endmessage_reuse(&buf); + pq_flush(); + + /* Receive packets from client until done. */ + while (HandleUploadManifestPacket(&buf, &offset, ib)) + ; + + /* Finish up manifest processing. */ + FinalizeIncrementalManifest(ib); + + /* + * Discard any old manifest information and arrange to preserve the new + * information we just got. 
+ * + * We assume that MemoryContextDelete and MemoryContextSetParent won't + * fail, and thus we shouldn't end up bailing out of here in such a way as + * to leave dangling pointers. + */ + if (uploaded_manifest_mcxt != NULL) + MemoryContextDelete(uploaded_manifest_mcxt); + MemoryContextSetParent(mcxt, CacheMemoryContext); + uploaded_manifest = ib; + uploaded_manifest_mcxt = mcxt; + + /* clean up the resource owner we created */ + WalSndResourceCleanup(true); +} + +/* + * Process one packet received during the handling of an UPLOAD_MANIFEST + * operation. + * + * 'buf' is scratch space. This function expects it to be initialized, doesn't + * care what the current contents are, and may override them with completely + * new contents. + * + * The return value is true if the caller should continue processing + * additional packets and false if the UPLOAD_MANIFEST operation is complete. + */ +static bool +HandleUploadManifestPacket(StringInfo buf, off_t *offset, + IncrementalBackupInfo *ib) +{ + int mtype; + int maxmsglen; + + HOLD_CANCEL_INTERRUPTS(); + + pq_startmsgread(); + mtype = pq_getbyte(); + if (mtype == EOF) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("unexpected EOF on client connection with an open transaction"))); + + switch (mtype) + { + case 'd': /* CopyData */ + maxmsglen = PQ_LARGE_MESSAGE_LIMIT; + break; + case 'c': /* CopyDone */ + case 'f': /* CopyFail */ + case 'H': /* Flush */ + case 'S': /* Sync */ + maxmsglen = PQ_SMALL_MESSAGE_LIMIT; + break; + default: + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("unexpected message type 0x%02X during COPY from stdin", + mtype))); + maxmsglen = 0; /* keep compiler quiet */ + break; + } + + /* Now collect the message body */ + if (pq_getmessage(buf, maxmsglen)) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("unexpected EOF on client connection with an open transaction"))); + RESUME_CANCEL_INTERRUPTS(); + + /* Process the message */ + switch (mtype) + 
{ + case 'd': /* CopyData */ + AppendIncrementalManifestData(ib, buf->data, buf->len); + return true; + + case 'c': /* CopyDone */ + return false; + + case 'H': /* Flush */ + case 'S': /* Sync */ + /* Ignore these while in CopyOut mode as we do elsewhere. */ + return true; + + case 'f': + ereport(ERROR, + (errcode(ERRCODE_QUERY_CANCELED), + errmsg("COPY from stdin failed: %s", + pq_getmsgstring(buf)))); + } + + /* Not reached. */ + Assert(false); + return false; +} + /* * Handle START_REPLICATION command. * @@ -1802,7 +1954,7 @@ exec_replication_command(const char *cmd_string) cmdtag = "BASE_BACKUP"; set_ps_display(cmdtag); PreventInTransactionBlock(true, cmdtag); - SendBaseBackup((BaseBackupCmd *) cmd_node); + SendBaseBackup((BaseBackupCmd *) cmd_node, uploaded_manifest); EndReplicationCommand(cmdtag); break; @@ -1864,6 +2016,14 @@ exec_replication_command(const char *cmd_string) } break; + case T_UploadManifestCmd: + cmdtag = "UPLOAD_MANIFEST"; + set_ps_display(cmdtag); + PreventInTransactionBlock(true, cmdtag); + UploadManifest(); + EndReplicationCommand(cmdtag); + break; + default: elog(ERROR, "unrecognized replication command node tag: %u", cmd_node->type); diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index 8f1ded7338..17608b3b8e 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -31,6 +31,7 @@ #include "postmaster/bgworker_internals.h" #include "postmaster/bgwriter.h" #include "postmaster/postmaster.h" +#include "postmaster/walsummarizer.h" #include "replication/logicallauncher.h" #include "replication/origin.h" #include "replication/slot.h" @@ -135,6 +136,7 @@ CalculateShmemSize(int *num_semaphores) size = add_size(size, ReplicationOriginShmemSize()); size = add_size(size, WalSndShmemSize()); size = add_size(size, WalRcvShmemSize()); + size = add_size(size, WalSummarizerShmemSize()); size = add_size(size, PgArchShmemSize()); size = add_size(size, ApplyLauncherShmemSize()); size = add_size(size, 
SnapMgrShmemSize()); @@ -283,6 +285,7 @@ CreateSharedMemoryAndSemaphores(void) ReplicationOriginShmemInit(); WalSndShmemInit(); WalRcvShmemInit(); + WalSummarizerShmemInit(); PgArchShmemInit(); ApplyLauncherShmemInit(); diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt index 6c7cf6c295..49f76e82fb 100644 --- a/src/backend/storage/lmgr/lwlocknames.txt +++ b/src/backend/storage/lmgr/lwlocknames.txt @@ -53,3 +53,4 @@ XactTruncationLock 44 # 45 was XactTruncationLock until removal of BackendRandomLock WrapLimitsVacuumLock 46 NotifyQueueTailLock 47 +WALSummarizerLock 48 diff --git a/src/backend/utils/activity/pgstat_io.c b/src/backend/utils/activity/pgstat_io.c index eb7d35d422..bd0a921a3e 100644 --- a/src/backend/utils/activity/pgstat_io.c +++ b/src/backend/utils/activity/pgstat_io.c @@ -292,7 +292,8 @@ pgstat_io_snapshot_cb(void) * - Syslogger because it is not connected to shared memory * - Archiver because most relevant archiving IO is delegated to a * specialized command or module -* - WAL Receiver and WAL Writer IO is not tracked in pg_stat_io for now +* - WAL Receiver, WAL Writer, and WAL Summarizer IO are not tracked in +* pg_stat_io for now * * Function returns true if BackendType participates in the cumulative stats * subsystem for IO and false if it does not. 
@@ -314,6 +315,7 @@ pgstat_tracks_io_bktype(BackendType bktype) case B_LOGGER: case B_WAL_RECEIVER: case B_WAL_WRITER: + case B_WAL_SUMMARIZER: return false; case B_AUTOVAC_LAUNCHER: diff --git a/src/backend/utils/activity/wait_event.c b/src/backend/utils/activity/wait_event.c index 7940d64639..36b88f55b1 100644 --- a/src/backend/utils/activity/wait_event.c +++ b/src/backend/utils/activity/wait_event.c @@ -245,6 +245,9 @@ pgstat_get_wait_activity(WaitEventActivity w) case WAIT_EVENT_WAL_SENDER_MAIN: event_name = "WalSenderMain"; break; + case WAIT_EVENT_WAL_SUMMARIZER_WAL: + event_name = "WalSummarizerWal"; + break; case WAIT_EVENT_WAL_WRITER_MAIN: event_name = "WalWriterMain"; break; @@ -466,6 +469,9 @@ pgstat_get_wait_ipc(WaitEventIPC w) case WAIT_EVENT_WAL_RECEIVER_WAIT_START: event_name = "WalReceiverWaitStart"; break; + case WAIT_EVENT_WAL_SUMMARY_READY: + event_name = "WalSummaryReady"; + break; case WAIT_EVENT_XACT_GROUP_UPDATE: event_name = "XactGroupUpdate"; break; @@ -515,6 +521,9 @@ pgstat_get_wait_timeout(WaitEventTimeout w) case WAIT_EVENT_VACUUM_TRUNCATE: event_name = "VacuumTruncate"; break; + case WAIT_EVENT_WAL_SUMMARIZER_ERROR: + event_name = "WalSummarizerError"; + break; /* no default case, so that compiler will warn */ } @@ -747,6 +756,12 @@ pgstat_get_wait_io(WaitEventIO w) case WAIT_EVENT_WAL_READ: event_name = "WALRead"; break; + case WAIT_EVENT_WAL_SUMMARY_READ: + event_name = "WalSummaryRead"; + break; + case WAIT_EVENT_WAL_SUMMARY_WRITE: + event_name = "WalSummaryWrite"; + break; case WAIT_EVENT_WAL_SYNC: event_name = "WALSync"; break; diff --git a/src/backend/utils/init/miscinit.c b/src/backend/utils/init/miscinit.c index a604432126..eb5736ad85 100644 --- a/src/backend/utils/init/miscinit.c +++ b/src/backend/utils/init/miscinit.c @@ -306,6 +306,9 @@ GetBackendTypeDesc(BackendType backendType) case B_WAL_SENDER: backendDesc = "walsender"; break; + case B_WAL_SUMMARIZER: + backendDesc = "walsummarizer"; + break; case B_WAL_WRITER: 
backendDesc = "walwriter"; break; diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index 71e27f8eb0..c4918db4f9 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -61,6 +61,7 @@ #include "postmaster/postmaster.h" #include "postmaster/startup.h" #include "postmaster/syslogger.h" +#include "postmaster/walsummarizer.h" #include "postmaster/walwriter.h" #include "replication/logicallauncher.h" #include "replication/slot.h" @@ -694,6 +695,8 @@ const char *const config_group_names[] = gettext_noop("Write-Ahead Log / Archive Recovery"), /* WAL_RECOVERY_TARGET */ gettext_noop("Write-Ahead Log / Recovery Target"), + /* WAL_SUMMARIZATION */ + gettext_noop("Write-Ahead Log / Summarization"), /* REPLICATION_SENDING */ gettext_noop("Replication / Sending Servers"), /* REPLICATION_PRIMARY */ @@ -3167,6 +3170,32 @@ struct config_int ConfigureNamesInt[] = NULL, NULL, NULL }, + { + {"wal_summarize_mb", PGC_SIGHUP, WAL_SUMMARIZATION, + gettext_noop("Number of bytes of WAL per summary file."), + gettext_noop("Smaller values minimize extra work performed by incremental backup, but increase the number of files on disk."), + GUC_UNIT_MB, + }, + &wal_summarize_mb, + 256, + 0, + INT_MAX, + NULL, NULL, NULL + }, + + { + {"wal_summarize_keep_time", PGC_SIGHUP, WAL_SUMMARIZATION, + gettext_noop("Time for which WAL summary files should be kept."), + NULL, + GUC_UNIT_MIN, + }, + &wal_summarize_keep_time, + 7 * 24 * 60, /* 1 week */ + 0, + INT_MAX, + NULL, NULL, NULL + }, + { {"autovacuum_naptime", PGC_SIGHUP, AUTOVACUUM, gettext_noop("Time to sleep between autovacuum runs."), diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index e4c0269fa3..d028d02861 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -302,6 +302,11 @@ #recovery_target_action = 'pause' # 'pause', 'promote', 'shutdown' # (change 
requires restart) +# - WAL Summarization - + +#wal_summarize_mb = 256 # MB of WAL per summary file, 0 disables +#wal_summarize_keep_time = '7d' # when to remove old summary files, 0 = never + #------------------------------------------------------------------------------ # REPLICATION diff --git a/src/bin/Makefile b/src/bin/Makefile index 373077bf52..aa2210925e 100644 --- a/src/bin/Makefile +++ b/src/bin/Makefile @@ -19,6 +19,7 @@ SUBDIRS = \ pg_archivecleanup \ pg_basebackup \ pg_checksums \ + pg_combinebackup \ pg_config \ pg_controldata \ pg_ctl \ diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index 09a5c98cc0..220f51a32d 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -230,6 +230,7 @@ static char *extra_options = ""; static const char *const subdirs[] = { "global", "pg_wal/archive_status", + "pg_wal/summaries", "pg_commit_ts", "pg_dynshmem", "pg_notify", diff --git a/src/bin/meson.build b/src/bin/meson.build index 67cb50630c..4cb6fd59bb 100644 --- a/src/bin/meson.build +++ b/src/bin/meson.build @@ -5,6 +5,7 @@ subdir('pg_amcheck') subdir('pg_archivecleanup') subdir('pg_basebackup') subdir('pg_checksums') +subdir('pg_combinebackup') subdir('pg_config') subdir('pg_controldata') subdir('pg_ctl') diff --git a/src/bin/pg_basebackup/bbstreamer_file.c b/src/bin/pg_basebackup/bbstreamer_file.c index 45f32974ff..6b78ee283d 100644 --- a/src/bin/pg_basebackup/bbstreamer_file.c +++ b/src/bin/pg_basebackup/bbstreamer_file.c @@ -296,6 +296,7 @@ should_allow_existing_directory(const char *pathname) if (strcmp(filename, "pg_wal") == 0 || strcmp(filename, "pg_xlog") == 0 || strcmp(filename, "archive_status") == 0 || + strcmp(filename, "summaries") == 0 || strcmp(filename, "pg_tblspc") == 0) return true; diff --git a/src/bin/pg_basebackup/pg_basebackup.c b/src/bin/pg_basebackup/pg_basebackup.c index 1dc8efe0cb..3ffe15ac74 100644 --- a/src/bin/pg_basebackup/pg_basebackup.c +++ b/src/bin/pg_basebackup/pg_basebackup.c @@ -101,6 +101,11 @@ 
typedef void (*WriteDataCallback) (size_t nbytes, char *buf, */ #define MINIMUM_VERSION_FOR_TERMINATED_TARFILE 150000 +/* + * pg_wal/summaries exists beginning with v16. + */ +#define MINIMUM_VERSION_FOR_WAL_SUMMARIES 160000 + /* * Different ways to include WAL */ @@ -216,7 +221,8 @@ static void ReceiveBackupManifestInMemoryChunk(size_t r, char *copybuf, void *callback_data); static void BaseBackup(char *compression_algorithm, char *compression_detail, CompressionLocation compressloc, - pg_compress_specification *client_compress); + pg_compress_specification *client_compress, + char *incremental_manifest); static bool reached_end_position(XLogRecPtr segendpos, uint32 timeline, bool segment_finished); @@ -684,6 +690,23 @@ StartLogStreamer(char *startpos, uint32 timeline, char *sysidentifier, if (pg_mkdir_p(statusdir, pg_dir_create_mode) != 0 && errno != EEXIST) pg_fatal("could not create directory \"%s\": %m", statusdir); + + /* + * For newer server versions, likewise create pg_wal/summaries + */ + if (PQserverVersion(conn) >= MINIMUM_VERSION_FOR_WAL_SUMMARIES) + { + char summarydir[MAXPGPATH]; + + snprintf(summarydir, sizeof(summarydir), "%s/%s/summaries", + basedir, + PQserverVersion(conn) < MINIMUM_VERSION_FOR_PG_WAL ? 
+ "pg_xlog" : "pg_wal"); + + if (pg_mkdir_p(statusdir, pg_dir_create_mode) != 0 && + errno != EEXIST) + pg_fatal("could not create directory \"%s\": %m", summarydir); + } } /* @@ -1724,7 +1747,9 @@ ReceiveBackupManifestInMemoryChunk(size_t r, char *copybuf, static void BaseBackup(char *compression_algorithm, char *compression_detail, - CompressionLocation compressloc, pg_compress_specification *client_compress) + CompressionLocation compressloc, + pg_compress_specification *client_compress, + char *incremental_manifest) { PGresult *res; char *sysidentifier; @@ -1790,7 +1815,74 @@ BaseBackup(char *compression_algorithm, char *compression_detail, exit(1); /* - * Start the actual backup + * If the user wants an incremental backup, we must upload the manifest + * for the previous backup upon which it is to be based. + */ + if (incremental_manifest != NULL) + { + int fd; + char mbuf[65536]; + int nbytes; + + /* XXX add a server version check here */ + + /* Open the file. */ + fd = open(incremental_manifest, O_RDONLY | PG_BINARY, 0); + if (fd < 0) + pg_fatal("could not open file \"%s\": %m", incremental_manifest); + + /* Tell the server what we want to do. */ + if (PQsendQuery(conn, "UPLOAD_MANIFEST") == 0) + pg_fatal("could not send replication command \"%s\": %s", + "UPLOAD_MANIFEST", PQerrorMessage(conn)); + res = PQgetResult(conn); + if (PQresultStatus(res) != PGRES_COPY_IN) + { + if (PQresultStatus(res) == PGRES_FATAL_ERROR) + pg_fatal("could not upload manifest: %s", + PQerrorMessage(conn)); + else + pg_fatal("could not upload manifest: unexpected status %s", + PQresStatus(PQresultStatus(res))); + } + + /* Loop, reading from the file and sending the data to the server. */ + while ((nbytes = read(fd, mbuf, sizeof mbuf)) > 0) + { + if (PQputCopyData(conn, mbuf, nbytes) < 0) + pg_fatal("could not send COPY data: %s", + PQerrorMessage(conn)); + } + + /* Bail out if we exited the loop due to an error. 
*/ + if (nbytes < 0) + pg_fatal("could not read file \"%s\": %m", incremental_manifest); + + /* End the COPY operation. */ + if (PQputCopyEnd(conn, NULL) < 0) + pg_fatal("could not send end-of-COPY: %s", + PQerrorMessage(conn)); + + /* See whether the server is happy with what we sent. */ + res = PQgetResult(conn); + if (PQresultStatus(res) == PGRES_FATAL_ERROR) + pg_fatal("could not upload manifest: %s", + PQerrorMessage(conn)); + else if (PQresultStatus(res) != PGRES_COMMAND_OK) + pg_fatal("could not upload manifest: unexpected status %s", + PQresStatus(PQresultStatus(res))); + + /* Consume ReadyForQuery message from server. */ + res = PQgetResult(conn); + if (res != NULL) + pg_fatal("unexpected extra result while sending manifest"); + + /* Add INCREMENTAL option to BASE_BACKUP command. */ + AppendPlainCommandOption(&buf, use_new_option_syntax, "INCREMENTAL"); + } + + /* + * Continue building up the options list for the BASE_BACKUP command. */ AppendStringCommandOption(&buf, use_new_option_syntax, "LABEL", label); if (estimatesize) @@ -1897,6 +1989,7 @@ BaseBackup(char *compression_algorithm, char *compression_detail, else basebkp = psprintf("BASE_BACKUP %s", buf.data); + /* OK, try to start the backup. 
*/ if (PQsendQuery(conn, basebkp) == 0) pg_fatal("could not send replication command \"%s\": %s", "BASE_BACKUP", PQerrorMessage(conn)); @@ -2252,6 +2345,7 @@ main(int argc, char **argv) {"version", no_argument, NULL, 'V'}, {"pgdata", required_argument, NULL, 'D'}, {"format", required_argument, NULL, 'F'}, + {"incremental", required_argument, NULL, 'i'}, {"checkpoint", required_argument, NULL, 'c'}, {"create-slot", no_argument, NULL, 'C'}, {"max-rate", required_argument, NULL, 'r'}, @@ -2288,6 +2382,7 @@ main(int argc, char **argv) int option_index; char *compression_algorithm = "none"; char *compression_detail = NULL; + char *incremental_manifest = NULL; CompressionLocation compressloc = COMPRESS_LOCATION_UNSPECIFIED; pg_compress_specification client_compress; @@ -2312,7 +2407,7 @@ main(int argc, char **argv) atexit(cleanup_directories_atexit); - while ((c = getopt_long(argc, argv, "c:Cd:D:F:h:l:nNp:Pr:Rs:S:t:T:U:vwWX:zZ:", + while ((c = getopt_long(argc, argv, "c:Cd:D:F:h:i:l:nNp:Pr:Rs:S:t:T:U:vwWX:zZ:", long_options, &option_index)) != -1) { switch (c) @@ -2347,6 +2442,9 @@ main(int argc, char **argv) case 'h': dbhost = pg_strdup(optarg); break; + case 'i': + incremental_manifest = pg_strdup(optarg); + break; case 'l': label = pg_strdup(optarg); break; @@ -2756,7 +2854,7 @@ main(int argc, char **argv) } BaseBackup(compression_algorithm, compression_detail, compressloc, - &client_compress); + &client_compress, incremental_manifest); success = true; return 0; diff --git a/src/bin/pg_basebackup/t/010_pg_basebackup.pl b/src/bin/pg_basebackup/t/010_pg_basebackup.pl index 793d64863c..22a10477ec 100644 --- a/src/bin/pg_basebackup/t/010_pg_basebackup.pl +++ b/src/bin/pg_basebackup/t/010_pg_basebackup.pl @@ -223,10 +223,10 @@ SKIP: "check backup dir permissions"); } -# Only archive_status directory should be copied in pg_wal/. +# Only archive_status and summaries directories should be copied in pg_wal/. 
is_deeply( [ sort(slurp_dir("$tempdir/backup/pg_wal/")) ], - [ sort qw(. .. archive_status) ], + [ sort qw(. .. archive_status summaries) ], 'no WAL files copied'); # Contents of these directories should not be copied. diff --git a/src/bin/pg_combinebackup/.gitignore b/src/bin/pg_combinebackup/.gitignore new file mode 100644 index 0000000000..d7e617438c --- /dev/null +++ b/src/bin/pg_combinebackup/.gitignore @@ -0,0 +1 @@ +pg_combinebackup diff --git a/src/bin/pg_combinebackup/Makefile b/src/bin/pg_combinebackup/Makefile new file mode 100644 index 0000000000..cb20480aae --- /dev/null +++ b/src/bin/pg_combinebackup/Makefile @@ -0,0 +1,46 @@ +#------------------------------------------------------------------------- +# +# Makefile for src/bin/pg_combinebackup +# +# Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group +# Portions Copyright (c) 1994, Regents of the University of California +# +# src/bin/pg_combinebackup/Makefile +# +#------------------------------------------------------------------------- + +PGFILEDESC = "pg_combinebackup - combine incremental backups" +PGAPPICON=win32 + +subdir = src/bin/pg_combinebackup +top_builddir = ../../.. 
+include $(top_builddir)/src/Makefile.global + +override CPPFLAGS := -I$(libpq_srcdir) $(CPPFLAGS) +LDFLAGS_INTERNAL += -L$(top_builddir)/src/fe_utils -lpgfeutils + +OBJS = \ + $(WIN32RES) \ + pg_combinebackup.o \ + backup_label.o \ + copy_file.o \ + load_manifest.o \ + reconstruct.o \ + write_manifest.o + +all: pg_combinebackup + +pg_combinebackup: $(OBJS) | submake-libpgport submake-libpgfeutils + $(CC) $(CFLAGS) $^ $(LDFLAGS) $(LDFLAGS_EX) $(LIBS) -o $@$(X) + +install: all installdirs + $(INSTALL_PROGRAM) pg_combinebackup$(X) '$(DESTDIR)$(bindir)/pg_combinebackup$(X)' + +installdirs: + $(MKDIR_P) '$(DESTDIR)$(bindir)' + +uninstall: + rm -f '$(DESTDIR)$(bindir)/pg_combinebackup$(X)' + +clean distclean maintainer-clean: + rm -f pg_combinebackup$(X) $(OBJS) diff --git a/src/bin/pg_combinebackup/backup_label.c b/src/bin/pg_combinebackup/backup_label.c new file mode 100644 index 0000000000..2a62aa6fad --- /dev/null +++ b/src/bin/pg_combinebackup/backup_label.c @@ -0,0 +1,281 @@ +/*------------------------------------------------------------------------- + * + * Read and manipulate backup label files + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/bin/pg_combinebackup/backup_label.c + * + *------------------------------------------------------------------------- + */ +#include "postgres_fe.h" + +#include + +#include "access/xlogdefs.h" +#include "backup_label.h" +#include "common/logging.h" +#include "common/file_perm.h" +#include "write_manifest.h" + +static int get_eol_offset(StringInfo buf); +static bool line_starts_with(char *s, char *e, char *match, char **sout); +static bool parse_lsn(char *s, char *e, XLogRecPtr *lsn, char **c); +static bool parse_tli(char *s, char *e, TimeLineID *tli); + +/* + * Parse a backup label file, starting at buf->cursor. 
+ * + * We expect to find a START WAL LOCATION line, followed by a LSN, followed + * by a space; the resulting LSN is stored into *start_lsn. + * + * We expect to find a START TIMELINE line, followed by a TLI, followed by + * a newline; the resulting TLI is stored into *start_tli. + * + * We expect to find either both INCREMENTAL FROM LSN and INCREMENTAL FROM TLI + * or neither. If these are found, they should be followed by an LSN or TLI + * respectively and then by a newline, and the values will be stored into + * *previous_lsn and *previous_tli, respectively. + * + * Other lines in the provided backup_label data are ignored. filename is used + * for error reporting; errors are fatal. + */ +void +parse_backup_label(char *filename, StringInfo buf, + TimeLineID *start_tli, XLogRecPtr *start_lsn, + TimeLineID *previous_tli, XLogRecPtr *previous_lsn) +{ + int found = 0; + + *start_tli = 0; + *start_lsn = InvalidXLogRecPtr; + *previous_tli = 0; + *previous_lsn = InvalidXLogRecPtr; + + while (buf->cursor < buf->len) + { + char *s = &buf->data[buf->cursor]; + int eo = get_eol_offset(buf); + char *e = &buf->data[eo]; + char *c; + + if (line_starts_with(s, e, "START WAL LOCATION: ", &s)) + { + if (!parse_lsn(s, e, start_lsn, &c)) + pg_fatal("%s: could not parse START WAL LOCATION", + filename); + if (c >= e || *c != ' ') + pg_fatal("%s: improper terminator for START WAL LOCATION", + filename); + found |= 1; + } + else if (line_starts_with(s, e, "START TIMELINE: ", &s)) + { + if (!parse_tli(s, e, start_tli)) + pg_fatal("%s: could not parse TLI for START TIMELINE", + filename); + if (*start_tli == 0) + pg_fatal("%s: invalid TLI", filename); + found |= 2; + } + else if (line_starts_with(s, e, "INCREMENTAL FROM LSN: ", &s)) + { + if (!parse_lsn(s, e, previous_lsn, &c)) + pg_fatal("%s: could not parse INCREMENTAL FROM LSN", + filename); + if (c >= e || *c != '\n') + pg_fatal("%s: improper terminator for INCREMENTAL FROM LSN", + filename); + found |= 4; + } + else if 
(line_starts_with(s, e, "INCREMENTAL FROM TLI: ", &s)) + { + if (!parse_tli(s, e, previous_tli)) + pg_fatal("%s: could not parse INCREMENTAL FROM TLI", + filename); + if (*previous_tli == 0) + pg_fatal("%s: invalid TLI", filename); + found |= 8; + } + + buf->cursor = eo; + } + + if ((found & 1) == 0) + pg_fatal("%s: could not find START WAL LOCATION", filename); + if ((found & 2) == 0) + pg_fatal("%s: could not find START TIMELINE", filename); + if ((found & 4) != 0 && (found & 8) == 0) + pg_fatal("%s: INCREMENTAL FROM LSN requires INCREMENTAL FROM TLI", filename); + if ((found & 8) != 0 && (found & 4) == 0) + pg_fatal("%s: INCREMENTAL FROM TLI requires INCREMENTAL FROM LSN", filename); +} + +/* + * Write a backup label file to the output directory. + * + * This will be identical to the provided backup_label file, except that the + * INCREMENTAL FROM LSN and INCREMENTAL FROM TLI lines will be omitted. + * + * The new file will be checksummed using the specified algorithm. If + * mwriter != NULL, it will be added to the manifest. 
+ */ +void +write_backup_label(char *output_directory, StringInfo buf, + pg_checksum_type checksum_type, manifest_writer *mwriter) +{ + char output_filename[MAXPGPATH]; + int output_fd; + pg_checksum_context checksum_ctx; + uint8 checksum_payload[PG_CHECKSUM_MAX_LENGTH]; + int checksum_length; + + pg_checksum_init(&checksum_ctx, checksum_type); + + snprintf(output_filename, MAXPGPATH, "%s/backup_label", output_directory); + + if ((output_fd = open(output_filename, + O_WRONLY | O_CREAT | O_EXCL | PG_BINARY, + pg_file_create_mode)) < 0) + pg_fatal("could not open file \"%s\": %m", output_filename); + + while (buf->cursor < buf->len) + { + char *s = &buf->data[buf->cursor]; + int eo = get_eol_offset(buf); + char *e = &buf->data[eo]; + + if (!line_starts_with(s, e, "INCREMENTAL FROM LSN: ", NULL) && + !line_starts_with(s, e, "INCREMENTAL FROM TLI: ", NULL)) + { + ssize_t wb; + + wb = write(output_fd, s, e - s); + if (wb != e - s) + { + if (wb < 0) + pg_fatal("could not write file \"%s\": %m", output_filename); + else + pg_fatal("could not write file \"%s\": wrote only %d of %d bytes", + output_filename, (int) wb, (int) (e - s)); + } + if (pg_checksum_update(&checksum_ctx, (uint8 *) s, e - s) < 0) + pg_fatal("could not update checksum of file \"%s\"", + output_filename); + } + + buf->cursor = eo; + } + + if (close(output_fd) != 0) + pg_fatal("could not close \"%s\": %m", output_filename); + + checksum_length = pg_checksum_final(&checksum_ctx, checksum_payload); + + if (mwriter != NULL) + { + struct stat sb; + + /* + * We could track the length ourselves, but must stat() to get the + * mtime. + */ + if (stat(output_filename, &sb) < 0) + pg_fatal("could not stat file \"%s\": %m", output_filename); + add_file_to_manifest(mwriter, "backup_label", sb.st_size, + sb.st_mtime, checksum_type, + checksum_length, checksum_payload); + } +} + +/* + * Return the offset at which the next line in the buffer starts, or there + * is none, the offset at which the buffer ends. 
+ * + * The search begins at buf->cursor. + */ +static int +get_eol_offset(StringInfo buf) +{ + int eo = buf->cursor; + + while (eo < buf->len) + { + if (buf->data[eo] == '\n') + return eo + 1; + ++eo; + } + + return eo; +} + +/* + * Test whether the line that runs from s to e (inclusive of *s, but not + * inclusive of *e) starts with the match string provided, and return true + * or false according to whether or not this is the case. + * + * If the function returns true and if *sout != NULL, stores a pointer to the + * byte following the match into *sout. + */ +static bool +line_starts_with(char *s, char *e, char *match, char **sout) +{ + while (s < e && *match != '\0' && *s == *match) + ++s, ++match; + + if (*match == '\0' && sout != NULL) + *sout = s; + + return (*match == '\0'); +} + +/* + * Parse an LSN starting at s and not stopping at or before e. The return value + * is true on success and otherwise false. On success, stores the result into + * *lsn and sets *c to the first character that is not part of the LSN. + */ +static bool +parse_lsn(char *s, char *e, XLogRecPtr *lsn, char **c) +{ + char save = *e; + int nchars; + bool success; + unsigned hi; + unsigned lo; + + *e = '\0'; + success = (sscanf(s, "%X/%X%n", &hi, &lo, &nchars) == 2); + *e = save; + + if (success) + { + *lsn = ((XLogRecPtr) hi) << 32 | (XLogRecPtr) lo; + *c = s + nchars; + } + + return success; +} + +/* + * Parse a TLI starting at s and stopping at or before e. The return value is + * true on success and otherwise false. On success, stores the result into + * *tli. If the first character that is not part of the TLI is anything other + * than a newline, that is deemed a failure. 
+ */ +static bool +parse_tli(char *s, char *e, TimeLineID *tli) +{ + char save = *e; + int nchars; + bool success; + + *e = '\0'; + success = (sscanf(s, "%u%n", tli, &nchars) == 1); + *e = save; + + if (success && s[nchars] != '\n') + success = false; + + return success; +} diff --git a/src/bin/pg_combinebackup/backup_label.h b/src/bin/pg_combinebackup/backup_label.h new file mode 100644 index 0000000000..08d6ed67a9 --- /dev/null +++ b/src/bin/pg_combinebackup/backup_label.h @@ -0,0 +1,29 @@ +/*------------------------------------------------------------------------- + * + * Read and manipulate backup label files + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/bin/pg_combinebackup/backup_label.h + * + *------------------------------------------------------------------------- + */ +#ifndef BACKUP_LABEL_H +#define BACKUP_LABEL_H + +#include "common/checksum_helper.h" +#include "lib/stringinfo.h" + +struct manifest_writer; + +extern void parse_backup_label(char *filename, StringInfo buf, + TimeLineID *start_tli, + XLogRecPtr *start_lsn, + TimeLineID *previous_tli, + XLogRecPtr *previous_lsn); +extern void write_backup_label(char *output_directory, StringInfo buf, + pg_checksum_type checksum_type, + struct manifest_writer *mwriter); + +#endif /* BACKUP_LABEL_H */ diff --git a/src/bin/pg_combinebackup/copy_file.c b/src/bin/pg_combinebackup/copy_file.c new file mode 100644 index 0000000000..8ba6cc09e4 --- /dev/null +++ b/src/bin/pg_combinebackup/copy_file.c @@ -0,0 +1,169 @@ +/* + * Copy entire files. 
+ * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/bin/pg_combinebackup/copy_file.h + * + *------------------------------------------------------------------------- + */ +#include "postgres_fe.h" + +#ifdef HAVE_COPYFILE_H +#include +#endif +#include +#include +#include + +#include "common/file_perm.h" +#include "common/logging.h" +#include "copy_file.h" + +static void copy_file_blocks(const char *src, const char *dst, + pg_checksum_context *checksum_ctx); + +#ifdef WIN32 +static void copy_file_copyfile(const char *src, const char *dst); +#endif + +/* + * Copy a regular file, optionally computing a checksum, and emitting + * appropriate debug messages. But if we're in dry-run mode, then just emit + * the messages and don't copy anything. + */ +void +copy_file(const char *src, const char *dst, + pg_checksum_context *checksum_ctx, bool dry_run) +{ + /* + * In dry-run mode, we don't actually copy anything, nor do we read any + * data from the source file, but we do verify that we can open it. + */ + if (dry_run) + { + int fd; + + if ((fd = open(src, O_RDONLY | PG_BINARY)) < 0) + pg_fatal("could not open \"%s\": %m", src); + if (close(fd) < 0) + pg_fatal("could not close \"%s\": %m", src); + } + + /* + * If we don't need to compute a checksum, then we can use any special + * operating system primitives that we know about to copy the file; this + * may be quicker than a naive block copy. 
+ */ + if (checksum_ctx->type != CHECKSUM_TYPE_NONE) + { + char *strategy_name = NULL; + void (*strategy_implementation) (const char *, const char *) = NULL; + +#ifdef WIN32 + strategy_name = "CopyFile"; + strategy_implementation = copy_file_copyfile; +#endif + + if (strategy_name != NULL) + { + if (dry_run) + pg_log_debug("would copy \"%s\" to \"%s\" using strategy %s", + src, dst, strategy_name); + else + { + pg_log_debug("copying \"%s\" to \"%s\" using strategy %s", + src, dst, strategy_name); + (*strategy_implementation) (src, dst); + } + return; + } + } + + /* + * Fall back to the simple approach of reading and writing all the blocks, + * feeding them into the checksum context as we go. + */ + if (dry_run) + { + if (checksum_ctx->type == CHECKSUM_TYPE_NONE) + pg_log_debug("would copy \"%s\" to \"%s\"", + src, dst); + else + pg_log_debug("would copy \"%s\" to \"%s\" and checksum with %s", + src, dst, pg_checksum_type_name(checksum_ctx->type)); + } + else + { + if (checksum_ctx->type == CHECKSUM_TYPE_NONE) + pg_log_debug("copying \"%s\" to \"%s\"", + src, dst); + else + pg_log_debug("copying \"%s\" to \"%s\" and checksumming with %s", + src, dst, pg_checksum_type_name(checksum_ctx->type)); + copy_file_blocks(src, dst, checksum_ctx); + } +} + +/* + * Copy a file block by block, and optionally compute a checksum as we go. 
+ */ +static void +copy_file_blocks(const char *src, const char *dst, + pg_checksum_context *checksum_ctx) +{ + int src_fd; + int dest_fd; + uint8 *buffer; + const int buffer_size = 50 * BLCKSZ; + ssize_t rb; + unsigned offset = 0; + + if ((src_fd = open(src, O_RDONLY | PG_BINARY, 0)) < 0) + pg_fatal("could not open file \"%s\": %m", src); + + if ((dest_fd = open(dst, O_WRONLY | O_CREAT | O_EXCL | PG_BINARY, + pg_file_create_mode)) < 0) + pg_fatal("could not open file \"%s\": %m", dst); + + buffer = pg_malloc(buffer_size); + + while ((rb = read(src_fd, buffer, buffer_size)) > 0) + { + ssize_t wb; + + if ((wb = write(dest_fd, buffer, rb)) != rb) + { + if (wb < 0) + pg_fatal("could not write file \"%s\": %m", dst); + else + pg_fatal("could not write file \"%s\": wrote only %d of %d bytes at offset %u", + dst, (int) wb, (int) rb, offset); + } + + if (pg_checksum_update(checksum_ctx, buffer, rb) < 0) + pg_fatal("could not update checksum of file \"%s\"", dst); + + offset += rb; + } + + if (rb < 0) + pg_fatal("could not read file \"%s\": %m", dst); + + pg_free(buffer); + close(src_fd); + close(dest_fd); +} + +#ifdef WIN32 +static void +copy_file_copyfile(const char *src, const char *dst) +{ + if (CopyFile(src, dst, true) == 0) + { + _dosmaperr(GetLastError()); + pg_fatal("could not copy \"%s\" to \"%s\": %m", src, dst); + } +} +#endif /* WIN32 */ diff --git a/src/bin/pg_combinebackup/copy_file.h b/src/bin/pg_combinebackup/copy_file.h new file mode 100644 index 0000000000..031030bacb --- /dev/null +++ b/src/bin/pg_combinebackup/copy_file.h @@ -0,0 +1,19 @@ +/* + * Copy entire files. 
+ * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/bin/pg_combinebackup/copy_file.h + * + *------------------------------------------------------------------------- + */ +#ifndef COPY_FILE_H +#define COPY_FILE_H + +#include "common/checksum_helper.h" + +extern void copy_file(const char *src, const char *dst, + pg_checksum_context *checksum_ctx, bool dry_run); + +#endif /* COPY_FILE_H */ diff --git a/src/bin/pg_combinebackup/load_manifest.c b/src/bin/pg_combinebackup/load_manifest.c new file mode 100644 index 0000000000..d0b8de7912 --- /dev/null +++ b/src/bin/pg_combinebackup/load_manifest.c @@ -0,0 +1,245 @@ +/*------------------------------------------------------------------------- + * + * Load data from a backup manifest into memory. + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/bin/pg_combinebackup/load_manifest.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres_fe.h" + +#include +#include + +#include "common/hashfn.h" +#include "common/logging.h" +#include "common/parse_manifest.h" +#include "load_manifest.h" + +/* + * For efficiency, we'd like our hash table containing information about the + * manifest to start out with approximately the correct number of entries. + * There's no way to know the exact number of entries without reading the whole + * file, but we can get an estimate by dividing the file size by the estimated + * number of bytes per line. + * + * This could be off by about a factor of two in either direction, because the + * checksum algorithm has a big impact on the line lengths; e.g. a SHA512 + * checksum is 128 hex bytes, whereas a CRC-32C value is only 8, and there + * might be no checksum at all. 
+ */ +#define ESTIMATED_BYTES_PER_MANIFEST_LINE 100 + +/* + * Define a hash table which we can use to store information about the files + * mentioned in the backup manifest. + */ +static uint32 hash_string_pointer(char *s); +#define SH_PREFIX manifest_files +#define SH_ELEMENT_TYPE manifest_file +#define SH_KEY_TYPE char * +#define SH_KEY pathname +#define SH_HASH_KEY(tb, key) hash_string_pointer(key) +#define SH_EQUAL(tb, a, b) (strcmp(a, b) == 0) +#define SH_SCOPE extern +#define SH_RAW_ALLOCATOR pg_malloc0 +#define SH_DEFINE +#include "lib/simplehash.h" + +static void record_manifest_details_for_file(JsonManifestParseContext *context, + char *pathname, size_t size, + pg_checksum_type checksum_type, + int checksum_length, + uint8 *checksum_payload); +static void record_manifest_details_for_wal_range(JsonManifestParseContext *context, + TimeLineID tli, + XLogRecPtr start_lsn, + XLogRecPtr end_lsn); +static void report_manifest_error(JsonManifestParseContext *context, + const char *fmt,...) + pg_attribute_printf(2, 3) pg_attribute_noreturn(); + +/* + * Load backup_manifest files from an array of backups and produces an array + * of manifest_data objects. + * + * NB: Since load_backup_manifest() can return NULL, the resulting array could + * contain NULL entries. + */ +manifest_data ** +load_backup_manifests(int n_backups, char **backup_directories) +{ + manifest_data **result; + int i; + + result = pg_malloc(sizeof(manifest_data *) * n_backups); + for (i = 0; i < n_backups; ++i) + result[i] = load_backup_manifest(backup_directories[i]); + + return result; +} + +/* + * Parse the backup_manifest file in the named backup directory. Construct a + * hash table with information about all the files it mentions, and a linked + * list of all the WAL ranges it mentions. + * + * If the backup_manifest file simply doesn't exist, logs a warning and returns + * NULL. Any other error, or any error parsing the contents of the file, is + * fatal. 
+ */ +manifest_data * +load_backup_manifest(char *backup_directory) +{ + char pathname[MAXPGPATH]; + int fd; + struct stat statbuf; + off_t estimate; + uint32 initial_size; + manifest_files_hash *ht; + char *buffer; + int rc; + JsonManifestParseContext context; + manifest_data *result; + + /* Open the manifest file. */ + snprintf(pathname, MAXPGPATH, "%s/backup_manifest", backup_directory); + if ((fd = open(pathname, O_RDONLY | PG_BINARY, 0)) < 0) + { + if (errno == EEXIST) + { + pg_log_warning("\"%s\" does not exist", pathname); + return NULL; + } + pg_fatal("could not open file \"%s\": %m", pathname); + } + + /* Figure out how big the manifest is. */ + if (fstat(fd, &statbuf) != 0) + pg_fatal("could not stat file \"%s\": %m", pathname); + + /* Guess how large to make the hash table based on the manifest size. */ + estimate = statbuf.st_size / ESTIMATED_BYTES_PER_MANIFEST_LINE; + initial_size = Min(PG_UINT32_MAX, Max(estimate, 256)); + + /* Create the hash table. */ + ht = manifest_files_create(initial_size, NULL); + + /* + * Slurp in the whole file. + * + * This is not ideal, but there's currently no way to get pg_parse_json() + * to perform incremental parsing. + */ + buffer = pg_malloc(statbuf.st_size); + rc = read(fd, buffer, statbuf.st_size); + if (rc != statbuf.st_size) + { + if (rc < 0) + pg_fatal("could not read file \"%s\": %m", pathname); + else + pg_fatal("could not read file \"%s\": read %d of %lld", + pathname, rc, (long long int) statbuf.st_size); + } + + /* Close the manifest file. */ + close(fd); + + /* Parse the manifest. */ + result = pg_malloc0(sizeof(manifest_data)); + result->files = ht; + context.private_data = result; + context.perfile_cb = record_manifest_details_for_file; + context.perwalrange_cb = record_manifest_details_for_wal_range; + context.error_cb = report_manifest_error; + json_parse_manifest(&context, buffer, statbuf.st_size); + + /* All done. 
 */
	pfree(buffer);
	return result;
}

/*
 * Report an error while parsing the manifest.
 *
 * We consider all such errors to be fatal errors. The manifest parser
 * expects this function not to return.
 */
static void
report_manifest_error(JsonManifestParseContext *context, const char *fmt,...)
{
	va_list		ap;

	/* Route the formatted message through the common frontend logger. */
	va_start(ap, fmt);
	pg_log_generic_v(PG_LOG_ERROR, PG_LOG_PRIMARY, gettext(fmt), ap);
	va_end(ap);

	exit(1);
}

/*
 * Record details extracted from the backup manifest for one file.
 */
static void
record_manifest_details_for_file(JsonManifestParseContext *context,
								 char *pathname, size_t size,
								 pg_checksum_type checksum_type,
								 int checksum_length, uint8 *checksum_payload)
{
	manifest_data *manifest = context->private_data;
	manifest_file *m;
	bool		found;

	/* Make a new entry in the hash table for this file. */
	m = manifest_files_insert(manifest->files, pathname, &found);
	if (found)
		pg_fatal("duplicate path name in backup manifest: \"%s\"", pathname);

	/* Initialize the entry. */
	m->size = size;
	m->checksum_type = checksum_type;
	m->checksum_length = checksum_length;
	m->checksum_payload = checksum_payload;
}

/*
 * Record details extracted from the backup manifest for one WAL range.
 */
static void
record_manifest_details_for_wal_range(JsonManifestParseContext *context,
									  TimeLineID tli,
									  XLogRecPtr start_lsn, XLogRecPtr end_lsn)
{
	manifest_data *manifest = context->private_data;
	manifest_wal_range *range;

	/* Allocate and initialize a struct describing this WAL range. */
	range = palloc(sizeof(manifest_wal_range));
	range->tli = tli;
	range->start_lsn = start_lsn;
	range->end_lsn = end_lsn;
	range->prev = manifest->last_wal_range;
	range->next = NULL;

	/* Add it to the end of the list. */
	if (manifest->first_wal_range == NULL)
		manifest->first_wal_range = range;
	else
		manifest->last_wal_range->next = range;
	manifest->last_wal_range = range;
}

/*
 * Helper function for manifest_files hash table.
 */
static uint32
hash_string_pointer(char *s)
{
	unsigned char *ss = (unsigned char *) s;

	return hash_bytes(ss, strlen(s));
}
diff --git a/src/bin/pg_combinebackup/load_manifest.h b/src/bin/pg_combinebackup/load_manifest.h
new file mode 100644
index 0000000000..2bfeeff156
--- /dev/null
+++ b/src/bin/pg_combinebackup/load_manifest.h
@@ -0,0 +1,67 @@
/*-------------------------------------------------------------------------
 *
 * Load data from a backup manifest into memory.
 *
 * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/bin/pg_combinebackup/load_manifest.h
 *
 *-------------------------------------------------------------------------
 */
#ifndef LOAD_MANIFEST_H
#define LOAD_MANIFEST_H

#include "access/xlogdefs.h"
#include "common/checksum_helper.h"

/*
 * Each file described by the manifest file is parsed to produce an object
 * like this.
 */
typedef struct manifest_file
{
	uint32		status;			/* hash status */
	char	   *pathname;
	size_t		size;
	pg_checksum_type checksum_type;
	int			checksum_length;
	uint8	   *checksum_payload;
} manifest_file;

#define SH_PREFIX		manifest_files
#define SH_ELEMENT_TYPE	manifest_file
#define SH_KEY_TYPE		char *
#define SH_SCOPE		extern
#define SH_RAW_ALLOCATOR	pg_malloc0
#define SH_DECLARE
#include "lib/simplehash.h"

/*
 * Each WAL range described by the manifest file is parsed to produce an
 * object like this.
+ */ +typedef struct manifest_wal_range +{ + TimeLineID tli; + XLogRecPtr start_lsn; + XLogRecPtr end_lsn; + struct manifest_wal_range *next; + struct manifest_wal_range *prev; +} manifest_wal_range; + +/* + * All the data parsed from a backup_manifest file. + */ +typedef struct manifest_data +{ + manifest_files_hash *files; + manifest_wal_range *first_wal_range; + manifest_wal_range *last_wal_range; +} manifest_data; + +extern manifest_data *load_backup_manifest(char *backup_directory); +extern manifest_data **load_backup_manifests(int n_backups, + char **backup_directories); + +#endif /* LOAD_MANIFEST_H */ diff --git a/src/bin/pg_combinebackup/meson.build b/src/bin/pg_combinebackup/meson.build new file mode 100644 index 0000000000..bea0db405e --- /dev/null +++ b/src/bin/pg_combinebackup/meson.build @@ -0,0 +1,29 @@ +# Copyright (c) 2022-2023, PostgreSQL Global Development Group + +pg_combinebackup_sources = files( + 'pg_combinebackup.c', + 'backup_label.c', + 'copy_file.c', + 'load_manifest.c', + 'reconstruct.c', + 'write_manifest.c', +) + +if host_system == 'windows' + pg_combinebackup_sources += rc_bin_gen.process(win32ver_rc, extra_args: [ + '--NAME', 'pg_combinebackup', + '--FILEDESC', 'pg_combinebackup - combine incremental backups',]) +endif + +pg_combinebackup = executable('pg_combinebackup', + pg_combinebackup_sources, + dependencies: [frontend_code], + kwargs: default_bin_args, +) +bin_targets += pg_combinebackup + +tests += { + 'name': 'pg_combinebackup', + 'sd': meson.current_source_dir(), + 'bd': meson.current_build_dir() +} diff --git a/src/bin/pg_combinebackup/pg_combinebackup.c b/src/bin/pg_combinebackup/pg_combinebackup.c new file mode 100644 index 0000000000..6c7fd3290e --- /dev/null +++ b/src/bin/pg_combinebackup/pg_combinebackup.c @@ -0,0 +1,1268 @@ +/*------------------------------------------------------------------------- + * + * pg_combinebackup.c + * Combine incremental backups with prior backups. 
+ * + * Copyright (c) 2017-2023, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/bin/pg_combinebackup/pg_combinebackup.c + * + *------------------------------------------------------------------------- + */ +#include "postgres_fe.h" + +#include +#include +#include + +#include "access/xlogdefs.h" +#include "backup_label.h" +#include "common/blkreftable.h" +#include "common/checksum_helper.h" +#include "common/controldata_utils.h" +#include "common/file_perm.h" +#include "common/file_utils.h" +#include "common/logging.h" +#include "copy_file.h" +#include "fe_utils/option_utils.h" +#include "lib/stringinfo.h" +#include "load_manifest.h" +#include "getopt_long.h" +#include "reconstruct.h" +#include "write_manifest.h" + +/* Incremental file naming convention. */ +#define INCREMENTAL_PREFIX "INCREMENTAL." +#define INCREMENTAL_PREFIX_LENGTH 12 + +/* + * Tracking for directories that need to be removed, or have their contents + * removed, if the operation fails. + */ +typedef struct cb_cleanup_dir +{ + char *target_path; + bool rmtopdir; + struct cb_cleanup_dir *next; +} cb_cleanup_dir; + +/* + * Stores a tablespace mapping provided using -T, --tablespace-mapping. + */ +typedef struct cb_tablespace_mapping +{ + char old_dir[MAXPGPATH]; + char new_dir[MAXPGPATH]; + struct cb_tablespace_mapping *next; +} cb_tablespace_mapping; + +/* + * Stores data parsed from all command-line options. + */ +typedef struct cb_options +{ + bool debug; + char *output; + bool dry_run; + bool no_sync; + bool progress; + cb_tablespace_mapping *tsmappings; + pg_checksum_type manifest_checksums; + bool no_manifest; +} cb_options; + +/* + * Data about a tablespace. + * + * Every normal tablespace needs a tablespace mapping, but in-place tablespaces + * don't, so the list of tablespaces can contain more entries than the list of + * tablespace mappings. 
+ */ +typedef struct cb_tablespace +{ + Oid oid; + bool in_place; + char old_dir[MAXPGPATH]; + char new_dir[MAXPGPATH]; + struct cb_tablespace *next; +} cb_tablespace; + +/* Directories to be removed if we exit uncleanly. */ +cb_cleanup_dir *cleanup_dir_list = NULL; + +static void add_tablespace_mapping(cb_options *opt, char *arg); +static StringInfo check_backup_label_files(int n_backups, char **backup_dirs); +static void check_control_files(int n_backups, char **backup_dirs); +static void check_input_dir_permissions(char *dir); +static void cleanup_directories_atexit(void); +static void create_output_directory(char *dirname, cb_options *opt); +static void help(const char *progname); +static bool parse_oid(char *s, Oid *result); +static void process_directory_recursively(Oid tsoid, + char *input_directory, + char *output_directory, + char *relative_path, + int n_prior_backups, + char **prior_backup_dirs, + manifest_data **manifests, + manifest_writer *mwriter, + cb_options *opt); +static int read_pg_version_file(char *directory); +static void remember_to_cleanup_directory(char *target_path, bool rmtopdir); +static void reset_directory_cleanup_list(void); +static cb_tablespace *scan_for_existing_tablespaces(char *pathname, + cb_options *opt); +static void slurp_file(int fd, char *filename, StringInfo buf, int maxlen); + +/* + * Main program. 
+ */ +int +main(int argc, char *argv[]) +{ + static struct option long_options[] = { + {"debug", no_argument, NULL, 'd'}, + {"output", required_argument, NULL, 'o'}, + {"dry-run", no_argument, NULL, 'n'}, + {"no-sync", no_argument, NULL, 'N'}, + {"progress", no_argument, NULL, 'P'}, + {"tablespace-mapping", no_argument, NULL, 'T'}, + {"manifest-checksums", required_argument, NULL, 1}, + {"no-manifest", no_argument, NULL, 2}, + {NULL, 0, NULL, 0} + }; + + const char *progname; + char *last_input_dir; + int optindex; + int c; + int n_backups; + int n_prior_backups; + int version; + char **prior_backup_dirs; + cb_options opt; + cb_tablespace *tablespaces; + cb_tablespace *ts; + StringInfo last_backup_label; + manifest_data **manifests; + manifest_writer *mwriter; + + pg_logging_init(argv[0]); + progname = get_progname(argv[0]); + handle_help_version_opts(argc, argv, progname, help); + + memset(&opt, 0, sizeof(opt)); + opt.manifest_checksums = CHECKSUM_TYPE_CRC32C; + + /* process command-line options */ + while ((c = getopt_long(argc, argv, "do:nNPT:", + long_options, &optindex)) != -1) + { + switch (c) + { + case 'd': + opt.debug = true; + pg_logging_increase_verbosity(); + break; + case 'o': + opt.output = optarg; + break; + case 'n': + opt.dry_run = true; + break; + case 'N': + opt.no_sync = true; + break; + case 'P': + opt.progress = true; + break; + case 'T': + add_tablespace_mapping(&opt, optarg); + break; + case 1: + if (!pg_checksum_parse_type(optarg, + &opt.manifest_checksums)) + pg_fatal("unrecognized checksum algorithm: \"%s\"", + optarg); + break; + case 2: + opt.no_manifest = true; + break; + default: + /* getopt_long already emitted a complaint */ + pg_log_error_hint("Try \"%s --help\" for more information.", progname); + exit(1); + } + } + + if (optind >= argc) + { + pg_log_error("%s: no input directories specified", progname); + pg_log_error_hint("Try \"%s --help\" for more information.", progname); + exit(1); + } + + if (opt.output == NULL) + 
pg_fatal("no output directory specified"); + + /* If no manifest is needed, no checksums are needed, either. */ + if (opt.no_manifest) + opt.manifest_checksums = CHECKSUM_TYPE_NONE; + + /* Read the server version from the final backup. */ + version = read_pg_version_file(argv[argc - 1]); + + /* Sanity-check control files. */ + n_backups = argc - optind; + check_control_files(n_backups, argv + optind); + + /* Sanity-check backup_label files, and get the contents of the last one. */ + last_backup_label = check_backup_label_files(n_backups, argv + optind); + + /* Load backup manifests. */ + manifests = load_backup_manifests(n_backups, argv + optind); + + /* Figure out which tablespaces are going to be included in the output. */ + last_input_dir = argv[argc - 1]; + check_input_dir_permissions(last_input_dir); + tablespaces = scan_for_existing_tablespaces(last_input_dir, &opt); + + /* + * Create output directories. + * + * We create one output directory for the main data directory plus one for + * each non-in-place tablespace. create_output_directory() will arrange + * for those directories to be cleaned up on failure. In-place tablespaces + * aren't handled at this stage because they're located beneath the main + * output directory, and thus the cleanup of that directory will get rid + * of them. Plus, the pg_tblspc directory that needs to contain them + * doesn't exist yet. + */ + atexit(cleanup_directories_atexit); + create_output_directory(opt.output, &opt); + for (ts = tablespaces; ts != NULL; ts = ts->next) + if (!ts->in_place) + create_output_directory(ts->new_dir, &opt); + + /* If we need to write a backup_manifest, prepare to do so. */ + if (!opt.dry_run && !opt.no_manifest) + mwriter = create_manifest_writer(opt.output); + else + mwriter = NULL; + + /* Write backup label into output directory. 
*/ + if (opt.dry_run) + pg_log_debug("would generate \"%s/backup_label\"", opt.output); + else + { + pg_log_debug("generating \"%s/backup_label\"", opt.output); + last_backup_label->cursor = 0; + write_backup_label(opt.output, last_backup_label, + opt.manifest_checksums, mwriter); + } + + /* + * We'll need the pathnames to the prior backups. By "prior" we mean all + * but the last one listed on the command line. + */ + n_prior_backups = argc - optind - 1; + prior_backup_dirs = argv + optind; + + /* Process everything that's not part of a user-defined tablespace. */ + pg_log_debug("processing backup directory \"%s\"", last_input_dir); + process_directory_recursively(InvalidOid, last_input_dir, opt.output, + NULL, n_prior_backups, prior_backup_dirs, + manifests, mwriter, &opt); + + /* Process user-defined tablespaces. */ + for (ts = tablespaces; ts != NULL; ts = ts->next) + { + pg_log_debug("processing tablespace directory \"%s\"", ts->old_dir); + + /* + * If it's a normal tablespace, we need to set up a symbolic link from + * pg_tblspc/${OID} to the target directory; if it's an in-place + * tablespace, we need to create a directory at pg_tblspc/${OID}. 
+ */ + if (!ts->in_place) + { + char linkpath[MAXPGPATH]; + + snprintf(linkpath, MAXPGPATH, "%s/pg_tblspc/%u", opt.output, + ts->oid); + + if (opt.dry_run) + pg_log_debug("would create symbolic link from \"%s\" to \"%s\"", + linkpath, ts->new_dir); + else + { + pg_log_debug("creating symbolic link from \"%s\" to \"%s\"", + linkpath, ts->new_dir); + if (symlink(ts->new_dir, linkpath) != 0) + pg_fatal("could not create symbolic link from \"%s\" to \"%s\": %m", + linkpath, ts->new_dir); + } + } + else + { + if (opt.dry_run) + pg_log_debug("would create directory \"%s\"", ts->new_dir); + else + { + pg_log_debug("creating directory \"%s\"", ts->new_dir); + if (pg_mkdir_p(ts->new_dir, pg_dir_create_mode) == -1) + pg_fatal("could not create directory \"%s\": %m", + ts->new_dir); + } + } + + /* OK, now handle the directory contents. */ + process_directory_recursively(ts->oid, ts->old_dir, ts->new_dir, + NULL, n_prior_backups, prior_backup_dirs, + manifests, mwriter, &opt); + } + + /* Finalize the backup_manifest, if we're generating one. */ + if (mwriter != NULL) + finalize_manifest(mwriter, + manifests[n_prior_backups]->first_wal_range); + + /* fsync that output directory unless we've been told not to do so */ + if (!opt.no_sync) + { + if (opt.dry_run) + pg_log_debug("would recursively fsync \"%s\"", opt.output); + else + { + pg_log_debug("recursively fsyncing \"%s\"", opt.output); + fsync_pgdata(opt.output, version * 10000); + } + } + + /* It's a success, so don't remove the output directories. */ + reset_directory_cleanup_list(); + exit(0); +} + +/* + * Process the option argument for the -T, --tablespace-mapping switch. 
+ */ +static void +add_tablespace_mapping(cb_options *opt, char *arg) +{ + cb_tablespace_mapping *tsmap = pg_malloc0(sizeof(cb_tablespace_mapping)); + char *dst; + char *dst_ptr; + char *arg_ptr; + + /* + * Basically, we just want to copy everything before the equals sign to + * tsmap->old_dir and everything afterwards to tsmap->new_dir, but if + * there's more or less than one equals sign, that's an error, and if + * there's an equals sign preceded by a backslash, don't treat it as a + * field separator but instead copy a literal equals sign. + */ + dst_ptr = dst = tsmap->old_dir; + for (arg_ptr = arg; *arg_ptr != '\0'; arg_ptr++) + { + if (dst_ptr - dst >= MAXPGPATH) + pg_fatal("directory name too long"); + + if (*arg_ptr == '\\' && *(arg_ptr + 1) == '=') + ; /* skip backslash escaping = */ + else if (*arg_ptr == '=' && (arg_ptr == arg || *(arg_ptr - 1) != '\\')) + { + if (tsmap->new_dir[0] != '\0') + pg_fatal("multiple \"=\" signs in tablespace mapping"); + else + dst = dst_ptr = tsmap->new_dir; + } + else + *dst_ptr++ = *arg_ptr; + } + if (!tsmap->old_dir[0] || !tsmap->new_dir[0]) + pg_fatal("invalid tablespace mapping format \"%s\", must be \"OLDDIR=NEWDIR\"", arg); + + /* + * All tablespaces are created with absolute directories, so specifying a + * non-absolute path here would never match, possibly confusing users. + * + * In contrast to pg_basebackup, both the old and new directories are on + * the local machine, so the local machine's definition of an absolute + * path is the only relevant one. + */ + if (!is_absolute_path(tsmap->old_dir)) + pg_fatal("old directory is not an absolute path in tablespace mapping: %s", + tsmap->old_dir); + + if (!is_absolute_path(tsmap->new_dir)) + pg_fatal("old directory is not an absolute path in tablespace mapping: %s", + tsmap->new_dir); + + /* Canonicalize paths to avoid spurious failures when comparing. */ + canonicalize_path(tsmap->old_dir); + canonicalize_path(tsmap->new_dir); + + /* Add it to the list. 
 */
	tsmap->next = opt->tsmappings;
	opt->tsmappings = tsmap;
}

/*
 * Check that the backup_label files form a coherent backup chain, and return
 * the contents of the backup_label file from the latest backup.
 */
static StringInfo
check_backup_label_files(int n_backups, char **backup_dirs)
{
	StringInfo	buf = makeStringInfo();
	StringInfo	lastbuf = buf;
	int			i;
	TimeLineID	check_tli = 0;
	XLogRecPtr	check_lsn = InvalidXLogRecPtr;

	/* Try to read each backup_label file in turn, last to first. */
	for (i = n_backups - 1; i >= 0; --i)
	{
		char		pathbuf[MAXPGPATH];
		int			fd;
		TimeLineID	start_tli;
		TimeLineID	previous_tli;
		XLogRecPtr	start_lsn;
		XLogRecPtr	previous_lsn;

		/* Open the backup_label file. */
		snprintf(pathbuf, MAXPGPATH, "%s/backup_label", backup_dirs[i]);
		pg_log_debug("reading \"%s\"", pathbuf);
		if ((fd = open(pathbuf, O_RDONLY, 0)) < 0)
			pg_fatal("could not open file \"%s\": %m", pathbuf);

		/*
		 * Slurp the whole file into memory.
		 *
		 * The exact size limit that we impose here doesn't really matter --
		 * most of what's supposed to be in the file is fixed size and quite
		 * short. However, the length of the backup_label is limited (at least
		 * by some parts of the code) to MAXPGPATH, so include that value in
		 * the maximum length that we tolerate.
		 */
		slurp_file(fd, pathbuf, buf, 10000 + MAXPGPATH);

		/* Close the file. */
		if (close(fd) != 0)
			pg_fatal("could not close \"%s\": %m", pathbuf);

		/* Parse the file contents. */
		parse_backup_label(pathbuf, buf, &start_tli, &start_lsn,
						   &previous_tli, &previous_lsn);

		/*
		 * Sanity checks. A zero previous_tli marks a full backup; a nonzero
		 * one marks an incremental whose start must match the next-newer
		 * backup's recorded previous position.
		 *
		 * XXX. It's actually not required that start_lsn == check_lsn. It
		 * would be OK if start_lsn > check_lsn provided that start_lsn is
		 * less than or equal to the relevant switchpoint. But at the moment
		 * we don't have that information.
		 */
		if (i > 0 && previous_tli == 0)
			pg_fatal("backup at \"%s\" is a full backup, but only the first backup should be a full backup",
					 backup_dirs[i]);
		if (i == 0 && previous_tli != 0)
			pg_fatal("backup at \"%s\" is an incremental backup, but the first backup should be a full backup",
					 backup_dirs[i]);
		if (i < n_backups - 1 && start_tli != check_tli)
			pg_fatal("backup at \"%s\" starts on timeline %u, but expected %u",
					 backup_dirs[i], start_tli, check_tli);
		if (i < n_backups - 1 && start_lsn != check_lsn)
			pg_fatal("backup at \"%s\" starts at LSN %X/%X, but expected %X/%X",
					 backup_dirs[i],
					 LSN_FORMAT_ARGS(start_lsn),
					 LSN_FORMAT_ARGS(check_lsn));
		check_tli = previous_tli;
		check_lsn = previous_lsn;

		/*
		 * The last backup label in the chain needs to be saved for later use,
		 * while the others are only needed within this loop.
		 */
		if (lastbuf == buf)
			buf = makeStringInfo();
		else
			resetStringInfo(buf);
	}

	/* Free memory that we don't need any more. */
	if (lastbuf != buf)
	{
		pfree(buf->data);
		pfree(buf);
	}

	/*
	 * Return the data from the first backup_info that we read (which is the
	 * backup_label from the last directory specified on the command line).
	 */
	return lastbuf;
}

/*
 * Sanity check control files.
 */
static void
check_control_files(int n_backups, char **backup_dirs)
{
	int			i;
	uint64		system_identifier;

	/* Try to read each control file in turn, last to first. */
	for (i = n_backups - 1; i >= 0; --i)
	{
		ControlFileData *control_file;
		bool		crc_ok;

		pg_log_debug("reading \"%s/global/pg_control\"", backup_dirs[i]);
		control_file = get_controlfile(backup_dirs[i], &crc_ok);

		/* Control file contents not meaningful if CRC is bad. */
		if (!crc_ok)
			pg_fatal("%s/global/pg_control: crc is incorrect", backup_dirs[i]);

		/* Can't interpret control file if not current version. */
		if (control_file->pg_control_version != PG_CONTROL_VERSION)
			pg_fatal("%s/global/pg_control: unexpected control file version",
					 backup_dirs[i]);

		/*
		 * System identifiers should all match. The newest backup (processed
		 * first) establishes the reference value.
		 */
		if (i == n_backups - 1)
			system_identifier = control_file->system_identifier;
		else if (system_identifier != control_file->system_identifier)
			pg_fatal("%s/global/pg_control: expected system identifier %llu, but found %llu",
					 backup_dirs[i], (unsigned long long) system_identifier,
					 (unsigned long long) control_file->system_identifier);

		/* Release memory. */
		pfree(control_file);
	}

	/*
	 * If debug output is enabled, make a note of the system identifier that
	 * we found in all of the relevant control files.
	 */
	pg_log_debug("system identifier is %llu",
				 (unsigned long long) system_identifier);
}

/*
 * Set default permissions for new files and directories based on the
 * permissions of the given directory. The intent here is that the output
 * directory should use the same permissions scheme as the final input
 * directory.
 */
static void
check_input_dir_permissions(char *dir)
{
	struct stat st;

	if (stat(dir, &st) != 0)
		pg_fatal("could not stat \"%s\": %m", dir);

	SetDataDirectoryCreatePerm(st.st_mode);
}

/*
 * Clean up output directories before exiting.
+ */ +static void +cleanup_directories_atexit(void) +{ + while (cleanup_dir_list != NULL) + { + cb_cleanup_dir *dir = cleanup_dir_list; + + if (dir->rmtopdir) + { + pg_log_info("removing output directory \"%s\"", dir->target_path); + if (!rmtree(dir->target_path, dir->rmtopdir)) + pg_log_error("failed to remove output directory"); + } + else + { + pg_log_info("removing contents of output directory \"%s\"", + dir->target_path); + if (!rmtree(dir->target_path, dir->rmtopdir)) + pg_log_error("failed to remove contents of output directory"); + } + + cleanup_dir_list = cleanup_dir_list->next; + pfree(dir); + } +} + +/* + * Create the named output directory, unless it already exists or we're in + * dry-run mode. If it already exists but is not empty, that's a fatal error. + * + * Adds the created directory to the list of directories to be cleaned up + * at process exit. + */ +static void +create_output_directory(char *dirname, cb_options *opt) +{ + switch (pg_check_dir(dirname)) + { + case 0: + if (opt->dry_run) + { + pg_log_debug("would create directory \"%s\"", dirname); + return; + } + pg_log_debug("creating directory \"%s\"", dirname); + if (pg_mkdir_p(dirname, pg_dir_create_mode) == -1) + pg_fatal("could not create directory \"%s\": %m", dirname); + remember_to_cleanup_directory(dirname, true); + break; + + case 1: + pg_log_debug("using existing directory \"%s\"", dirname); + remember_to_cleanup_directory(dirname, false); + break; + + case 2: + case 3: + case 4: + pg_fatal("directory \"%s\" exists but is not empty", dirname); + + case -1: + pg_fatal("could not access directory \"%s\": %m", dirname); + } +} + +/* + * help + * + * Prints help page for the program + * + * progname: the name of the executed program, such as "pg_combinebackup" + */ +static void +help(const char *progname) +{ + printf(_("%s combines incremental backups.\n\n"), progname); + printf(_("Usage:\n")); + printf(_(" %s [OPTION]... 
DIRECTORY...\n"), progname); + printf(_("\nOptions:\n")); + printf(_(" -d, --debug generate lots of debugging output\n")); + printf(_(" -o, --output output directory\n")); + printf(_(" -n, --dry-run don't actually do anything\n")); + printf(_(" -N, --no-sync do not wait for changes to be written safely to disk\n")); + printf(_(" -P, --progress show progress information\n")); + printf(_(" -T, --tablespace-mapping=OLDDIR=NEWDIR\n")); + printf(_(" relocate tablespace in OLDDIR to NEWDIR\n")); + printf(_(" --manifest-checksums=SHA{224,256,384,512}|CRC32C|NONE\n" + " use algorithm for manifest checksums\n")); + printf(_(" --no-manifest suppress generation of backup manifest\n")); + printf(_(" -?, --help show this help, then exit\n")); + + printf(_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT); + printf(_("%s home page: <%s>\n"), PACKAGE_NAME, PACKAGE_URL); +} + +/* + * Try to parse a string as a non-zero OID without leading zeroes. + * + * If it works, return true and set *result to the answer, else return false. + */ +static bool +parse_oid(char *s, Oid *result) +{ + Oid oid; + char *ep; + + errno = 0; + oid = strtoul(s, &ep, 10); + if (errno != 0 || *ep != '\0' || oid < 1 || oid > PG_UINT32_MAX) + return false; + + *result = oid; + return true; +} + +/* + * Copy files from the input directory to the output directory, reconstructing + * full files from incremental files as required. + * + * If processing is a user-defined tablespace, the tsoid should be the OID + * of that tablespace and input_directory and output_directory should be the + * toplevel input and output directories for that tablespace. Otherwise, + * tsoid should be InvalidOid and input_directory and output_directory should + * be the main input and output directories. + * + * relative_path is the path beneath the given input and output directories + * that we are currently processing. If NULL, it indicates that we're + * processing the input and output directories themselves. 
+ * + * n_prior_backups is the number of prior backups that we have available. + * This doesn't count the very last backup, which is referenced by + * output_directory, just the older ones. prior_backup_dirs is an array of + * the locations of those previous backups. + */ +static void +process_directory_recursively(Oid tsoid, + char *input_directory, + char *output_directory, + char *relative_path, + int n_prior_backups, + char **prior_backup_dirs, + manifest_data **manifests, + manifest_writer *mwriter, + cb_options *opt) +{ + char ifulldir[MAXPGPATH]; + char ofulldir[MAXPGPATH]; + char manifest_prefix[MAXPGPATH]; + DIR *dir; + struct dirent *de; + bool is_pg_tblspc; + bool is_pg_wal; + manifest_data *latest_manifest = manifests[n_prior_backups]; + pg_checksum_type checksum_type; + + StaticAssertStmt(strlen(INCREMENTAL_PREFIX) == INCREMENTAL_PREFIX_LENGTH, + "INCREMENTAL_PREFIX_LENGTH is incorrect"); + + /* + * pg_tblspc and pg_wal are special cases, so detect those here. + * + * pg_tblspc is only special at the top level, but subdirectories of + * pg_wal are just as special as the top level directory. + * + * Since incremental backup does not exist in pre-v10 versions, we don't + * have to worry about the old pg_xlog naming. + */ + is_pg_tblspc = !OidIsValid(tsoid) && relative_path != NULL && + strcmp(relative_path, "pg_tblspc") == 0; + is_pg_wal = !OidIsValid(tsoid) && relative_path != NULL && + (strcmp(relative_path, "pg_wal") == 0 || + strncmp(relative_path, "pg_wal/", 7) == 0); + + /* + * If we're under pg_wal, then we don't need checksums, because these + * files aren't included in the backup manifest. Otherwise use whatever + * type of checksum is configured. 
+ */ + if (!is_pg_wal) + checksum_type = opt->manifest_checksums; + else + checksum_type = CHECKSUM_TYPE_NONE; + + /* + * Append the relative path to the input and output directories, and + * figure out the appropriate prefix to add to files in this directory + * when looking them up in a backup manifest. + */ + if (relative_path == NULL) + { + strncpy(ifulldir, input_directory, MAXPGPATH); + strncpy(ofulldir, output_directory, MAXPGPATH); + if (OidIsValid(tsoid)) + snprintf(manifest_prefix, MAXPGPATH, "pg_tblspc/%u/", tsoid); + else + manifest_prefix[0] = '\0'; + } + else + { + snprintf(ifulldir, MAXPGPATH, "%s/%s", input_directory, + relative_path); + snprintf(ofulldir, MAXPGPATH, "%s/%s", output_directory, + relative_path); + if (OidIsValid(tsoid)) + snprintf(manifest_prefix, MAXPGPATH, "pg_tblspc/%u/%s/", + tsoid, relative_path); + else + snprintf(manifest_prefix, MAXPGPATH, "%s/", relative_path); + } + + /* + * Toplevel output directories have already been created by the time this + * function is called, but any subdirectories are our responsibility. + */ + if (relative_path != NULL) + { + if (opt->dry_run) + pg_log_debug("would create directory \"%s\"", ofulldir); + else + { + pg_log_debug("creating directory \"%s\"", ofulldir); + if (mkdir(ofulldir, pg_dir_create_mode) == -1) + pg_fatal("could not create directory \"%s\": %m", ofulldir); + } + } + + /* It's time to scan the directory. */ + if ((dir = opendir(ifulldir)) == NULL) + pg_fatal("could not open directory \"%s\": %m", ifulldir); + while (errno = 0, (de = readdir(dir)) != NULL) + { + PGFileType type; + char ifullpath[MAXPGPATH]; + char ofullpath[MAXPGPATH]; + char manifest_path[MAXPGPATH]; + Oid oid = InvalidOid; + int checksum_length = 0; + uint8 *checksum_payload = NULL; + pg_checksum_context checksum_ctx; + + /* Ignore "." and ".." entries. */ + if (strcmp(de->d_name, ".") == 0 || + strcmp(de->d_name, "..") == 0) + continue; + + /* Construct input path. 
*/ + snprintf(ifullpath, MAXPGPATH, "%s/%s", ifulldir, de->d_name); + + /* Figure out what kind of directory entry this is. */ + type = get_dirent_type(ifullpath, de, false, PG_LOG_ERROR); + if (type == PGFILETYPE_ERROR) + exit(1); + + /* + * If we're processing pg_tblspc, then check whether the filename + * looks like it could be a tablespace OID. If so, and if the + * directory entry is a symbolic link or a directory, skip it. + * + * Our goal here is to ignore anything that would have been considered + * by scan_for_existing_tablespaces to be a tablespace. + */ + if (is_pg_tblspc && parse_oid(de->d_name, &oid) && + (type == PGFILETYPE_LNK || type == PGFILETYPE_DIR)) + continue; + + /* If it's a directory, recurse. */ + if (type == PGFILETYPE_DIR) + { + char new_relative_path[MAXPGPATH]; + + /* Append new pathname component to relative path. */ + if (relative_path == NULL) + strncpy(new_relative_path, de->d_name, MAXPGPATH); + else + snprintf(new_relative_path, MAXPGPATH, "%s/%s", relative_path, + de->d_name); + + /* And recurse. */ + process_directory_recursively(tsoid, + input_directory, output_directory, + new_relative_path, + n_prior_backups, prior_backup_dirs, + manifests, mwriter, opt); + continue; + } + + /* Skip anything that's not a regular file. */ + if (type != PGFILETYPE_REG) + { + if (type == PGFILETYPE_LNK) + pg_log_warning("skipping symbolic link \"%s\"", ifullpath); + else + pg_log_warning("skipping special file \"%s\"", ifullpath); + continue; + } + + /* + * Skip the backup_label and backup_manifest files; they require + * special handling and are handled elsewhere. + */ + if (relative_path == NULL && + (strcmp(de->d_name, "backup_label") == 0 || + strcmp(de->d_name, "backup_manifest") == 0)) + continue; + + /* + * If it's an incremental file, hand it off to the reconstruction + * code, which will figure out what to do. 
+ */ + if (strncmp(de->d_name, INCREMENTAL_PREFIX, + INCREMENTAL_PREFIX_LENGTH) == 0) + { + /* Output path should not include "INCREMENTAL." prefix. */ + snprintf(ofullpath, MAXPGPATH, "%s/%s", ofulldir, + de->d_name + INCREMENTAL_PREFIX_LENGTH); + + + /* Manifest path likewise omits incremental prefix. */ + snprintf(manifest_path, MAXPGPATH, "%s%s", manifest_prefix, + de->d_name + INCREMENTAL_PREFIX_LENGTH); + + /* Reconstruction logic will do the rest. */ + reconstruct_from_incremental_file(ifullpath, ofullpath, + relative_path, + de->d_name + INCREMENTAL_PREFIX_LENGTH, + n_prior_backups, + prior_backup_dirs, + manifests, + manifest_path, + checksum_type, + &checksum_length, + &checksum_payload, + opt->dry_run); + } + else + { + /* Construct the path that the backup_manifest will use. */ + snprintf(manifest_path, MAXPGPATH, "%s%s", manifest_prefix, + de->d_name); + + /* + * It's not an incremental file, so we need to copy the entire + * file to the output directory. + * + * If a checksum of the required type already exists in the + * backup_manifest for the final input directory, we can save some + * work by reusing that checksum instead of computing a new one. + */ + if (checksum_type != CHECKSUM_TYPE_NONE && + latest_manifest != NULL) + { + manifest_file *mfile; + + mfile = manifest_files_lookup(latest_manifest->files, + manifest_path); + if (mfile == NULL) + { + /* + * The directory is out of sync with the backup_manifest, + * so emit a warning. + */ + pg_log_warning("\"%s/backup_manifest\" contains no entry for \"%s\"", + input_directory, manifest_path); + } + else if (mfile->checksum_type == checksum_type) + { + checksum_length = mfile->checksum_length; + checksum_payload = mfile->checksum_payload; + } + } + + /* + * If we're reusing a checksum, then we don't need copy_file() to + * compute one for us, but otherwise, it needs to compute whatever + * type of checksum we need. 
+ */ + if (checksum_length != 0) + pg_checksum_init(&checksum_ctx, CHECKSUM_TYPE_NONE); + else + pg_checksum_init(&checksum_ctx, checksum_type); + + /* Actually copy the file. */ + snprintf(ofullpath, MAXPGPATH, "%s/%s", ofulldir, de->d_name); + copy_file(ifullpath, ofullpath, &checksum_ctx, opt->dry_run); + + /* + * If copy_file() performed a checksum calculation for us, then + * save the results (except in dry-run mode, when there's no + * point). + */ + if (checksum_ctx.type != CHECKSUM_TYPE_NONE && !opt->dry_run) + { + checksum_payload = pg_malloc(PG_CHECKSUM_MAX_LENGTH); + checksum_length = pg_checksum_final(&checksum_ctx, + checksum_payload); + } + } + + /* Generate manifest entry, if needed. */ + if (mwriter != NULL) + { + struct stat sb; + + /* + * In order to generate a manifest entry, we need the file size + * and mtime. We have no way to know the correct mtime except to + * stat() the file, so just do that and get the size as well. + * + * If we didn't need the mtime here, we could try to obtain the + * file size from the reconstruction or file copy process above, + * although that is actually not convenient in all cases. If we + * write the file ourselves then clearly we can keep a count of + * bytes, but if we use something like CopyFile() then it's + * trickier. Since we have to stat() anyway to get the mtime, + * there's no point in worrying about it. + */ + if (stat(ofullpath, &sb) < 0) + pg_fatal("could not stat file \"%s\": %m", ofullpath); + + /* OK, now do the work. */ + add_file_to_manifest(mwriter, manifest_path, + sb.st_size, sb.st_mtime, + checksum_type, checksum_length, + checksum_payload); + } + + /* Avoid leaking memory. */ + if (checksum_payload != NULL) + pfree(checksum_payload); + } + + closedir(dir); +} + +/* + * Read the version number from PG_VERSION and convert it to the usual server + * version number format. (e.g. 
If PG_VERSION contains "14\n" this function + * will return 140000) + */ +static int +read_pg_version_file(char *directory) +{ + char filename[MAXPGPATH]; + StringInfoData buf; + int fd; + int version; + char *ep; + + /* Construct pathname. */ + snprintf(filename, MAXPGPATH, "%s/PG_VERSION", directory); + + /* Open file. */ + if ((fd = open(filename, O_RDONLY, 0)) < 0) + pg_fatal("could not open file \"%s\": %m", filename); + + /* Read into memory. Length limit of 128 should be more than generous. */ + initStringInfo(&buf); + slurp_file(fd, filename, &buf, 128); + + /* Close the file. */ + if (close(fd) != 0) + pg_fatal("could not close \"%s\": %m", filename); + + /* Convert to integer. */ + errno = 0; + version = strtoul(buf.data, &ep, 10); + if (errno != 0 || *ep != '\n') + { + /* + * Incremental backup is not relevant to very old server versions that + * used multi-part version number (e.g. 9.6, or 8.4). So if we see + * what looks like the beginning of such a version number, just bail + * out. + */ + if (version < 10 && *ep == '.') + pg_fatal("%s: server version too old\n", filename); + pg_fatal("%s: could not parse version number\n", filename); + } + + /* Debugging output. */ + pg_log_debug("read server version %d from \"%s\"", version, filename); + + /* Release memory and return result. */ + pfree(buf.data); + return version * 10000; +} + +/* + * Add a directory to the list of output directories to clean up. + */ +static void +remember_to_cleanup_directory(char *target_path, bool rmtopdir) +{ + cb_cleanup_dir *dir = pg_malloc(sizeof(cb_cleanup_dir)); + + dir->target_path = target_path; + dir->rmtopdir = rmtopdir; + dir->next = cleanup_dir_list; + cleanup_dir_list = dir; +} + +/* + * Empty out the list of directories scheduled for cleanup a exit. + * + * We want to remove the output directories only on a failure, so call this + * function when we know that the operation has succeeded. 
+ * + * Since we only expect this to be called when we're about to exit, we could + * just set cleanup_dir_list to NULL and be done with it, but we free the + * memory to be tidy. + */ +static void +reset_directory_cleanup_list(void) +{ + while (cleanup_dir_list != NULL) + { + cb_cleanup_dir *dir = cleanup_dir_list; + + cleanup_dir_list = cleanup_dir_list->next; + pfree(dir); + } +} + +/* + * Scan the pg_tblspc directory of the final input backup to get a canonical + * list of what tablespaces are part of the backup. + * + * 'pathname' should be the path to the toplevel backup directory for the + * final backup in the backup chain. + */ +static cb_tablespace * +scan_for_existing_tablespaces(char *pathname, cb_options *opt) +{ + char pg_tblspc[MAXPGPATH]; + DIR *dir; + struct dirent *de; + cb_tablespace *tslist = NULL; + + snprintf(pg_tblspc, MAXPGPATH, "%s/pg_tblspc", pathname); + pg_log_debug("scanning \"%s\"", pg_tblspc); + + if ((dir = opendir(pg_tblspc)) == NULL) + pg_fatal("could not open directory \"%s\": %m", pathname); + + while (errno = 0, (de = readdir(dir)) != NULL) + { + Oid oid; + char tblspcdir[MAXPGPATH]; + char link_target[MAXPGPATH]; + int link_length; + cb_tablespace *ts; + cb_tablespace *otherts; + PGFileType type; + + /* Silently ignore "." and ".." entries. */ + if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0) + continue; + + /* Construct full pathname. */ + snprintf(tblspcdir, MAXPGPATH, "%s/%s", pg_tblspc, de->d_name); + + /* Ignore any file name that doesn't look like a proper OID. */ + if (!parse_oid(de->d_name, &oid)) + { + pg_log_debug("skipping \"%s\" because the filename is not a legal tablespace OID", + tblspcdir); + continue; + } + + /* Only symbolic links and directories are tablespaces. 
*/ + type = get_dirent_type(tblspcdir, de, false, PG_LOG_ERROR); + if (type == PGFILETYPE_ERROR) + exit(1); + if (type != PGFILETYPE_LNK && type != PGFILETYPE_DIR) + { + pg_log_debug("skipping \"%s\" because it is neither a symbolic link nor a directory", + tblspcdir); + continue; + } + + /* Create a new tablespace object. */ + ts = pg_malloc0(sizeof(cb_tablespace)); + ts->oid = oid; + + /* + * If it's a link, it's not an in-place tablespace. Otherwise, it must + * be a directory, and thus an in-place tablespace. + */ + if (type == PGFILETYPE_LNK) + { + cb_tablespace_mapping *tsmap; + + /* Read the link target. */ + link_length = readlink(tblspcdir, link_target, sizeof(link_target)); + if (link_length < 0) + pg_fatal("could not read symbolic link \"%s\": %m", + tblspcdir); + if (link_length >= sizeof(link_target)) + pg_fatal("symbolic link \"%s\" is too long", tblspcdir); + link_target[link_length] = '\0'; + if (!is_absolute_path(link_target)) + pg_fatal("symbolic link \"%s\" is relative", tblspcdir); + + /* Caonicalize the link target. */ + canonicalize_path(link_target); + + /* + * Find the corresponding tablespace mapping and copy the relevant + * details into the new tablespace entry. + */ + for (tsmap = opt->tsmappings; tsmap != NULL; tsmap = tsmap->next) + { + if (strcmp(tsmap->old_dir, link_target) == 0) + { + strncpy(ts->old_dir, tsmap->old_dir, MAXPGPATH); + strncpy(ts->new_dir, tsmap->new_dir, MAXPGPATH); + ts->in_place = false; + break; + } + } + + /* Every non-in-place tablespace must be mapped. */ + if (tsmap == NULL) + pg_fatal("tablespace at \"%s\" has no tablespace mapping", + link_target); + } + else + { + /* + * For an in-place tablespace, there's no separate directory, so + * we just record the paths within the data directories. 
+ */ + snprintf(ts->old_dir, MAXPGPATH, "%s/%s", pg_tblspc, de->d_name); + snprintf(ts->new_dir, MAXPGPATH, "%s/pg_tblpc/%s", opt->output, + de->d_name); + ts->in_place = true; + } + + /* Tablespaces should not share a directory. */ + for (otherts = tslist; otherts != NULL; otherts = otherts->next) + if (strcmp(ts->new_dir, otherts->new_dir) == 0) + pg_fatal("tablespaces with OIDs %u and %u both point at \"%s\"", + otherts->oid, oid, ts->new_dir); + + /* Add this tablespace to the list. */ + ts->next = tslist; + tslist = ts; + } + + return tslist; +} + +/* + * Read a file into a StringInfo. + * + * fd is used for the actual file I/O, filename for error reporting purposes. + * A file longer than maxlen is a fatal error. + */ +static void +slurp_file(int fd, char *filename, StringInfo buf, int maxlen) +{ + struct stat st; + ssize_t rb; + + /* Check file size, and complain if it's too large. */ + if (fstat(fd, &st) != 0) + pg_fatal("could not stat \"%s\": %m", filename); + if (st.st_size > maxlen) + pg_fatal("file \"%s\" is too large", filename); + + /* Make sure we have enough space. */ + enlargeStringInfo(buf, st.st_size); + + /* Read the data. */ + rb = read(fd, &buf->data[buf->len], st.st_size); + + /* + * We don't expect any concurrent changes, so we should read exactly the + * expected number of bytes. 
+ */ + if (rb != st.st_size) + { + if (rb < 0) + pg_fatal("could not read file \"%s\": %m", filename); + else + pg_fatal("could not read file \"%s\": read only %d of %d bytes", + filename, (int) rb, (int) st.st_size); + } + + /* Adjust buffer length for new data and restore trailing-\0 invariant */ + buf->len += rb; + buf->data[buf->len] = '\0'; +} diff --git a/src/bin/pg_combinebackup/reconstruct.c b/src/bin/pg_combinebackup/reconstruct.c new file mode 100644 index 0000000000..c774bf1842 --- /dev/null +++ b/src/bin/pg_combinebackup/reconstruct.c @@ -0,0 +1,618 @@ +/*------------------------------------------------------------------------- + * + * reconstruct.c + * Reconstruct full file from incremental file and backup chain. + * + * Copyright (c) 2017-2023, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/bin/pg_combinebackup/reconstruct.c + * + *------------------------------------------------------------------------- + */ +#include "postgres_fe.h" + +#include + +#include "backup/basebackup_incremental.h" +#include "common/logging.h" +#include "common/file_perm.h" +#include "copy_file.h" +#include "reconstruct.h" +#include "storage/block.h" + +/* + * An rfile stores the data that we need in order to be able to use some file + * on disk for reconstruction. For any given output file, we create one rfile + * per backup that we need to consult when we constructing that output file. + * + * If we find a full version of the file in the backup chain, then only + * filename and fd are initialized; the remaining fields are 0 or NULL. + * For an incremental file, header_length, num_blocks, relative_block_numbers, + * and truncation_block_length are also set. + * + * num_blocks_read and highest_offset_read always start out as 0. 
+ */ +typedef struct rfile +{ + char *filename; + int fd; + size_t header_length; + unsigned num_blocks; + BlockNumber *relative_block_numbers; + unsigned truncation_block_length; + unsigned num_blocks_read; + off_t highest_offset_read; +} rfile; + +static void debug_reconstruction(int n_source, + rfile **sources, + bool dry_run); +static unsigned find_reconstructed_block_length(rfile *s); +static rfile *make_incremental_rfile(char *filename); +static rfile *make_rfile(char *filename, bool missing_ok); +static void write_reconstructed_file(char *input_filename, + char *output_filename, + unsigned block_length, + rfile **sourcemap, + off_t *offsetmap, + pg_checksum_context *checksum_ctx, + bool dry_run); +static void read_bytes(rfile *rf, void *buffer, unsigned length); + +/* + * Reconstruct a full file from an incremental file and a chain of prior + * backups. + * + * input_filename should be the path to the incremental file, and + * output_filename should be the path where the reconstructed file is to be + * written. + * + * relative_path should be the relative path to the directory containing this + * file. bare_file_name should be the name of the file within that directory, + * without "INCREMENTAL.". + * + * n_prior_backups is the number of prior backups, and prior_backup_dirs is + * an array of pathnames where those backups can be found. 
+ */ +void +reconstruct_from_incremental_file(char *input_filename, + char *output_filename, + char *relative_path, + char *bare_file_name, + int n_prior_backups, + char **prior_backup_dirs, + manifest_data **manifests, + char *manifest_path, + pg_checksum_type checksum_type, + int *checksum_length, + uint8 **checksum_payload, + bool dry_run) +{ + rfile **source; + rfile *latest_source = NULL; + rfile **sourcemap; + off_t *offsetmap; + unsigned block_length; + unsigned num_missing_blocks; + unsigned i; + unsigned sidx = n_prior_backups; + bool full_copy_possible = true; + int copy_source_index = -1; + rfile *copy_source = NULL; + pg_checksum_context checksum_ctx; + + /* + * Every block must come either from the latest version of the file or + * from one of the prior backups. + */ + source = pg_malloc0(sizeof(rfile *) * (1 + n_prior_backups)); + + /* + * Use the information from the latest incremental file to figure out how + * long the reconstructed file should be. + */ + latest_source = make_incremental_rfile(input_filename); + source[n_prior_backups] = latest_source; + block_length = find_reconstructed_block_length(latest_source); + + /* + * For each block in the output file, we need to know from which file we + * need to obtain it and at what offset in that file it's stored. + * sourcemap gives us the first of these things, and offsetmap the latter. + */ + sourcemap = pg_malloc0(sizeof(rfile *) * block_length); + offsetmap = pg_malloc0(sizeof(off_t) * block_length); + + /* + * Blocks prior to the truncation_block_length threshold must be obtained + * from some prior backup, while those after that threshold are left as + * zeroes if not present in the newest incremental file. + * num_missing_blocks counts the number of blocks that we must be found + * somewhere in the backup chain, and is thus initially equal to + * truncation_block_length. 
+ */ + num_missing_blocks = latest_source->truncation_block_length; + + /* + * Every block that is present in the newest incremental file should be + * sourced from that file. If it precedes the truncation_block_length, + * it's a block that we would otherwise have had to find in an older + * backup and thus reduces the number of blocks remaining to be found by + * one; otherwise, it's an extra block that needs to be included in the + * output but would not have needed to be found in an older backup if it + * had not been present. + */ + for (i = 0; i < latest_source->num_blocks; ++i) + { + BlockNumber b = latest_source->relative_block_numbers[i]; + + Assert(b < block_length); + sourcemap[b] = latest_source; + offsetmap[b] = latest_source->header_length + (i * BLCKSZ); + if (b < latest_source->truncation_block_length) + num_missing_blocks--; + + /* + * A full copy of a file from an earlier backup is only possible if no + * blocks are needed from any later incremental file. + */ + full_copy_possible = false; + } + + while (num_missing_blocks > 0) + { + char source_filename[MAXPGPATH]; + rfile *s; + + /* + * Move to the next backup in the chain. If there are no more, then + * something has gone wrong and reconstruction has failed. + */ + if (sidx == 0) + pg_fatal("reconstruction for file \"%s\" failed to find %u required blocks", + output_filename, num_missing_blocks); + --sidx; + + /* + * Look for the full file in the previous backup. If not found, then + * look for an incremental file instead. + */ + snprintf(source_filename, MAXPGPATH, "%s/%s/%s", + prior_backup_dirs[sidx], relative_path, bare_file_name); + if ((s = make_rfile(source_filename, true)) == NULL) + { + snprintf(source_filename, MAXPGPATH, "%s/%s/INCREMENTAL.%s", + prior_backup_dirs[sidx], relative_path, bare_file_name); + s = make_incremental_rfile(source_filename); + } + source[sidx] = s; + + /* + * If s->header_length == 0, then this is a full file; otherwise, it's + * an incremental file. 
+ */ + if (s->header_length != 0) + { + /* + * Since we found another incremental file, source all blocks from + * it that we need but don't yet have. + */ + for (i = 0; i < s->num_blocks; ++i) + { + BlockNumber b = s->relative_block_numbers[i]; + + if (b < latest_source->truncation_block_length && + sourcemap[b] == NULL) + { + sourcemap[b] = s; + offsetmap[b] = s->header_length + (i * BLCKSZ); + + Assert(num_missing_blocks > 0); + --num_missing_blocks; + + /* + * A full copy of a file from an earlier backup is only + * possible if no blocks are needed from any later + * incremental file. + */ + full_copy_possible = false; + } + } + } + else + { + BlockNumber b; + + /* + * Since we found a full file, source all remaining required + * blocks from it. + */ + for (b = 0; b < latest_source->truncation_block_length; ++b) + { + if (sourcemap[b] == NULL) + { + sourcemap[b] = s; + offsetmap[b] = b * BLCKSZ; + + Assert(num_missing_blocks > 0); + --num_missing_blocks; + } + } + Assert(num_missing_blocks == 0); + + /* + * If a full copy looks possible, check whether the resulting file + * should be exactly as long as the source file is. If so, a full + * copy is acceptable, otherwise not. + */ + if (full_copy_possible) + { + struct stat sb; + uint64 expected_length; + + if (fstat(s->fd, &sb) < 0) + pg_fatal("could not stat \"%s\": %m", s->filename); + expected_length = + (uint64) latest_source->truncation_block_length; + expected_length *= BLCKSZ; + if (expected_length == sb.st_size) + { + copy_source = s; + copy_source_index = sidx; + } + } + } + } + + /* + * If a checksum of the required type already exists in the + * backup_manifest for the relevant input directory, we can save some work + * by reusing that checksum instead of computing a new one. 
+ */ + if (copy_source_index >= 0 && manifests[copy_source_index] != NULL && + checksum_type != CHECKSUM_TYPE_NONE) + { + manifest_file *mfile; + + mfile = manifest_files_lookup(manifests[copy_source_index]->files, + manifest_path); + if (mfile == NULL) + { + /* + * The directory is out of sync with the backup_manifest, so emit + * a warning. + */ + pg_log_warning("\"%s/backup_manifest\" contains no entry for \"%s\"", + prior_backup_dirs[copy_source_index], + manifest_path); + } + else if (mfile->checksum_type == checksum_type) + { + *checksum_length = mfile->checksum_length; + *checksum_payload = pg_malloc(*checksum_length); + memcpy(*checksum_payload, mfile->checksum_payload, + *checksum_length); + checksum_type = CHECKSUM_TYPE_NONE; + } + } + + /* Prepare for checksum calculation, if required. */ + pg_checksum_init(&checksum_ctx, checksum_type); + + /* + * If the full file can be created by copying a file from an older backup + * in the chain without needing to overwrite any blocks or truncate the + * result, then forget about performing reconstruction and just copy that + * file in its entirety. + * + * Otherwise, reconstruct. + */ + if (copy_source != NULL) + copy_file(copy_source->filename, output_filename, + &checksum_ctx, dry_run); + else + { + write_reconstructed_file(input_filename, output_filename, + block_length, sourcemap, offsetmap, + &checksum_ctx, dry_run); + debug_reconstruction(n_prior_backups + 1, source, dry_run); + } + + /* Save results of checksum calculation. */ + if (checksum_type != CHECKSUM_TYPE_NONE) + { + *checksum_payload = pg_malloc(PG_CHECKSUM_MAX_LENGTH); + *checksum_length = pg_checksum_final(&checksum_ctx, + *checksum_payload); + } + + /* + * Close files and release memory. 
+ */ + for (i = 0; i <= n_prior_backups; ++i) + { + rfile *s = source[i]; + + if (s == NULL) + continue; + if (close(s->fd) != 0) + pg_fatal("could not close \"%s\": %m", s->filename); + if (s->relative_block_numbers != NULL) + pfree(s->relative_block_numbers); + pg_free(s->filename); + } + pfree(sourcemap); + pfree(offsetmap); + pfree(source); +} + +/* + * Perform post-reconstruction logging and sanity checks. + */ +static void +debug_reconstruction(int n_source, rfile **sources, bool dry_run) +{ + unsigned i; + + for (i = 0; i < n_source; ++i) + { + rfile *s = sources[i]; + + /* Ignore source if not used. */ + if (s == NULL) + continue; + + /* If no data is needed from this file, we can ignore it. */ + if (s->num_blocks_read == 0) + continue; + + /* Debug logging. */ + if (dry_run) + pg_log_debug("would have read %u blocks from \"%s\"", + s->num_blocks_read, s->filename); + else + pg_log_debug("read %u blocks from \"%s\"", + s->num_blocks_read, s->filename); + + /* + * In dry-run mode, we don't actually try to read data from the file, + * but we do try to verify that the file is long enough that we could + * have read the data if we'd tried. + * + * If this fails, then it means that a non-dry-run attempt would fail, + * complaining of not being able to read the required bytes from the + * file. + */ + if (dry_run) + { + struct stat sb; + + if (fstat(s->fd, &sb) < 0) + pg_fatal("could not stat \"%s\": %m", s->filename); + if (sb.st_size < s->highest_offset_read) + pg_fatal("file \"%s\" is too short: expected %llu, found %llu", + s->filename, + (unsigned long long) s->highest_offset_read, + (unsigned long long) sb.st_size); + } + } +} + +/* + * When we perform reconstruction using an incremental file, the output file + * should be at least as long as the truncation_block_length. Any blocks + * present in the incremental file increase the output length as far as is + * necessary to include those blocks. 
+ */ +static unsigned +find_reconstructed_block_length(rfile *s) +{ + unsigned block_length = s->truncation_block_length; + unsigned i; + + for (i = 0; i < s->num_blocks; ++i) + if (s->relative_block_numbers[i] >= block_length) + block_length = s->relative_block_numbers[i] + 1; + + return block_length; +} + +/* + * Initialize an incremental rfile, reading the header so that we know which + * blocks it contains. + */ +static rfile * +make_incremental_rfile(char *filename) +{ + rfile *rf; + unsigned magic; + + rf = make_rfile(filename, false); + + /* Read and validate magic number. */ + read_bytes(rf, &magic, sizeof(magic)); + if (magic != INCREMENTAL_MAGIC) + pg_fatal("file \"%s\" has bad incremental magic number (0x%x not 0x%x)", + filename, magic, INCREMENTAL_MAGIC); + + /* Read block count. */ + read_bytes(rf, &rf->num_blocks, sizeof(rf->num_blocks)); + if (rf->num_blocks > RELSEG_SIZE) + pg_fatal("file \"%s\" has block count %u in excess of segment size %u", + filename, rf->num_blocks, RELSEG_SIZE); + + /* Read truncation block length. */ + read_bytes(rf, &rf->truncation_block_length, + sizeof(rf->truncation_block_length)); + if (rf->truncation_block_length > RELSEG_SIZE) + pg_fatal("file \"%s\" has truncation block length %u in excess of segment size %u", + filename, rf->truncation_block_length, RELSEG_SIZE); + + /* Read block numbers if there are any. */ + if (rf->num_blocks > 0) + { + rf->relative_block_numbers = + pg_malloc0(sizeof(BlockNumber) * rf->num_blocks); + read_bytes(rf, rf->relative_block_numbers, + sizeof(BlockNumber) * rf->num_blocks); + } + + /* Remember length of header. */ + rf->header_length = sizeof(magic) + sizeof(rf->num_blocks) + + sizeof(rf->truncation_block_length) + + sizeof(BlockNumber) * rf->num_blocks; + + return rf; +} + +/* + * Allocate and perform basic initialization of an rfile. 
+ */
+static rfile *
+make_rfile(char *filename, bool missing_ok)
+{
+	rfile	   *rf;
+
+	rf = pg_malloc0(sizeof(rfile));
+	rf->filename = pstrdup(filename);
+	if ((rf->fd = open(filename, O_RDONLY | PG_BINARY, 0)) < 0)
+	{
+		if (missing_ok && errno == ENOENT)
+		{
+			pg_free(rf);
+			return NULL;
+		}
+		pg_fatal("could not open file \"%s\": %m", filename);
+	}
+
+	return rf;
+}
+
+/*
+ * Read the indicated number of bytes from an rfile into the buffer.
+ */
+static void
+read_bytes(rfile *rf, void *buffer, unsigned length)
+{
+	/*
+	 * read() returns ssize_t; store the result in a signed variable so that
+	 * the error check below (rb < 0) is actually reachable.  With an
+	 * unsigned type, a -1 return would wrap and the %m branch could never
+	 * be taken.
+	 */
+	int			rb = read(rf->fd, buffer, length);
+
+	if (rb != length)
+	{
+		if (rb < 0)
+			pg_fatal("could not read file \"%s\": %m", rf->filename);
+		else
+			pg_fatal("could not read file \"%s\": read only %d of %d bytes",
+					 rf->filename, (int) rb, length);
+	}
+}
+
+/*
+ * Write out a reconstructed file.
+ */
+static void
+write_reconstructed_file(char *input_filename,
+						 char *output_filename,
+						 unsigned block_length,
+						 rfile **sourcemap,
+						 off_t *offsetmap,
+						 pg_checksum_context *checksum_ctx,
+						 bool dry_run)
+{
+	int			wfd = -1;
+	unsigned	i;
+	unsigned	zero_blocks = 0;
+
+	/* Debugging output. */
+	if (dry_run)
+		pg_log_debug("would reconstruct \"%s\" (%u blocks, checksum %s)",
+					 output_filename, block_length,
+					 pg_checksum_type_name(checksum_ctx->type));
+	else
+		pg_log_debug("reconstructing \"%s\" (%u blocks, checksum %s)",
+					 output_filename, block_length,
+					 pg_checksum_type_name(checksum_ctx->type));
+
+	/* Open the output file, except in dry_run mode. */
+	if (!dry_run &&
+		(wfd = open(output_filename,
+					O_RDWR | PG_BINARY | O_CREAT | O_EXCL,
+					pg_file_create_mode)) < 0)
+		pg_fatal("could not open file \"%s\": %m", output_filename);
+
+	/* Read and write the blocks as required. */
+	for (i = 0; i < block_length; ++i)
+	{
+		uint8		buffer[BLCKSZ];
+		rfile	   *s = sourcemap[i];
+		int			wb;	/* signed: write() returns ssize_t, -1 on error */
+
+		/* Update accounting information.
*/ + if (s == NULL) + ++zero_blocks; + else + { + s->num_blocks_read++; + s->highest_offset_read = Max(s->highest_offset_read, + offsetmap[i] + BLCKSZ); + } + + /* Skip the rest of this in dry-run mode. */ + if (dry_run) + continue; + + /* Read or zero-fill the block as appropriate. */ + if (s == NULL) + { + /* + * New block not mentioned in the WAL summary. Should have been an + * uninitialized block, so just zero-fill it. + */ + memset(buffer, 0, BLCKSZ); + } + else + { + unsigned rb; + + /* Read the block from the correct source, except if dry-run. */ + rb = pg_pread(s->fd, buffer, BLCKSZ, offsetmap[i]); + if (rb != BLCKSZ) + { + if (rb < 0) + pg_fatal("could not read file \"%s\": %m", s->filename); + else + pg_fatal("could not read file \"%s\": read only %d of %d bytes at offset %u", + s->filename, (int) rb, BLCKSZ, + (unsigned) offsetmap[i]); + } + } + + /* Write out the block. */ + if ((wb = write(wfd, buffer, BLCKSZ)) != BLCKSZ) + { + if (wb < 0) + pg_fatal("could not write file \"%s\": %m", output_filename); + else + pg_fatal("could not write file \"%s\": wrote only %d of %d bytes", + output_filename, (int) wb, BLCKSZ); + } + + /* Update the checksum computation. */ + if (pg_checksum_update(checksum_ctx, buffer, BLCKSZ) < 0) + pg_fatal("could not update checksum of file \"%s\"", + output_filename); + } + + /* Debugging output. */ + if (zero_blocks > 0) + { + if (dry_run) + pg_log_debug("would have zero-filled %u blocks", zero_blocks); + else + pg_log_debug("zero-filled %u blocks", zero_blocks); + } + + /* Close the output file. 
*/ + if (wfd >= 0 && close(wfd) != 0) + pg_fatal("could not close \"%s\": %m", output_filename); +} diff --git a/src/bin/pg_combinebackup/reconstruct.h b/src/bin/pg_combinebackup/reconstruct.h new file mode 100644 index 0000000000..c599a70d42 --- /dev/null +++ b/src/bin/pg_combinebackup/reconstruct.h @@ -0,0 +1,32 @@ +/*------------------------------------------------------------------------- + * + * reconstruct.h + * Reconstruct full file from incremental file and backup chain. + * + * Copyright (c) 2017-2023, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/bin/pg_combinebackup/reconstruct.h + * + *------------------------------------------------------------------------- + */ +#ifndef RECONSTRUCT_H +#define RECONSTRUCT_H + +#include "common/checksum_helper.h" +#include "load_manifest.h" + +extern void reconstruct_from_incremental_file(char *input_filename, + char *output_filename, + char *relative_path, + char *bare_file_name, + int n_prior_backups, + char **prior_backup_dirs, + manifest_data **manifests, + char *manifest_path, + pg_checksum_type checksum_type, + int *checksum_length, + uint8 **checksum_payload, + bool dry_run); + +#endif diff --git a/src/bin/pg_combinebackup/write_manifest.c b/src/bin/pg_combinebackup/write_manifest.c new file mode 100644 index 0000000000..82160134d8 --- /dev/null +++ b/src/bin/pg_combinebackup/write_manifest.c @@ -0,0 +1,293 @@ +/*------------------------------------------------------------------------- + * + * Write a new backup manifest. 
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/bin/pg_combinebackup/write_manifest.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres_fe.h"
+
+/*
+ * System headers: open() needs <fcntl.h>, gmtime()/strftime() need
+ * <time.h>, and write()/close() need <unistd.h>.  (The header names were
+ * missing here; restored — verify against the build.)
+ */
+#include <fcntl.h>
+#include <time.h>
+#include <unistd.h>
+
+#include "common/checksum_helper.h"
+#include "common/file_perm.h"
+#include "common/logging.h"
+#include "lib/stringinfo.h"
+#include "load_manifest.h"
+#include "mb/pg_wchar.h"
+#include "write_manifest.h"
+
+struct manifest_writer
+{
+	char		pathname[MAXPGPATH];	/* path of backup_manifest file */
+	int			fd;				/* -1 until first flush creates the file */
+	StringInfoData buf;			/* accumulated, not-yet-flushed JSON text */
+	bool		first_file;		/* no file entry emitted yet? */
+	bool		still_checksumming; /* flushed bytes still feed manifest_ctx */
+	pg_checksum_context manifest_ctx;	/* running SHA-256 of manifest body */
+};
+
+static void escape_json(StringInfo buf, const char *str);
+static void flush_manifest(manifest_writer *mwriter);
+static size_t hex_encode(const uint8 *src, size_t len, char *dst);
+
+/*
+ * Create a new backup manifest writer.
+ *
+ * The backup manifest will be written into a file named backup_manifest
+ * in the specified directory.
+ */
+manifest_writer *
+create_manifest_writer(char *directory)
+{
+	manifest_writer *mwriter = pg_malloc(sizeof(manifest_writer));
+
+	snprintf(mwriter->pathname, MAXPGPATH, "%s/backup_manifest", directory);
+	mwriter->fd = -1;
+	initStringInfo(&mwriter->buf);
+	mwriter->first_file = true;
+	mwriter->still_checksumming = true;
+	pg_checksum_init(&mwriter->manifest_ctx, CHECKSUM_TYPE_SHA256);
+
+	appendStringInfo(&mwriter->buf,
+					 "{ \"PostgreSQL-Backup-Manifest-Version\": 1,\n"
+					 "\"Files\": [");
+
+	return mwriter;
+}
+
+/*
+ * Add an entry for a file to a backup manifest.
+ *
+ * This is very similar to the backend's AddFileToBackupManifest, but
+ * various adjustments are required due to frontend/backend differences
+ * and other details.
+ */ +void +add_file_to_manifest(manifest_writer *mwriter, const char *manifest_path, + size_t size, pg_time_t mtime, + pg_checksum_type checksum_type, + int checksum_length, + uint8 *checksum_payload) +{ + int pathlen = strlen(manifest_path); + + if (mwriter->first_file) + { + appendStringInfoChar(&mwriter->buf, '\n'); + mwriter->first_file = false; + } + else + appendStringInfoString(&mwriter->buf, ",\n"); + + if (pg_encoding_verifymbstr(PG_UTF8, manifest_path, pathlen) == pathlen) + { + appendStringInfoString(&mwriter->buf, "{ \"Path\": "); + escape_json(&mwriter->buf, manifest_path); + appendStringInfoString(&mwriter->buf, ", "); + } + else + { + appendStringInfoString(&mwriter->buf, "{ \"Encoded-Path\": \""); + enlargeStringInfo(&mwriter->buf, 2 * pathlen); + mwriter->buf.len += hex_encode((const uint8 *) manifest_path, pathlen, + &mwriter->buf.data[mwriter->buf.len]); + appendStringInfoString(&mwriter->buf, "\", "); + } + + appendStringInfo(&mwriter->buf, "\"Size\": %zu, ", size); + + appendStringInfoString(&mwriter->buf, "\"Last-Modified\": \""); + enlargeStringInfo(&mwriter->buf, 128); + mwriter->buf.len += strftime(&mwriter->buf.data[mwriter->buf.len], 128, + "%Y-%m-%d %H:%M:%S %Z", + gmtime(&mtime)); + appendStringInfoChar(&mwriter->buf, '"'); + + if (mwriter->buf.len > 128 * 1024) + flush_manifest(mwriter); + + if (checksum_length > 0) + { + appendStringInfo(&mwriter->buf, + ", \"Checksum-Algorithm\": \"%s\", \"Checksum\": \"", + pg_checksum_type_name(checksum_type)); + + enlargeStringInfo(&mwriter->buf, 2 * checksum_length); + mwriter->buf.len += hex_encode(checksum_payload, checksum_length, + &mwriter->buf.data[mwriter->buf.len]); + + appendStringInfoChar(&mwriter->buf, '"'); + } + + appendStringInfoString(&mwriter->buf, " }"); + + if (mwriter->buf.len > 128 * 1024) + flush_manifest(mwriter); +} + +/* + * Finalize the backup_manifest. 
+ */ +void +finalize_manifest(manifest_writer *mwriter, + manifest_wal_range *first_wal_range) +{ + uint8 checksumbuf[PG_SHA256_DIGEST_LENGTH]; + int len; + manifest_wal_range *wal_range; + + /* Terminate the list of files. */ + appendStringInfoString(&mwriter->buf, "\n],\n"); + + /* Start a list of LSN ranges. */ + appendStringInfoString(&mwriter->buf, "\"WAL-Ranges\": [\n"); + + for (wal_range = first_wal_range; wal_range != NULL; + wal_range = wal_range->next) + appendStringInfo(&mwriter->buf, + "%s{ \"Timeline\": %u, \"Start-LSN\": \"%X/%X\", \"End-LSN\": \"%X/%X\" }", + wal_range == first_wal_range ? "" : ",\n", + wal_range->tli, + LSN_FORMAT_ARGS(wal_range->start_lsn), + LSN_FORMAT_ARGS(wal_range->end_lsn)); + + /* Terminate the list of WAL ranges. */ + appendStringInfoString(&mwriter->buf, "\n],\n"); + + /* Flush accumulated data and update checksum calculation. */ + flush_manifest(mwriter); + + /* Checksum only includes data up to this point. */ + mwriter->still_checksumming = false; + + /* Compute and insert manifest checksum. */ + appendStringInfoString(&mwriter->buf, "\"Manifest-Checksum\": \""); + enlargeStringInfo(&mwriter->buf, 2 * PG_SHA256_DIGEST_STRING_LENGTH); + len = pg_checksum_final(&mwriter->manifest_ctx, checksumbuf); + Assert(len == PG_SHA256_DIGEST_LENGTH); + mwriter->buf.len += + hex_encode(checksumbuf, len, &mwriter->buf.data[mwriter->buf.len]); + appendStringInfoString(&mwriter->buf, "\"}\n"); + + /* Flush the last manifest checksum itself. */ + flush_manifest(mwriter); + + /* Close the file. */ + if (close(mwriter->fd) != 0) + pg_fatal("could not close \"%s\": %m", mwriter->pathname); + mwriter->fd = -1; +} + +/* + * Produce a JSON string literal, properly escaping characters in the text. 
+ */
+static void
+escape_json(StringInfo buf, const char *str)
+{
+	const char *p;
+
+	appendStringInfoCharMacro(buf, '"');
+	for (p = str; *p; p++)
+	{
+		switch (*p)
+		{
+			case '\b':
+				appendStringInfoString(buf, "\\b");
+				break;
+			case '\f':
+				appendStringInfoString(buf, "\\f");
+				break;
+			case '\n':
+				appendStringInfoString(buf, "\\n");
+				break;
+			case '\r':
+				appendStringInfoString(buf, "\\r");
+				break;
+			case '\t':
+				appendStringInfoString(buf, "\\t");
+				break;
+			case '"':
+				appendStringInfoString(buf, "\\\"");
+				break;
+			case '\\':
+				appendStringInfoString(buf, "\\\\");
+				break;
+			default:
+				if ((unsigned char) *p < ' ')
+					appendStringInfo(buf, "\\u%04x", (int) *p);
+				else
+					appendStringInfoCharMacro(buf, *p);
+				break;
+		}
+	}
+	appendStringInfoCharMacro(buf, '"');
+}
+
+/*
+ * Flush whatever portion of the backup manifest we have generated and
+ * buffered in memory out to a file on disk.
+ *
+ * The first call to this function will create the file. After that, we
+ * keep it open and just append more data.
+ */
+static void
+flush_manifest(manifest_writer *mwriter)
+{
+	if (mwriter->fd == -1 &&
+		(mwriter->fd = open(mwriter->pathname,
+							O_WRONLY | O_CREAT | O_EXCL | PG_BINARY,
+							pg_file_create_mode)) < 0)
+		pg_fatal("could not open file \"%s\": %m", mwriter->pathname);
+
+	if (mwriter->buf.len > 0)
+	{
+		ssize_t		wb;
+
+		wb = write(mwriter->fd, mwriter->buf.data, mwriter->buf.len);
+		if (wb != mwriter->buf.len)
+		{
+			if (wb < 0)
+				pg_fatal("could not write \"%s\": %m", mwriter->pathname);
+			else
+
+				/*
+				 * Report mwriter->pathname here; the previous code printed a
+				 * local "pathname" array that was never initialized.
+				 */
+				pg_fatal("could not write file \"%s\": wrote only %d of %d bytes",
+						 mwriter->pathname, (int) wb, mwriter->buf.len);
+		}
+
+		if (mwriter->still_checksumming)
+			pg_checksum_update(&mwriter->manifest_ctx,
+							   (uint8 *) mwriter->buf.data,
+							   mwriter->buf.len);
+		resetStringInfo(&mwriter->buf);
+	}
+}
+
+/*
+ * Encode bytes using two hexadecimal digits for each one.
+ */ +static size_t +hex_encode(const uint8 *src, size_t len, char *dst) +{ + const uint8 *end = src + len; + + while (src < end) + { + unsigned n1 = (*src >> 4) & 0xF; + unsigned n2 = *src & 0xF; + + *dst++ = n1 < 10 ? '0' + n1 : 'a' + n1 - 10; + *dst++ = n2 < 10 ? '0' + n2 : 'a' + n2 - 10; + ++src; + } + + return len * 2; +} diff --git a/src/bin/pg_combinebackup/write_manifest.h b/src/bin/pg_combinebackup/write_manifest.h new file mode 100644 index 0000000000..8fd7fe02c8 --- /dev/null +++ b/src/bin/pg_combinebackup/write_manifest.h @@ -0,0 +1,33 @@ +/*------------------------------------------------------------------------- + * + * Write a new backup manifest. + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/bin/pg_combinebackup/write_manifest.h + * + *------------------------------------------------------------------------- + */ +#ifndef WRITE_MANIFEST_H +#define WRITE_MANIFEST_H + +#include "common/checksum_helper.h" +#include "pgtime.h" + +struct manifest_wal_range; + +struct manifest_writer; +typedef struct manifest_writer manifest_writer; + +extern manifest_writer *create_manifest_writer(char *directory); +extern void add_file_to_manifest(manifest_writer *mwriter, + const char *manifest_path, + size_t size, pg_time_t mtime, + pg_checksum_type checksum_type, + int checksum_length, + uint8 *checksum_payload); +extern void finalize_manifest(manifest_writer *mwriter, + struct manifest_wal_range *first_wal_range); + +#endif /* WRITE_MANIFEST_H */ diff --git a/src/bin/pg_resetwal/pg_resetwal.c b/src/bin/pg_resetwal/pg_resetwal.c index e7ef2b8bd0..f35302e994 100644 --- a/src/bin/pg_resetwal/pg_resetwal.c +++ b/src/bin/pg_resetwal/pg_resetwal.c @@ -85,6 +85,7 @@ static void RewriteControlFile(void); static void FindEndOfXLOG(void); static void KillExistingXLOG(void); static void KillExistingArchiveStatus(void); +static void 
KillExistingWALSummaries(void);
 static void WriteEmptyXLOG(void);
 static void usage(void);
 
@@ -488,6 +489,7 @@ main(int argc, char *argv[])
 	RewriteControlFile();
 	KillExistingXLOG();
 	KillExistingArchiveStatus();
+	KillExistingWALSummaries();
 	WriteEmptyXLOG();
 
 	printf(_("Write-ahead log reset\n"));
@@ -1029,6 +1031,40 @@ KillExistingArchiveStatus(void)
 		pg_fatal("could not close directory \"%s\": %m", ARCHSTATDIR);
 }
 
+/*
+ * Remove existing WAL summary files
+ */
+static void
+KillExistingWALSummaries(void)
+{
+#define WALSUMMARYDIR XLOGDIR "/summaries"
+#define WALSUMMARY_NHEXCHARS 40
+
+	DIR		   *xldir;
+	struct dirent *xlde;
+	char		path[MAXPGPATH + sizeof(WALSUMMARYDIR)];
+
+	xldir = opendir(WALSUMMARYDIR);
+	if (xldir == NULL)
+		pg_fatal("could not open directory \"%s\": %m", WALSUMMARYDIR);
+
+	while (errno = 0, (xlde = readdir(xldir)) != NULL)
+	{
+		if (strspn(xlde->d_name, "0123456789ABCDEF") == WALSUMMARY_NHEXCHARS &&
+			strcmp(xlde->d_name + WALSUMMARY_NHEXCHARS, ".summary") == 0)
+		{
+			snprintf(path, sizeof(path), "%s/%s", WALSUMMARYDIR, xlde->d_name);
+			if (unlink(path) < 0)
+				pg_fatal("could not delete file \"%s\": %m", path);
+		}
+	}
+
+	if (errno)
+		pg_fatal("could not read directory \"%s\": %m", WALSUMMARYDIR);
+
+	/*
+	 * Report WALSUMMARYDIR here, not ARCHSTATDIR: the previous message was a
+	 * copy-paste from KillExistingArchiveStatus and named the wrong
+	 * directory.
+	 */
+	if (closedir(xldir))
+		pg_fatal("could not close directory \"%s\": %m", WALSUMMARYDIR);
+}
 
 /*
  * Write an empty XLOG file, containing only the checkpoint record
diff --git a/src/common/Makefile b/src/common/Makefile
index e4cd26762b..ef38cc2f03 100644
--- a/src/common/Makefile
+++ b/src/common/Makefile
@@ -48,6 +48,7 @@ LIBS += $(PTHREAD_LIBS)
 OBJS_COMMON = \
 	archive.o \
 	base64.o \
+	blkreftable.o \
 	checksum_helper.o \
 	compression.o \
 	config_info.o \
diff --git a/src/common/blkreftable.c b/src/common/blkreftable.c
new file mode 100644
index 0000000000..012a443584
--- /dev/null
+++ b/src/common/blkreftable.c
@@ -0,0 +1,1309 @@
+/*-------------------------------------------------------------------------
+ *
+ * blkreftable.c
+ *	  Block reference tables.
+ * + * A block reference table is used to keep track of which blocks have + * been modified by WAL records within a certain LSN range. + * + * For each relation fork, we keep track of all blocks that have appeared + * in block reference in the WAL. We also keep track of the "limit block", + * which is the smallest relation length in blocks known to have occurred + * during that range of WAL records. This should be set to 0 if the relation + * fork is created or destroyed, and to the post-truncation length if + * truncated. + * + * Whenever we set the limit block, we also forget about any modified blocks + * beyond that point. Those blocks don't exist any more. Such blocks can + * later be marked as modified again; if that happens, it means the relation + * was re-extended. + * + * Portions Copyright (c) 2010-2022, PostgreSQL Global Development Group + * + * src/common/blkreftable.c + * + *------------------------------------------------------------------------- + */ + + +#ifndef FRONTEND +#include "postgres.h" +#else +#include "postgres_fe.h" +#endif + +#ifdef FRONTEND +#include "common/logging.h" +#endif + +#include "common/blkreftable.h" +#include "common/hashfn.h" +#include "port/pg_crc32c.h" + +/* + * A block reference table keeps track of the status of each relation + * fork individually. + */ +typedef struct BlockRefTableKey +{ + RelFileLocator rlocator; + ForkNumber forknum; +} BlockRefTableKey; + +/* + * We could need to store data either for a relation in which only a + * tiny fraction of the blocks have been modified or for a relation in + * which nearly every block has been modified, and we want a + * space-efficient representation in both cases. To accomplish this, + * we divide the relation into chunks of 2^16 blocks and choose between + * an array representation and a bitmap representation for each chunk. 
+ * + * When the number of modified blocks in a given chunk is small, we + * essentially store an array of block numbers, but we need not store the + * entire block number: instead, we store each block number as a 2-byte + * offset from the start of the chunk. + * + * When the number of modified blocks in a given chunk is large, we switch + * to a bitmap representation. + * + * These same basic representational choices are used both when a block + * reference table is stored in memory and when it is serialized to disk. + * + * In the in-memory representation, we initially allocate each chunk with + * space for a number of entries given by INITIAL_ENTRIES_PER_CHUNK and + * increase that as necessary until we reach MAX_ENTRIES_PER_CHUNK. + * Any chunk whose allocated size reaches MAX_ENTRIES_PER_CHUNK is converted + * to a bitmap, and thus never needs to grow further. + */ +#define BLOCKS_PER_CHUNK (1 << 16) +#define BLOCKS_PER_ENTRY (BITS_PER_BYTE * sizeof(uint16)) +#define MAX_ENTRIES_PER_CHUNK (BLOCKS_PER_CHUNK / BLOCKS_PER_ENTRY) +#define INITIAL_ENTRIES_PER_CHUNK 16 +typedef uint16 *BlockRefTableChunk; + +/* + * State for one relation fork. + * + * 'rlocator' and 'forknum' identify the relation fork to which this entry + * pertains. + * + * 'limit_block' is the shortest known length of the relation in blocks + * within the LSN range covered by a particular block reference table. + * It should be set to 0 if the relation fork is created or dropped. If the + * relation fork is truncated, it should be set to the number of blocks that + * remain after truncation. + * + * 'nchunks' is the allocated length of each of the three arrays that follow. + * We can only represent the status of block numbers less than nchunks * + * BLOCKS_PER_CHUNK. + * + * 'chunk_size' is an array storing the allocated size of each chunk. + * + * 'chunk_usage' is an array storing the number of elements used in each + * chunk. 
If that value is less than MAX_ENTRIES_PER_CHUNK, the corresonding + * chunk is used as an array; else the corresponding chunk is used as a bitmap. + * When used as a bitmap, the least significant bit of the first array element + * is the status of the lowest-numbered block covered by this chunk. + * + * 'chunk_data' is the array of chunks. + */ +struct BlockRefTableEntry +{ + BlockRefTableKey key; + BlockNumber limit_block; + char status; + uint32 nchunks; + uint16 *chunk_size; + uint16 *chunk_usage; + BlockRefTableChunk *chunk_data; +}; + +/* Declare and define a hash table over type BlockRefTableEntry. */ +#define SH_PREFIX blockreftable +#define SH_ELEMENT_TYPE BlockRefTableEntry +#define SH_KEY_TYPE BlockRefTableKey +#define SH_KEY key +#define SH_HASH_KEY(tb, key) \ + hash_bytes((const unsigned char *) &key, sizeof(BlockRefTableKey)) +#define SH_EQUAL(tb, a, b) memcmp(&a, &b, sizeof(BlockRefTableKey)) == 0 +#define SH_SCOPE static inline +#ifdef FRONTEND +#define SH_RAW_ALLOCATOR pg_malloc0 +#endif +#define SH_DEFINE +#define SH_DECLARE +#include "lib/simplehash.h" + +/* + * A block reference table is basically just the hash table, but we don't + * want to expose that to outside callers. + * + * We keep track of the memory context in use explicitly too, so that it's + * easy to place all of our allocations in the same context. + */ +struct BlockRefTable +{ + blockreftable_hash *hash; +#ifndef FRONTEND + MemoryContext mcxt; +#endif +}; + +/* + * On-disk serialization format for block reference table entries. + */ +typedef struct BlockRefTableSerializedEntry +{ + RelFileLocator rlocator; + ForkNumber forknum; + BlockNumber limit_block; + uint32 nchunks; +} BlockRefTableSerializedEntry; + +/* + * Buffer size, so that we avoid doing many small I/Os. + */ +#define BUFSIZE 65536 + +/* + * Ad-hoc buffer for file I/O. 
+ */ +typedef struct BlockRefTableBuffer +{ + io_callback_fn io_callback; + void *io_callback_arg; + char data[BUFSIZE]; + int used; + int cursor; + pg_crc32c crc; +} BlockRefTableBuffer; + +/* + * State for keeping track of progress while incrementally reading a block + * table reference file from disk. + * + * total_chunks means the number of chunks for the RelFileLocator/ForkNumber + * combination that is curently being read, and consumed_chunks is the number + * of those that have been read. (We always read all the information for + * a single chunk at one time, so we don't need to be able to represent the + * state where a chunk has been partially read.) + * + * chunk_size is the array of chunk sizes. The length is given by total_chunks. + * + * chunk_data holds the current chunk. + * + * chunk_position helps us figure out how much progress we've made in returning + * the block numbers for the current chunk to the caller. If the chunk is a + * bitmap, it's the number of bits we've scanned; otherwise, it's the number + * of chunk entries we've scanned. + */ +struct BlockRefTableReader +{ + BlockRefTableBuffer buffer; + char *error_filename; + report_error_fn error_callback; + void *error_callback_arg; + uint32 total_chunks; + uint32 consumed_chunks; + uint16 *chunk_size; + uint16 chunk_data[MAX_ENTRIES_PER_CHUNK]; + uint32 chunk_position; +}; + +/* + * State for keeping track of progress while incrementally writing a block + * reference table file to disk. + */ +struct BlockRefTableWriter +{ + BlockRefTableBuffer buffer; +}; + +/* Function prototypes. 
*/ +static int BlockRefTableComparator(const void *a, const void *b); +static void BlockRefTableFlush(BlockRefTableBuffer *buffer); +static void BlockRefTableRead(BlockRefTableReader *reader, void *data, + int length); +static void BlockRefTableWrite(BlockRefTableBuffer *buffer, void *data, + int length); +static void BlockRefTableFileTerminate(BlockRefTableBuffer *buffer); + +/* + * Create an empty block reference table. + */ +BlockRefTable * +CreateEmptyBlockRefTable(void) +{ + BlockRefTable *brtab = palloc(sizeof(BlockRefTable)); + + /* + * Even completely empty database has a few hundred relation forks, so it + * seems best to size the hash on the assumption that we're going to have + * at least a few thousand entries. + */ +#ifdef FRONTEND + brtab->hash = blockreftable_create(4096, NULL); +#else + brtab->mcxt = CurrentMemoryContext; + brtab->hash = blockreftable_create(brtab->mcxt, 4096, NULL); +#endif + + return brtab; +} + +/* + * Set the "limit block" for a relation fork and forget any modified blocks + * with equal or higher block numbers. + * + * The "limit block" is the shortest known length of the relation within the + * range of WAL records covered by this block reference table. + */ +void +BlockRefTableSetLimitBlock(BlockRefTable *brtab, + const RelFileLocator *rlocator, + ForkNumber forknum, + BlockNumber limit_block) +{ + BlockRefTableEntry *brtentry; + BlockRefTableKey key; + bool found; + + memcpy(&key.rlocator, rlocator, sizeof(RelFileLocator)); + key.forknum = forknum; + brtentry = blockreftable_insert(brtab->hash, key, &found); + + if (!found) + { + /* + * We have no existing data about this relation fork, so just record + * the limit_block value supplied by the caller, and make sure other + * parts of the entry are properly initialized. 
+ */ + brtentry->limit_block = limit_block; + brtentry->nchunks = 0; + brtentry->chunk_size = NULL; + brtentry->chunk_usage = NULL; + brtentry->chunk_data = NULL; + return; + } + + BlockRefTableEntrySetLimitBlock(brtentry, limit_block); +} + +/* + * Mark a block in a given relation fork as known to have been modified. + */ +void +BlockRefTableMarkBlockModified(BlockRefTable *brtab, + const RelFileLocator *rlocator, + ForkNumber forknum, + BlockNumber blknum) +{ + BlockRefTableEntry *brtentry; + BlockRefTableKey key; + bool found; +#ifndef FRONTEND + MemoryContext oldcontext = MemoryContextSwitchTo(brtab->mcxt); +#endif + + memcpy(&key.rlocator, rlocator, sizeof(RelFileLocator)); + key.forknum = forknum; + brtentry = blockreftable_insert(brtab->hash, key, &found); + + if (!found) + { + /* + * We want to set the initial limit block value to something higher + * than any legal block number. InvalidBlockNumber fits the bill. + */ + brtentry->limit_block = InvalidBlockNumber; + brtentry->nchunks = 0; + brtentry->chunk_size = NULL; + brtentry->chunk_usage = NULL; + brtentry->chunk_data = NULL; + } + + BlockRefTableEntryMarkBlockModified(brtentry, forknum, blknum); + +#ifndef FRONTEND + MemoryContextSwitchTo(oldcontext); +#endif +} + +/* + * Get an entry from a block reference table. + * + * If the entry does not exist, this function returns NULL. Otherwise, it + * returns the entry and sets *limit_block to the value from the entry. + */ +BlockRefTableEntry * +BlockRefTableGetEntry(BlockRefTable *brtab, const RelFileLocator *rlocator, + ForkNumber forknum, BlockNumber *limit_block) +{ + BlockRefTableKey key; + BlockRefTableEntry *entry; + + Assert(limit_block != NULL); + + memcpy(&key.rlocator, rlocator, sizeof(RelFileLocator)); + key.forknum = forknum; + entry = blockreftable_lookup(brtab->hash, key); + + if (entry != NULL) + *limit_block = entry->limit_block; + + return entry; +} + +/* + * Get block numbers from a table entry. 
+ * + * 'blocks' must point to enough space to hold at least 'nblocks' block + * numbers, and any block numbers we manage to get will be written there. + * The return value is the number of block numbers actually written. + * + * We do not return block numbers unless they are greater than or equal to + * start_blkno and strictly less than stop_blkno. + */ +int +BlockRefTableEntryGetBlocks(BlockRefTableEntry *entry, + BlockNumber start_blkno, + BlockNumber stop_blkno, + BlockNumber *blocks, + int nblocks) +{ + uint32 start_chunkno; + uint32 stop_chunkno; + uint32 chunkno; + int nresults = 0; + + Assert(entry != NULL); + + /* + * Figure out which chunks could potentially contain blocks of interest. + * + * We need to be careful about overflow here, because stop_blkno could be + * InvalidBlockNumber or something very close to it. + */ + start_chunkno = start_blkno / BLOCKS_PER_CHUNK; + stop_chunkno = stop_blkno / BLOCKS_PER_CHUNK; + if ((stop_blkno % BLOCKS_PER_CHUNK) != 0) + ++stop_chunkno; + if (stop_chunkno > entry->nchunks) + stop_chunkno = entry->nchunks; + + /* + * Loop over chunks. + */ + for (chunkno = start_chunkno; chunkno < stop_chunkno; ++chunkno) + { + uint16 chunk_usage = entry->chunk_usage[chunkno]; + BlockRefTableChunk chunk_data = entry->chunk_data[chunkno]; + unsigned start_offset = 0; + unsigned stop_offset = BLOCKS_PER_CHUNK; + + /* + * If the start and/or stop block number falls within this chunk, the + * whole chunk may not be of interest. Figure out which portion we + * care about, if it's not the whole thing. + */ + if (chunkno == start_chunkno) + start_offset = start_blkno % BLOCKS_PER_CHUNK; + if (chunkno == stop_chunkno) + stop_offset = stop_blkno % BLOCKS_PER_CHUNK; + + /* + * Handling differs depending on whether this is an array of offsets + * or a bitmap. + */ + if (chunk_usage == MAX_ENTRIES_PER_CHUNK) + { + unsigned i; + + /* It's a bitmap, so test every relevant bit. 
*/ + for (i = start_offset; i < BLOCKS_PER_CHUNK; ++i) + { + uint16 w = chunk_data[i / BLOCKS_PER_ENTRY]; + + if ((w & (1 << (i % BLOCKS_PER_ENTRY))) != 0) + { + BlockNumber blkno = chunkno * BLOCKS_PER_CHUNK + i; + + blocks[nresults++] = blkno; + + /* Early exit if we run out of output space. */ + if (nresults == nblocks) + return nresults; + } + } + } + else + { + unsigned i; + + /* It's an array of offsets, so check each one. */ + for (i = 0; i < chunk_usage; ++i) + { + uint16 offset = chunk_data[i]; + + if (offset >= start_offset && offset < stop_offset) + { + BlockNumber blkno = chunkno * BLOCKS_PER_CHUNK + offset; + + blocks[nresults++] = blkno; + + /* Early exit if we run out of output space. */ + if (nresults == nblocks) + return nresults; + } + } + } + } + + return nresults; +} + +/* + * Serialize a block reference table to a file. + */ +void +WriteBlockRefTable(BlockRefTable *brtab, + io_callback_fn write_callback, + void *write_callback_arg) +{ + BlockRefTableSerializedEntry *sdata = NULL; + BlockRefTableBuffer buffer; + uint32 magic = BLOCKREFTABLE_MAGIC; + + /* Prepare buffer. */ + memset(&buffer, 0, sizeof(BlockRefTableBuffer)); + buffer.io_callback = write_callback; + buffer.io_callback_arg = write_callback_arg; + INIT_CRC32C(buffer.crc); + + /* Write magic number. */ + BlockRefTableWrite(&buffer, &magic, sizeof(uint32)); + + /* Write the entries, assuming there are some. */ + if (brtab->hash->members > 0) + { + unsigned i = 0; + blockreftable_iterator it; + BlockRefTableEntry *brtentry; + + /* Extract entries into serializable format and sort them. 
*/ + sdata = + palloc(brtab->hash->members * sizeof(BlockRefTableSerializedEntry)); + blockreftable_start_iterate(brtab->hash, &it); + while ((brtentry = blockreftable_iterate(brtab->hash, &it)) != NULL) + { + BlockRefTableSerializedEntry *sentry = &sdata[i++]; + + sentry->rlocator = brtentry->key.rlocator; + sentry->forknum = brtentry->key.forknum; + sentry->limit_block = brtentry->limit_block; + sentry->nchunks = brtentry->nchunks; + + /* trim trailing zero entries */ + while (sentry->nchunks > 0 && + brtentry->chunk_usage[sentry->nchunks - 1] == 0) + sentry->nchunks--; + } + Assert(i == brtab->hash->members); + qsort(sdata, i, sizeof(BlockRefTableSerializedEntry), + BlockRefTableComparator); + + /* Loop over entries in sorted order and serialize each one. */ + for (i = 0; i < brtab->hash->members; ++i) + { + BlockRefTableSerializedEntry *sentry = &sdata[i]; + BlockRefTableEntry *brtentry; + BlockRefTableKey key; + unsigned j; + + /* Write the serialized entry itself. */ + BlockRefTableWrite(&buffer, sentry, + sizeof(BlockRefTableSerializedEntry)); + + /* Look up the original entry so we can access the chunks. */ + memcpy(&key.rlocator, &sentry->rlocator, sizeof(RelFileLocator)); + key.forknum = sentry->forknum; + brtentry = blockreftable_lookup(brtab->hash, key); + Assert(brtentry != NULL); + + /* Write the untruncated portion of the chunk length array. */ + if (sentry->nchunks != 0) + BlockRefTableWrite(&buffer, brtentry->chunk_usage, + sentry->nchunks * sizeof(uint16)); + + /* Write the contents of each chunk. */ + for (j = 0; j < brtentry->nchunks; ++j) + { + if (brtentry->chunk_usage[j] == 0) + continue; + BlockRefTableWrite(&buffer, brtentry->chunk_data[j], + brtentry->chunk_usage[j] * sizeof(uint16)); + } + } + } + + /* Write out appropriate terminator and CRC and flush buffer. */ + BlockRefTableFileTerminate(&buffer); +} + +/* + * Prepare to incrementally read a block reference table file. 
+ *
+ * 'read_callback' is a function that can be called to read data from the
+ * underlying file (or other data source) into our internal buffer.
+ *
+ * 'read_callback_arg' is an opaque argument to be passed to read_callback.
+ *
+ * 'error_filename' is the filename that should be included in error messages
+ * if the file is found to be malformed. The value is not copied, so the
+ * caller should ensure that it remains valid until done with this
+ * BlockRefTableReader.
+ *
+ * 'error_callback' is a function to be called if the file is found to be
+ * malformed. This is not used for I/O errors, which must be handled internally
+ * by read_callback.
+ *
+ * 'error_callback_arg' is an opaque argument to be passed to error_callback.
+ */
+BlockRefTableReader *
+CreateBlockRefTableReader(io_callback_fn read_callback,
+						  void *read_callback_arg,
+						  char *error_filename,
+						  report_error_fn error_callback,
+						  void *error_callback_arg)
+{
+	BlockRefTableReader *reader;
+	uint32		magic;
+
+	/* Initialize data structure. */
+	reader = palloc0(sizeof(BlockRefTableReader));
+	reader->buffer.io_callback = read_callback;
+	reader->buffer.io_callback_arg = read_callback_arg;
+	reader->error_filename = error_filename;
+	reader->error_callback = error_callback;
+	reader->error_callback_arg = error_callback_arg;
+	INIT_CRC32C(reader->buffer.crc);
+
+	/* Verify magic number. */
+	BlockRefTableRead(reader, &magic, sizeof(uint32));
+	if (magic != BLOCKREFTABLE_MAGIC)
+		error_callback(error_callback_arg,
+					   "file \"%s\" has wrong magic number: expected %u, found %u",
+					   error_filename,
+					   BLOCKREFTABLE_MAGIC, magic);
+
+	return reader;
+}
+
+/*
+ * Read next relation fork covered by this block reference table file.
+ *
+ * After calling this function, you must call BlockRefTableReaderGetBlocks
+ * until it returns 0 before calling it again.
+ */ +bool +BlockRefTableReaderNextRelation(BlockRefTableReader *reader, + RelFileLocator *rlocator, + ForkNumber *forknum, + BlockNumber *limit_block) +{ + BlockRefTableSerializedEntry sentry; + BlockRefTableSerializedEntry zentry = {0}; + + /* + * Sanity check: caller must read all blocks from all chunks before moving + * on to the next relation. + */ + Assert(reader->total_chunks == reader->consumed_chunks); + + /* Read serialized entry. */ + BlockRefTableRead(reader, &sentry, + sizeof(BlockRefTableSerializedEntry)); + + /* + * If we just read the sentinel entry indicating that we've reached the + * end, read and check the CRC. + */ + if (memcmp(&sentry, &zentry, sizeof(BlockRefTableSerializedEntry)) == 0) + { + pg_crc32c expected_crc; + pg_crc32c actual_crc; + + /* + * We want to know the CRC of the file excluding the 4-byte CRC + * itself, so copy the current value of the CRC accumulator before + * reading those bytes, and use the copy to finalize the calculation. + */ + expected_crc = reader->buffer.crc; + FIN_CRC32C(expected_crc); + + /* Now we can read the actual value. */ + BlockRefTableRead(reader, &actual_crc, sizeof(pg_crc32c)); + + /* Throw an error if there is a mismatch. */ + if (!EQ_CRC32C(expected_crc, actual_crc)) + reader->error_callback(reader->error_callback_arg, + "file \"%s\" has wrong checksum: expected %08X, found %08X", + reader->error_filename, expected_crc, actual_crc); + + return false; + } + + /* Read chunk size array. */ + if (reader->chunk_size != NULL) + pfree(reader->chunk_size); + reader->chunk_size = palloc(sentry.nchunks * sizeof(uint16)); + BlockRefTableRead(reader, reader->chunk_size, + sentry.nchunks * sizeof(uint16)); + + /* Set up for chunk scan. */ + reader->total_chunks = sentry.nchunks; + reader->consumed_chunks = 0; + + /* Return data to caller. 
*/ + memcpy(rlocator, &sentry.rlocator, sizeof(RelFileLocator)); + *forknum = sentry.forknum; + *limit_block = sentry.limit_block; + return true; +} + +/* + * Get modified blocks associated with the relation fork returned by + * the most recent call to BlockRefTableReaderNextRelation. + * + * On return, block numbers will be written into the 'blocks' array, whose + * length should be passed via 'nblocks'. The return value is the number of + * entries actually written into the 'blocks' array, which may be less than + * 'nblocks' if we run out of modified blocks in the relation fork before + * we run out of room in the array. + */ +unsigned +BlockRefTableReaderGetBlocks(BlockRefTableReader *reader, + BlockNumber *blocks, + int nblocks) +{ + unsigned blocks_found = 0; + + /* Must provide space for at least one block number to be returned. */ + Assert(nblocks > 0); + + /* Loop collecting blocks to return to caller. */ + for (;;) + { + uint16 next_chunk_size; + + /* + * If we've read at least one chunk, maybe it contains some block + * numbers that could satisfy caller's request. + */ + if (reader->consumed_chunks > 0) + { + uint32 chunkno = reader->consumed_chunks - 1; + uint16 chunk_size = reader->chunk_size[chunkno]; + + if (chunk_size == MAX_ENTRIES_PER_CHUNK) + { + /* Bitmap format, so search for bits that are set. */ + while (reader->chunk_position < BLOCKS_PER_CHUNK && + blocks_found < nblocks) + { + uint16 chunkoffset = reader->chunk_position; + uint16 w; + + w = reader->chunk_data[chunkoffset / BLOCKS_PER_ENTRY]; + if ((w & (1u << (chunkoffset % BLOCKS_PER_ENTRY))) != 0) + blocks[blocks_found++] = + chunkno * BLOCKS_PER_CHUNK + chunkoffset; + ++reader->chunk_position; + } + } + else + { + /* Not in bitmap format, so each entry is a 2-byte offset. 
*/ + while (reader->chunk_position < chunk_size && + blocks_found < nblocks) + { + blocks[blocks_found++] = chunkno * BLOCKS_PER_CHUNK + + reader->chunk_data[reader->chunk_position]; + ++reader->chunk_position; + } + } + } + + /* We found enough blocks, so we're done. */ + if (blocks_found >= nblocks) + break; + + /* + * We didn't find enough blocks, so we must need the next chunk. If + * there are none left, though, then we're done anyway. + */ + if (reader->consumed_chunks == reader->total_chunks) + break; + + /* + * Read data for next chunk and reset scan position to beginning of + * chunk. Note that the next chunk might be empty, in which case we + * consume the chunk without actually consuming any bytes from the + * underlying file. + */ + next_chunk_size = reader->chunk_size[reader->consumed_chunks]; + if (next_chunk_size > 0) + BlockRefTableRead(reader, reader->chunk_data, + next_chunk_size * sizeof(uint16)); + ++reader->consumed_chunks; + reader->chunk_position = 0; + } + + return blocks_found; +} + +/* + * Release memory used while reading a block reference table from a file. + */ +void +DestroyBlockRefTableReader(BlockRefTableReader *reader) +{ + if (reader->chunk_size != NULL) + { + pfree(reader->chunk_size); + reader->chunk_size = NULL; + } + pfree(reader); +} + +/* + * Prepare to write a block reference table file incrementally. + * + * Caller must be able to supply BlockRefTableEntry objects sorted in the + * appropriate order. + */ +BlockRefTableWriter * +CreateBlockRefTableWriter(io_callback_fn write_callback, + void *write_callback_arg) +{ + BlockRefTableWriter *writer; + uint32 magic = BLOCKREFTABLE_MAGIC; + + /* Prepare buffer and CRC check and save callbacks. */ + writer = palloc0(sizeof(BlockRefTableWriter)); + writer->buffer.io_callback = write_callback; + writer->buffer.io_callback_arg = write_callback_arg; + INIT_CRC32C(writer->buffer.crc); + + /* Write magic number. 
*/ + BlockRefTableWrite(&writer->buffer, &magic, sizeof(uint32)); + + return writer; +} + +/* + * Append one entry to a block reference table file. + * + * Note that entries must be written in the proper order, that is, sorted by + * tablespace, then database, then relfilenumber, then fork number. Caller + * is responsible for supplying data in the correct order. If that seems hard, + * use an in-memory BlockRefTable instead. + */ +void +BlockRefTableWriteEntry(BlockRefTableWriter *writer, BlockRefTableEntry *entry) +{ + BlockRefTableSerializedEntry sentry; + unsigned j; + + /* Convert to serialized entry format. */ + sentry.rlocator = entry->key.rlocator; + sentry.forknum = entry->key.forknum; + sentry.limit_block = entry->limit_block; + sentry.nchunks = entry->nchunks; + + /* Trim trailing zero entries. */ + while (sentry.nchunks > 0 && entry->chunk_usage[sentry.nchunks - 1] == 0) + sentry.nchunks--; + + /* Write the serialized entry itself. */ + BlockRefTableWrite(&writer->buffer, &sentry, + sizeof(BlockRefTableSerializedEntry)); + + /* Write the untruncated portion of the chunk length array. */ + if (sentry.nchunks != 0) + BlockRefTableWrite(&writer->buffer, entry->chunk_usage, + sentry.nchunks * sizeof(uint16)); + + /* Write the contents of each chunk. */ + for (j = 0; j < entry->nchunks; ++j) + { + if (entry->chunk_usage[j] == 0) + continue; + BlockRefTableWrite(&writer->buffer, entry->chunk_data[j], + entry->chunk_usage[j] * sizeof(uint16)); + } +} + +/* + * Finalize an incremental write of a block reference table file. + */ +void +DestroyBlockRefTableWriter(BlockRefTableWriter *writer) +{ + BlockRefTableFileTerminate(&writer->buffer); + pfree(writer); +} + +/* + * Allocate a standalone BlockRefTableEntry. + * + * When we're manipulating a full in-memory BlockRefTable, the entries are + * part of the hash table and are allocated by simplehash. 
This routine is + * used by callers that want to write out a BlockRefTable to a file without + * needing to store the whole thing in memory at once. + * + * Entries allocated by this function can be manipulated using the functions + * BlockRefTableEntrySetLimitBlock and BlockRefTableEntryMarkBlockModified + * and then written using BlockRefTableWriteEntry and freed using + * BlockRefTableFreeEntry. + */ +BlockRefTableEntry * +CreateBlockRefTableEntry(RelFileLocator rlocator, ForkNumber forknum) +{ + BlockRefTableEntry *entry = palloc0(sizeof(BlockRefTableEntry)); + + memcpy(&entry->key.rlocator, &rlocator, sizeof(RelFileLocator)); + entry->key.forknum = forknum; + entry->limit_block = InvalidBlockNumber; + + return entry; +} + +/* + * Update a BlockRefTableEntry with a new value for the "limit block" and + * forget any equal-or-higher-numbered modified blocks. + * + * The "limit block" is the shortest known length of the relation within the + * range of WAL records covered by this block reference table. + */ +void +BlockRefTableEntrySetLimitBlock(BlockRefTableEntry *entry, + BlockNumber limit_block) +{ + unsigned chunkno; + unsigned limit_chunkno; + unsigned limit_chunkoffset; + BlockRefTableChunk limit_chunk; + + /* If we already have an equal or lower limit block, do nothing. */ + if (limit_block >= entry->limit_block) + return; + + /* Record the new limit block value. */ + entry->limit_block = limit_block; + + /* + * Figure out which chunk would store the state of the new limit block, + * and which offset within that chunk. + */ + limit_chunkno = limit_block / BLOCKS_PER_CHUNK; + limit_chunkoffset = limit_block % BLOCKS_PER_CHUNK; + + /* + * If the number of chunks is not large enough for any blocks with equal + * or higher block numbers to exist, then there is nothing further to do. + */ + if (limit_chunkno >= entry->nchunks) + return; + + /* Discard entire contents of any higher-numbered chunks. 
*/
+	for (chunkno = limit_chunkno + 1; chunkno < entry->nchunks; ++chunkno)
+		entry->chunk_usage[chunkno] = 0;
+
+	/*
+	 * Next, we need to discard any offsets within the chunk that would
+	 * contain the limit_block. We must handle this differently depending on
+	 * whether the chunk that would contain limit_block is a bitmap or an
+	 * array of offsets.
+	 */
+	limit_chunk = entry->chunk_data[limit_chunkno];
+	if (entry->chunk_usage[limit_chunkno] == MAX_ENTRIES_PER_CHUNK)
+	{
+		unsigned	chunkoffset;
+
+		/* It's a bitmap. Unset bits. */
+		for (chunkoffset = limit_chunkoffset; chunkoffset < BLOCKS_PER_CHUNK;
+			 ++chunkoffset)
+			limit_chunk[chunkoffset / BLOCKS_PER_ENTRY] &=
+				~(1 << (chunkoffset % BLOCKS_PER_ENTRY));
+	}
+	else
+	{
+		unsigned	i,
+					j = 0;
+
+		/* It's an offset array. Filter out large offsets. */
+		for (i = 0; i < entry->chunk_usage[limit_chunkno]; ++i)
+		{
+			Assert(j <= i);
+			if (limit_chunk[i] < limit_chunkoffset)
+				limit_chunk[j++] = limit_chunk[i];
+		}
+		Assert(j <= entry->chunk_usage[limit_chunkno]);
+		entry->chunk_usage[limit_chunkno] = j;
+	}
+}
+
+/*
+ * Mark a block in a given BlockRefTableEntry as known to have been modified.
+ */
+void
+BlockRefTableEntryMarkBlockModified(BlockRefTableEntry *entry,
+									ForkNumber forknum,
+									BlockNumber blknum)
+{
+	unsigned	chunkno;
+	unsigned	chunkoffset;
+	unsigned	i;
+
+	/*
+	 * Which chunk should store the state of this block? And what is the
+	 * offset of this block relative to the start of that chunk?
+	 */
+	chunkno = blknum / BLOCKS_PER_CHUNK;
+	chunkoffset = blknum % BLOCKS_PER_CHUNK;
+
+	/*
+	 * If 'nchunks' isn't big enough for us to be able to represent the state
+	 * of this block, we need to enlarge our arrays.
+	 */
+	if (chunkno >= entry->nchunks)
+	{
+		unsigned	max_chunks;
+		unsigned	extra_chunks;
+
+		/*
+		 * New array size is a power of 2, at least 16, big enough so that
+		 * chunkno will be a valid array index.
+ */ + max_chunks = Max(16, entry->nchunks); + while (max_chunks < chunkno + 1) + chunkno *= 2; + Assert(max_chunks > chunkno); + extra_chunks = max_chunks - entry->nchunks; + + if (entry->nchunks == 0) + { + entry->chunk_size = palloc0(sizeof(uint16) * max_chunks); + entry->chunk_usage = palloc0(sizeof(uint16) * max_chunks); + entry->chunk_data = + palloc0(sizeof(BlockRefTableChunk) * max_chunks); + } + else + { + entry->chunk_size = repalloc(entry->chunk_size, + sizeof(uint16) * max_chunks); + memset(&entry->chunk_size[entry->nchunks], 0, + extra_chunks * sizeof(uint16)); + entry->chunk_usage = repalloc(entry->chunk_usage, + sizeof(uint16) * max_chunks); + memset(&entry->chunk_usage[entry->nchunks], 0, + extra_chunks * sizeof(uint16)); + entry->chunk_data = repalloc(entry->chunk_data, + sizeof(BlockRefTableChunk) * max_chunks); + memset(&entry->chunk_data[entry->nchunks], 0, + extra_chunks * sizeof(BlockRefTableChunk)); + } + entry->nchunks = max_chunks; + } + + /* + * If the chunk that covers this block number doesn't exist yet, create it + * as an array and add the appropriate offset to it. We make it pretty + * small initially, because there might only be 1 or a few block + * references in this chunk and we don't want to use up too much memory. + */ + if (entry->chunk_size[chunkno] == 0) + { + entry->chunk_data[chunkno] = + palloc(sizeof(uint16) * INITIAL_ENTRIES_PER_CHUNK); + entry->chunk_size[chunkno] = INITIAL_ENTRIES_PER_CHUNK; + entry->chunk_data[chunkno][0] = chunkoffset; + entry->chunk_usage[chunkno] = 1; + return; + } + + /* + * If the number of entries in this chunk is already maximum, it must be a + * bitmap. Just set the appropriate bit. + */ + if (entry->chunk_usage[chunkno] == MAX_ENTRIES_PER_CHUNK) + { + BlockRefTableChunk chunk = entry->chunk_data[chunkno]; + + chunk[chunkoffset / BLOCKS_PER_ENTRY] |= + 1 << (chunkoffset % BLOCKS_PER_ENTRY); + return; + } + + /* + * There is an existing chunk and it's in array format. 
Let's find out + * whether it already has an entry for this block. If so, we do not need + * to do anything. + */ + for (i = 0; i < entry->chunk_usage[chunkno]; ++i) + { + if (entry->chunk_data[chunkno][i] == chunkoffset) + return; + } + + /* + * If the number of entries currently used is one less than the maximum, + * it's time to convert to bitmap format. + */ + if (entry->chunk_usage[chunkno] == MAX_ENTRIES_PER_CHUNK - 1) + { + BlockRefTableChunk newchunk; + unsigned j; + + /* Allocate a new chunk. */ + newchunk = palloc0(MAX_ENTRIES_PER_CHUNK * sizeof(uint16)); + + /* Set the bit for each existing entry. */ + for (j = 0; j < entry->chunk_usage[chunkno]; ++j) + { + unsigned coff = entry->chunk_data[chunkno][j]; + + newchunk[coff / BLOCKS_PER_ENTRY] |= + 1 << (coff % BLOCKS_PER_ENTRY); + } + + /* Set the bit for the new entry. */ + newchunk[chunkoffset / BLOCKS_PER_ENTRY] |= + 1 << (chunkoffset % BLOCKS_PER_ENTRY); + + /* Swap the new chunk into place and update metadata. */ + pfree(entry->chunk_data[chunkno]); + entry->chunk_data[chunkno] = newchunk; + entry->chunk_size[chunkno] = MAX_ENTRIES_PER_CHUNK; + entry->chunk_usage[chunkno] = MAX_ENTRIES_PER_CHUNK; + return; + } + + /* + * OK, we currently have an array, and we don't need to convert to a + * bitmap, but we do need to add a new element. If there's not enough + * room, we'll have to expand the array. + */ + if (entry->chunk_usage[chunkno] == entry->chunk_size[chunkno]) + { + unsigned newsize = entry->chunk_size[chunkno] * 2; + + Assert(newsize <= MAX_ENTRIES_PER_CHUNK); + entry->chunk_data[chunkno] = repalloc(entry->chunk_data[chunkno], + newsize * sizeof(uint16)); + entry->chunk_size[chunkno] = newsize; + } + + /* Now we can add the new entry. */ + entry->chunk_data[chunkno][entry->chunk_usage[chunkno]] = + chunkoffset; + entry->chunk_usage[chunkno]++; +} + +/* + * Release memory for a BlockRefTablEntry that was created by + * CreateBlockRefTableEntry. 
+ */ +void +BlockRefTableFreeEntry(BlockRefTableEntry *entry) +{ + if (entry->chunk_size != NULL) + { + pfree(entry->chunk_size); + entry->chunk_size = NULL; + } + + if (entry->chunk_usage != NULL) + { + pfree(entry->chunk_usage); + entry->chunk_usage = NULL; + } + + if (entry->chunk_data != NULL) + { + pfree(entry->chunk_data); + entry->chunk_data = NULL; + } + + pfree(entry); +} + +/* + * Comparator for BlockRefTableSerializedEntry objects. + * + * We make the tablespace OID the first column of the sort key to match + * the on-disk tree structure. + */ +static int +BlockRefTableComparator(const void *a, const void *b) +{ + const BlockRefTableSerializedEntry *sa = a; + const BlockRefTableSerializedEntry *sb = b; + + if (sa->rlocator.spcOid > sb->rlocator.spcOid) + return 1; + if (sa->rlocator.spcOid < sb->rlocator.spcOid) + return -1; + + if (sa->rlocator.dbOid > sb->rlocator.dbOid) + return 1; + if (sa->rlocator.dbOid < sb->rlocator.dbOid) + return -1; + + if (sa->rlocator.relNumber > sb->rlocator.relNumber) + return 1; + if (sa->rlocator.relNumber < sb->rlocator.relNumber) + return -1; + + if (sa->forknum > sb->forknum) + return 1; + if (sa->forknum < sb->forknum) + return -1; + + return 0; +} + +/* + * Flush any buffered data out of a BlockRefTableBuffer. + */ +static void +BlockRefTableFlush(BlockRefTableBuffer *buffer) +{ + buffer->io_callback(buffer->io_callback_arg, buffer->data, buffer->used); + buffer->used = 0; +} + +/* + * Read data from a BlockRefTableBuffer, and update the running CRC + * calculation for the returned data (but not any data that we may have + * buffered but not yet actually returned). + */ +static void +BlockRefTableRead(BlockRefTableReader *reader, void *data, int length) +{ + BlockRefTableBuffer *buffer = &reader->buffer; + + /* Loop until read is fully satisfied. */ + while (length > 0) + { + if (buffer->cursor < buffer->used) + { + /* + * If any buffered data is available, use that to satisfy as much + * of the request as possible. 
+ */ + int bytes_to_copy = Min(length, buffer->used - buffer->cursor); + + memcpy(data, &buffer->data[buffer->cursor], bytes_to_copy); + COMP_CRC32C(buffer->crc, &buffer->data[buffer->cursor], + bytes_to_copy); + buffer->cursor += bytes_to_copy; + data = ((char *) data) + bytes_to_copy; + length -= bytes_to_copy; + } + else if (length >= BUFSIZE) + { + /* + * If the request length is long, read directly into caller's + * buffer. + */ + int bytes_read; + + bytes_read = buffer->io_callback(buffer->io_callback_arg, + data, length); + COMP_CRC32C(buffer->crc, data, bytes_read); + data = ((char *) data) + bytes_read; + length -= bytes_read; + + /* If we didn't get anything, that's bad. */ + if (bytes_read == 0) + reader->error_callback(reader->error_callback_arg, + "file \"%s\" ends unexpectedly", + reader->error_filename); + } + else + { + /* + * Refill our buffer. + */ + buffer->used = buffer->io_callback(buffer->io_callback_arg, + buffer->data, BUFSIZE); + buffer->cursor = 0; + + /* If we didn't get anything, that's bad. */ + if (buffer->used == 0) + reader->error_callback(reader->error_callback_arg, + "file \"%s\" ends unexpectedly", + reader->error_filename); + } + } +} + +/* + * Supply data to a BlockRefTableBuffer for write to the underlying File, + * and update the running CRC calculation for that data. + */ +static void +BlockRefTableWrite(BlockRefTableBuffer *buffer, void *data, int length) +{ + /* Update running CRC calculation. */ + COMP_CRC32C(buffer->crc, data, length); + + /* If the new data can't fit into the buffer, flush the buffer. */ + if (buffer->used + length > BUFSIZE) + { + buffer->io_callback(buffer->io_callback_arg, buffer->data, + buffer->used); + buffer->used = 0; + } + + /* If the new data would fill the buffer, or more, write it directly. */ + if (length >= BUFSIZE) + { + buffer->io_callback(buffer->io_callback_arg, data, length); + return; + } + + /* Otherwise, copy the new data into the buffer. 
*/ + memcpy(&buffer->data[buffer->used], data, length); + buffer->used += length; + Assert(buffer->used <= BUFSIZE); +} + +/* + * Generate the sentinel and CRC required at the end of a block reference + * table file and flush them out of our internal buffer. + */ +static void +BlockRefTableFileTerminate(BlockRefTableBuffer *buffer) +{ + BlockRefTableSerializedEntry zentry = {0}; + pg_crc32c crc; + + /* Write a sentinel indicating that there are no more entries. */ + BlockRefTableWrite(buffer, &zentry, + sizeof(BlockRefTableSerializedEntry)); + + /* + * Writing the checksum will perturb the ongoing checksum calculation, so + * copy the state first and finalize the computation using the copy. + */ + crc = buffer->crc; + FIN_CRC32C(crc); + BlockRefTableWrite(buffer, &crc, sizeof(pg_crc32c)); + + /* Flush any leftover data out of our buffer. */ + BlockRefTableFlush(buffer); +} diff --git a/src/common/meson.build b/src/common/meson.build index cc6671edca..4ee0ea1f9d 100644 --- a/src/common/meson.build +++ b/src/common/meson.build @@ -3,6 +3,7 @@ common_sources = files( 'archive.c', 'base64.c', + 'blkreftable.c', 'checksum_helper.c', 'compression.c', 'controldata_utils.c', diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 48ca852381..fed5d790cc 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -206,6 +206,7 @@ extern int XLogFileOpen(XLogSegNo segno, TimeLineID tli); extern void CheckXLogRemoved(XLogSegNo segno, TimeLineID tli); extern XLogSegNo XLogGetLastRemovedSegno(void); +extern XLogSegNo XLogGetOldestSegno(TimeLineID tli); extern void XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN); extern void XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn); diff --git a/src/include/access/xlogbackup.h b/src/include/access/xlogbackup.h index 1611358137..90e04cad56 100644 --- a/src/include/access/xlogbackup.h +++ b/src/include/access/xlogbackup.h @@ -28,6 +28,8 @@ typedef struct BackupState XLogRecPtr checkpointloc; /* last checkpoint 
location */ pg_time_t starttime; /* backup start time */ bool started_in_recovery; /* backup started in recovery? */ + XLogRecPtr istartpoint; /* incremental based on backup at this LSN */ + TimeLineID istarttli; /* incremental based on backup on this TLI */ /* Fields saved at the end of backup */ XLogRecPtr stoppoint; /* backup stop WAL location */ diff --git a/src/include/backup/basebackup.h b/src/include/backup/basebackup.h index 1432d9c206..345bd22534 100644 --- a/src/include/backup/basebackup.h +++ b/src/include/backup/basebackup.h @@ -34,6 +34,9 @@ typedef struct int64 size; /* total size as sent; -1 if not known */ } tablespaceinfo; -extern void SendBaseBackup(BaseBackupCmd *cmd); +struct IncrementalBackupInfo; + +extern void SendBaseBackup(BaseBackupCmd *cmd, + struct IncrementalBackupInfo *ib); #endif /* _BASEBACKUP_H */ diff --git a/src/include/backup/basebackup_incremental.h b/src/include/backup/basebackup_incremental.h new file mode 100644 index 0000000000..c300235a2f --- /dev/null +++ b/src/include/backup/basebackup_incremental.h @@ -0,0 +1,56 @@ +/*------------------------------------------------------------------------- + * + * basebackup_incremental.h + * API for incremental backup support + * + * Portions Copyright (c) 2010-2022, PostgreSQL Global Development Group + * + * src/include/backup/basebackup_incremental.h + * + *------------------------------------------------------------------------- + */ +#ifndef BASEBACKUP_INCREMENTAL_H +#define BASEBACKUP_INCREMENTAL_H + +#include "access/xlogbackup.h" +#include "common/relpath.h" +#include "storage/block.h" +#include "utils/palloc.h" + +#define INCREMENTAL_MAGIC 0xd3ae1f0d + +typedef enum +{ + BACK_UP_FILE_FULLY, + BACK_UP_FILE_INCREMENTALLY, + DO_NOT_BACK_UP_FILE +} FileBackupMethod; + +struct IncrementalBackupInfo; +typedef struct IncrementalBackupInfo IncrementalBackupInfo; + +extern IncrementalBackupInfo *CreateIncrementalBackupInfo(MemoryContext); + +extern void 
AppendIncrementalManifestData(IncrementalBackupInfo *ib, + const char *data, + int len); +extern void FinalizeIncrementalManifest(IncrementalBackupInfo *ib); + +extern void PrepareForIncrementalBackup(IncrementalBackupInfo *ib, + BackupState *backup_state); + +extern char *GetIncrementalFilePath(Oid dboid, Oid spcoid, + RelFileNumber relfilenumber, + ForkNumber forknum, unsigned segno); +extern FileBackupMethod GetFileBackupMethod(IncrementalBackupInfo *ib, + char *path, + Oid dboid, Oid spcoid, + RelFileNumber relfilenumber, + ForkNumber forknum, + unsigned segno, size_t size, + unsigned *num_blocks_required, + BlockNumber *relative_block_numbers, + unsigned *truncation_block_length); +extern size_t GetIncrementalFileSize(unsigned num_blocks_required); + +#endif diff --git a/src/include/backup/walsummary.h b/src/include/backup/walsummary.h new file mode 100644 index 0000000000..d086e64019 --- /dev/null +++ b/src/include/backup/walsummary.h @@ -0,0 +1,49 @@ +/*------------------------------------------------------------------------- + * + * walsummary.h + * WAL summary management + * + * Portions Copyright (c) 2010-2022, PostgreSQL Global Development Group + * + * src/include/backup/walsummary.h + * + *------------------------------------------------------------------------- + */ +#ifndef WALSUMMARY_H +#define WALSUMMARY_H + +#include + +#include "access/xlogdefs.h" +#include "nodes/pg_list.h" +#include "storage/fd.h" + +typedef struct WalSummaryIO +{ + File file; + off_t filepos; +} WalSummaryIO; + +typedef struct WalSummaryFile +{ + XLogRecPtr start_lsn; + XLogRecPtr end_lsn; + TimeLineID tli; +} WalSummaryFile; + +extern List *GetWalSummaries(TimeLineID tli, XLogRecPtr start_lsn, + XLogRecPtr end_lsn); +extern List *FilterWalSummaries(List *wslist, TimeLineID tli, + XLogRecPtr start_lsn, XLogRecPtr end_lsn); +extern bool WalSummariesAreComplete(List *wslist, + XLogRecPtr start_lsn, XLogRecPtr end_lsn, + XLogRecPtr *missing_lsn); +extern File 
OpenWalSummaryFile(WalSummaryFile *ws, bool missing_ok); +extern void RemoveWalSummaryIfOlderThan(WalSummaryFile *ws, + time_t cutoff_time); + +extern int ReadWalSummary(void *wal_summary_io, void *data, int length); +extern int WriteWalSummary(void *wal_summary_io, void *data, int length); +extern void ReportWalSummaryError(void *callback_arg, char *fmt,...); + +#endif /* WALSUMMARY_H */ diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 6996073989..c21573efb6 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -12043,4 +12043,23 @@ proname => 'any_value_transfn', prorettype => 'anyelement', proargtypes => 'anyelement anyelement', prosrc => 'any_value_transfn' }, +{ oid => '8436', + descr => 'list of available WAL summary files', + proname => 'pg_available_wal_summaries', prorows => '100', + proretset => 't', provolatile => 'v', proparallel => 's', + prorettype => 'record', proargtypes => '', + proallargtypes => '{int8,pg_lsn,pg_lsn}', + proargmodes => '{o,o,o}', + proargnames => '{tli,start_lsn,end_lsn}', + prosrc => 'pg_available_wal_summaries' }, +{ oid => '8437', + descr => 'contents of a WAL sumamry file', + proname => 'pg_wal_summary_contents', prorows => '100', + proretset => 't', provolatile => 'v', proparallel => 's', + prorettype => 'record', proargtypes => 'int8 pg_lsn pg_lsn', + proallargtypes => '{int8,pg_lsn,pg_lsn,oid,oid,oid,int2,int8,bool}', + proargmodes => '{i,i,i,o,o,o,o,o,o}', + proargnames => '{tli,start_lsn,end_lsn,relfilenode,reltablespace,reldatabase,relforknumber,relblocknumber,is_limit_block}', + prosrc => 'pg_wal_summary_contents' }, + ] diff --git a/src/include/common/blkreftable.h b/src/include/common/blkreftable.h new file mode 100644 index 0000000000..22d9883dc5 --- /dev/null +++ b/src/include/common/blkreftable.h @@ -0,0 +1,120 @@ +/*------------------------------------------------------------------------- + * + * blkreftable.h + * Block reference tables. 
+ * + * A block reference table is used to keep track of which blocks have + * been modified by WAL records within a certain LSN range. + * + * For each relation fork, there is a "limit block number". All existing + * blocks greater than or equal to the limit block number must be + * considered modified; for those less than the limit block number, + * we maintain a bitmap. When a relation fork is created or dropped, + * the limit block number should be set to 0. When it's truncated, + * the limit block number should be set to the length in blocks to + * which it was truncated. + * + * Portions Copyright (c) 2010-2022, PostgreSQL Global Development Group + * + * src/include/common/blkreftable.h + * + *------------------------------------------------------------------------- + */ +#ifndef BLKREFTABLE_H +#define BLKREFTABLE_H + +#include "storage/block.h" +#include "storage/relfilelocator.h" + +/* Magic number for serialization file format. */ +#define BLOCKREFTABLE_MAGIC 0x652b137b + +struct BlockRefTable; +struct BlockRefTableEntry; +struct BlockRefTableReader; +struct BlockRefTableWriter; +typedef struct BlockRefTable BlockRefTable; +typedef struct BlockRefTableEntry BlockRefTableEntry; +typedef struct BlockRefTableReader BlockRefTableReader; +typedef struct BlockRefTableWriter BlockRefTableWriter; + +/* + * The return value of io_callback_fn should be the number of bytes read + * or written. If an error occurs, the functions should report it and + * not return. When used as a write callback, short writes should be retried + * or treated as errors, so that if the callback returns, the return value + * is always the request length. + * + * report_error_fn should not return. + */ +typedef int (*io_callback_fn) (void *callback_arg, void *data, int length); +typedef void (*report_error_fn) (void *calblack_arg, char *msg,...); + + +/* + * Functions for manipulating an entire in-memory block reference table. 
+ */
+extern BlockRefTable *CreateEmptyBlockRefTable(void);
+extern void BlockRefTableSetLimitBlock(BlockRefTable *brtab,
+									   const RelFileLocator *rlocator,
+									   ForkNumber forknum,
+									   BlockNumber limit_block);
+extern void BlockRefTableMarkBlockModified(BlockRefTable *brtab,
+										   const RelFileLocator *rlocator,
+										   ForkNumber forknum,
+										   BlockNumber blknum);
+extern void WriteBlockRefTable(BlockRefTable *brtab,
+							   io_callback_fn write_callback,
+							   void *write_callback_arg);
+
+extern BlockRefTableEntry *BlockRefTableGetEntry(BlockRefTable *brtab,
+												 const RelFileLocator *rlocator,
+												 ForkNumber forknum,
+												 BlockNumber *limit_block);
+extern int	BlockRefTableEntryGetBlocks(BlockRefTableEntry *entry,
+										BlockNumber start_blkno,
+										BlockNumber stop_blkno,
+										BlockNumber *blocks,
+										int nblocks);
+
+/*
+ * Functions for reading a block reference table incrementally from disk.
+ */
+extern BlockRefTableReader *CreateBlockRefTableReader(io_callback_fn read_callback,
+													  void *read_callback_arg,
+													  char *error_filename,
+													  report_error_fn error_callback,
+													  void *error_callback_arg);
+extern bool BlockRefTableReaderNextRelation(BlockRefTableReader *reader,
+											RelFileLocator *rlocator,
+											ForkNumber *forknum,
+											BlockNumber *limit_block);
+extern unsigned BlockRefTableReaderGetBlocks(BlockRefTableReader *reader,
+											 BlockNumber *blocks,
+											 int nblocks);
+extern void DestroyBlockRefTableReader(BlockRefTableReader *reader);
+
+/*
+ * Functions for writing a block reference table incrementally to disk.
+ *
+ * Note that entries must be written in the proper order, that is, sorted by
+ * tablespace, then database, then relfilenumber, then fork number. Caller
+ * is responsible for supplying data in the correct order. If that seems hard,
+ * use an in-memory BlockRefTable instead.
+ */ +extern BlockRefTableWriter *CreateBlockRefTableWriter(io_callback_fn write_callback, + void *write_callback_arg); +extern void BlockRefTableWriteEntry(BlockRefTableWriter *writer, + BlockRefTableEntry *entry); +extern void DestroyBlockRefTableWriter(BlockRefTableWriter *writer); + +extern BlockRefTableEntry *CreateBlockRefTableEntry(RelFileLocator rlocator, + ForkNumber forknum); +extern void BlockRefTableEntrySetLimitBlock(BlockRefTableEntry *entry, + BlockNumber limit_block); +extern void BlockRefTableEntryMarkBlockModified(BlockRefTableEntry *entry, + ForkNumber forknum, + BlockNumber blknum); +extern void BlockRefTableFreeEntry(BlockRefTableEntry *entry); + +#endif /* BLKREFTABLE_H */ diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index 14bd574fc2..898adccb25 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -338,6 +338,7 @@ typedef enum BackendType B_STARTUP, B_WAL_RECEIVER, B_WAL_SENDER, + B_WAL_SUMMARIZER, B_WAL_WRITER, } BackendType; @@ -443,6 +444,7 @@ typedef enum CheckpointerProcess, WalWriterProcess, WalReceiverProcess, + WalSummarizerProcess, NUM_AUXPROCTYPES /* Must be last! 
*/ } AuxProcType; @@ -455,6 +457,7 @@ extern PGDLLIMPORT AuxProcType MyAuxProcType; #define AmCheckpointerProcess() (MyAuxProcType == CheckpointerProcess) #define AmWalWriterProcess() (MyAuxProcType == WalWriterProcess) #define AmWalReceiverProcess() (MyAuxProcType == WalReceiverProcess) +#define AmWalSummarizerProcess() (MyAuxProcType == WalSummarizerProcess) /***************************************************************************** diff --git a/src/include/nodes/replnodes.h b/src/include/nodes/replnodes.h index 4321ba8f86..856491eecd 100644 --- a/src/include/nodes/replnodes.h +++ b/src/include/nodes/replnodes.h @@ -108,4 +108,13 @@ typedef struct TimeLineHistoryCmd TimeLineID timeline; } TimeLineHistoryCmd; +/* ---------------------- + * UPLOAD_MANIFEST command + * ---------------------- + */ +typedef struct UploadManifestCmd +{ + NodeTag type; +} UploadManifestCmd; + #endif /* REPLNODES_H */ diff --git a/src/include/postmaster/walsummarizer.h b/src/include/postmaster/walsummarizer.h new file mode 100644 index 0000000000..7584cb69a7 --- /dev/null +++ b/src/include/postmaster/walsummarizer.h @@ -0,0 +1,31 @@ +/*------------------------------------------------------------------------- + * + * walsummarizer.h + * + * Header file for background WAL summarization process. 
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *    src/include/postmaster/walsummarizer.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef WALSUMMARIZER_H
+#define WALSUMMARIZER_H
+
+#include "access/xlogdefs.h"
+
+extern int wal_summarize_mb;
+extern int wal_summarize_keep_time;
+
+extern Size WalSummarizerShmemSize(void);
+extern void WalSummarizerShmemInit(void);
+extern void WalSummarizerMain(void) pg_attribute_noreturn();
+
+extern XLogRecPtr GetOldestUnsummarizedLSN(TimeLineID *tli,
+										   bool *lsn_is_exact);
+extern void SetWalSummarizerLatch(void);
+extern XLogRecPtr WaitForWalSummarization(XLogRecPtr lsn, long timeout);
+
+#endif							/* WALSUMMARIZER_H */
diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h
index ef74f32693..ee55008082 100644
--- a/src/include/storage/proc.h
+++ b/src/include/storage/proc.h
@@ -417,11 +417,12 @@ extern PGDLLIMPORT PGPROC *PreparedXactProcs;
  * We set aside some extra PGPROC structures for auxiliary processes,
  * ie things that aren't full-fledged backends but need shmem access.
  *
- * Background writer, checkpointer, WAL writer and archiver run during normal
- * operation. Startup process and WAL receiver also consume 2 slots, but WAL
- * writer is launched only after startup has exited, so we only need 5 slots.
+ * Background writer, checkpointer, WAL writer, WAL summarizer, and archiver
+ * run during normal operation. Startup process and WAL receiver also consume
+ * 2 slots, but WAL writer is launched only after startup has exited, so we
+ * only need 6 slots.
*/ -#define NUM_AUXILIARY_PROCS 5 +#define NUM_AUXILIARY_PROCS 6 /* configurable options */ extern PGDLLIMPORT int DeadlockTimeout; diff --git a/src/include/utils/guc_tables.h b/src/include/utils/guc_tables.h index d5a0880678..7d3bc0f671 100644 --- a/src/include/utils/guc_tables.h +++ b/src/include/utils/guc_tables.h @@ -72,6 +72,7 @@ enum config_group WAL_RECOVERY, WAL_ARCHIVE_RECOVERY, WAL_RECOVERY_TARGET, + WAL_SUMMARIZATION, REPLICATION_SENDING, REPLICATION_PRIMARY, REPLICATION_STANDBY, diff --git a/src/include/utils/wait_event.h b/src/include/utils/wait_event.h index 518d3b0a1f..3f99e2eddb 100644 --- a/src/include/utils/wait_event.h +++ b/src/include/utils/wait_event.h @@ -47,6 +47,7 @@ typedef enum WAIT_EVENT_SYSLOGGER_MAIN, WAIT_EVENT_WAL_RECEIVER_MAIN, WAIT_EVENT_WAL_SENDER_MAIN, + WAIT_EVENT_WAL_SUMMARIZER_WAL, WAIT_EVENT_WAL_WRITER_MAIN } WaitEventActivity; @@ -131,6 +132,7 @@ typedef enum WAIT_EVENT_SYNC_REP, WAIT_EVENT_WAL_RECEIVER_EXIT, WAIT_EVENT_WAL_RECEIVER_WAIT_START, + WAIT_EVENT_WAL_SUMMARY_READY, WAIT_EVENT_XACT_GROUP_UPDATE } WaitEventIPC; @@ -150,7 +152,8 @@ typedef enum WAIT_EVENT_REGISTER_SYNC_REQUEST, WAIT_EVENT_SPIN_DELAY, WAIT_EVENT_VACUUM_DELAY, - WAIT_EVENT_VACUUM_TRUNCATE + WAIT_EVENT_VACUUM_TRUNCATE, + WAIT_EVENT_WAL_SUMMARIZER_ERROR } WaitEventTimeout; /* ---------- @@ -232,6 +235,8 @@ typedef enum WAIT_EVENT_WAL_INIT_SYNC, WAIT_EVENT_WAL_INIT_WRITE, WAIT_EVENT_WAL_READ, + WAIT_EVENT_WAL_SUMMARY_READ, + WAIT_EVENT_WAL_SUMMARY_WRITE, WAIT_EVENT_WAL_SYNC, WAIT_EVENT_WAL_SYNC_METHOD_ASSIGN, WAIT_EVENT_WAL_WRITE diff --git a/src/test/recovery/t/001_stream_rep.pl b/src/test/recovery/t/001_stream_rep.pl index 0c72ba0944..353db33a9f 100644 --- a/src/test/recovery/t/001_stream_rep.pl +++ b/src/test/recovery/t/001_stream_rep.pl @@ -15,6 +15,8 @@ my $node_primary = PostgreSQL::Test::Cluster->new('primary'); $node_primary->init( allows_streaming => 1, auth_extra => [ '--create-role', 'repl_role' ]); +# WAL summarization can postpone WAL 
recycling, leading to test failures +$node_primary->append_conf('postgresql.conf', "wal_summarize_mb = 0"); $node_primary->start; my $backup_name = 'my_backup'; diff --git a/src/test/recovery/t/019_replslot_limit.pl b/src/test/recovery/t/019_replslot_limit.pl index 33e50ad933..6ba5eca700 100644 --- a/src/test/recovery/t/019_replslot_limit.pl +++ b/src/test/recovery/t/019_replslot_limit.pl @@ -22,6 +22,7 @@ $node_primary->append_conf( min_wal_size = 2MB max_wal_size = 4MB log_checkpoints = yes +wal_summarize_mb = 0 )); $node_primary->start; $node_primary->safe_psql('postgres', @@ -256,6 +257,7 @@ $node_primary2->append_conf( min_wal_size = 32MB max_wal_size = 32MB log_checkpoints = yes +wal_summarize_mb = 0 )); $node_primary2->start; $node_primary2->safe_psql('postgres', @@ -310,6 +312,7 @@ $node_primary3->append_conf( max_wal_size = 2MB log_checkpoints = yes max_slot_wal_keep_size = 1MB + wal_summarize_mb = 0 )); $node_primary3->start; $node_primary3->safe_psql('postgres', diff --git a/src/test/recovery/t/035_standby_logical_decoding.pl b/src/test/recovery/t/035_standby_logical_decoding.pl index 480e6d6caa..a91437dfa7 100644 --- a/src/test/recovery/t/035_standby_logical_decoding.pl +++ b/src/test/recovery/t/035_standby_logical_decoding.pl @@ -250,6 +250,7 @@ $node_primary->append_conf( wal_level = 'logical' max_replication_slots = 4 max_wal_senders = 4 +wal_summarize_mb = 0 }); $node_primary->dump_info; $node_primary->start; diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 260854747b..48a10a5d39 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -3979,3 +3979,27 @@ yyscan_t z_stream z_streamp zic_t +BlockRefTable +BlockRefTableBuffer +BlockRefTableEntry +BlockRefTableKey +BlockRefTableReader +BlockRefTableSerializedEntry +BlockRefTableWriter +FileBackupMethod +FileChunkContext +IncrementalBackupInfo +SummarizerReadLocalXLogPrivate +UploadManifestCmd +WalSummarizerData +WalSummaryFile 
+WalSummaryIO +backup_file_entry +backup_wal_range +cb_cleanup_dir +cb_options +cb_tablespace +cb_tablespace_mapping +manifest_data +manifest_writer +rfile -- 2.37.1 (Apple Git-137.1)