From c5b0a92f4808816108bdff02e5c137280749d01c Mon Sep 17 00:00:00 2001 From: Amul Sul Date: Tue, 27 Jan 2026 15:38:34 +0530 Subject: [PATCH v15 07/11] pg_waldump: Remove the restriction on the order of archived WAL files. With previous patch, pg_waldump would stop decoding if WAL files were not in the required sequence. With this patch, decoding will now continue. Any WAL file that is out of order will be written to a temporary location, from which it will be read later. Once a temporary file has been read, it will be removed. --- doc/src/sgml/ref/pg_waldump.sgml | 19 ++- src/bin/pg_waldump/archive_waldump.c | 172 +++++++++++++++++++++++++-- src/bin/pg_waldump/pg_waldump.c | 32 ++++- src/bin/pg_waldump/pg_waldump.h | 3 + src/bin/pg_waldump/t/001_basic.pl | 3 +- 5 files changed, 209 insertions(+), 20 deletions(-) diff --git a/doc/src/sgml/ref/pg_waldump.sgml b/doc/src/sgml/ref/pg_waldump.sgml index 15fb8d13199..9bbb4bd5772 100644 --- a/doc/src/sgml/ref/pg_waldump.sgml +++ b/doc/src/sgml/ref/pg_waldump.sgml @@ -149,8 +149,12 @@ PostgreSQL documentation of PGDATA. - If a tar archive is provided, its WAL segment files must be in - sequential order; otherwise, an error will be reported. + If a tar archive is provided and its WAL segment files are not in + sequential order, those files will be written to a temporary directory + named starting with waldump_tmp. This directory will be + created inside the directory specified by the TMPDIR + environment variable if it is set; otherwise, it will be created within + the same directory as the tar archive. @@ -387,6 +391,17 @@ PostgreSQL documentation + + + TMPDIR + + + Directory in which to create temporary files when reading WAL from a + tar archive with out-of-order segment files. If not set, the temporary + directory is created within the same directory as the tar archive. + + + diff --git a/src/bin/pg_waldump/archive_waldump.c b/src/bin/pg_waldump/archive_waldump.c index 4a95b47b4da..1479efe61f5 100644 --- a/src/bin/pg_waldump/archive_waldump.c +++ b/src/bin/pg_waldump/archive_waldump.c @@ -17,6 +17,7 @@ #include #include "access/xlog_internal.h" +#include "common/file_perm.h" #include "common/hashfn.h" #include "common/logging.h" #include "fe_utils/simple_list.h" @@ -27,6 +28,9 @@ */ #define READ_CHUNK_SIZE (128 * 1024) +/* Temporary exported WAL file directory */ +char *TmpWalSegDir = NULL; + /* * Check if the start segment number is zero; this indicates a request to read * any WAL file. @@ -57,6 +61,8 @@ typedef struct ArchivedWALFile const char *fname; /* hash key: WAL segment name */ StringInfo buf; /* holds WAL bytes read from archive */ + bool spilled; /* true if the WAL data was spilled to a + * temporary file */ int read_len; /* total bytes of a WAL read from archive */ } ArchivedWALFile; @@ -84,6 +90,11 @@ static ArchivedWALFile *get_archive_wal_entry(const char *fname, XLogDumpPrivate *privateInfo, int WalSegSz); static int read_archive_file(XLogDumpPrivate *privateInfo, Size count); +static void setup_tmpwal_dir(const char *waldir); +static void cleanup_tmpwal_dir_atexit(void); + +static FILE *prepare_tmp_write(const char *fname); +static void perform_tmp_write(const char *fname, StringInfo buf, FILE *file); static astreamer *astreamer_waldump_new(XLogDumpPrivate *privateInfo); static void astreamer_waldump_content(astreamer *streamer, @@ -106,7 +117,9 @@ static const astreamer_ops astreamer_waldump_ops = { /* * Initializes the tar archive reader, creates a hash table for WAL entries, * checks for existing valid WAL segments in the archive file and retrieves the - * segment size, and sets up filters for relevant entries. + * segment size, and sets up filters for relevant entries. It also configures a + * temporary directory for out-of-order WAL data and registers an exit callback + * to clean up temporary files. */ void init_archive_reader(XLogDumpPrivate *privateInfo, const char *waldir, @@ -199,6 +212,13 @@ init_archive_reader(XLogDumpPrivate *privateInfo, const char *waldir, privateInfo->start_segno > segno || privateInfo->end_segno < segno) free_archive_wal_entry(entry->fname, privateInfo); + + /* + * Setup temporary directory to store WAL segments and set up an exit + * callback to remove it upon completion. + */ + setup_tmpwal_dir(waldir); + atexit(cleanup_tmpwal_dir_atexit); } /* @@ -365,6 +385,17 @@ free_archive_wal_entry(const char *fname, XLogDumpPrivate *privateInfo) destroyStringInfo(entry->buf); entry->buf = NULL; + /* Remove temporary file if any */ + if (entry->spilled) + { + char fpath[MAXPGPATH]; + + snprintf(fpath, MAXPGPATH, "%s/%s", TmpWalSegDir, fname); + + if (unlink(fpath) == 0) + pg_log_debug("removed file \"%s\"", fpath); + } + /* Set cur_file to NULL if it matches the entry being ignored */ if (privateInfo->cur_file == entry) privateInfo->cur_file = NULL; @@ -376,12 +407,16 @@ free_archive_wal_entry(const char *fname, XLogDumpPrivate *privateInfo) * Returns the archived WAL entry from the hash table if it exists. Otherwise, * it invokes the routine to read the archived file, which then populates the * entry in the hash table if that WAL exists in the archive. + * If the archive streamer happens to be reading a + * WAL from archive file that is not currently needed, that WAL data is written + * to a temporary file. */ static ArchivedWALFile * get_archive_wal_entry(const char *fname, XLogDumpPrivate *privateInfo, int WalSegSz) { ArchivedWALFile *entry = NULL; + FILE *write_fp = NULL; /* Search hash table */ entry = ArchivedWAL_lookup(privateInfo->archive_wal_htab, fname); @@ -395,28 +430,59 @@ get_archive_wal_entry(const char *fname, XLogDumpPrivate *privateInfo, */ while (1) { + /* + * The WAL file entry currently being processed may change during + * archive streamer execution. Therefore, maintain a local variable to + * reference the previous entry, ensuring that any remaining data in + * its buffer is successfully flushed to the temporary file before + * switching to the next WAL entry. + */ + entry = privateInfo->cur_file; + /* Fetch more data */ - if (read_archive_file(privateInfo, READ_CHUNK_SIZE) == 0) - break; /* archive file ended */ + if (entry == NULL || entry->buf->len == 0) + { + if (read_archive_file(privateInfo, READ_CHUNK_SIZE) == 0) + break; /* archive file ended */ + } /* * Archived streamer is reading a non-WAL file or an irrelevant WAL * file. */ - if (privateInfo->cur_file == NULL) + if (entry == NULL) continue; - entry = privateInfo->cur_file; - /* Found the required entry */ if (strcmp(fname, entry->fname) == 0) return entry; - /* WAL segments must be archived in order */ - pg_log_error("WAL files are not archived in sequential order"); - pg_log_error_detail("Expecting segment \"%s\" but found \"%s\".", - fname, entry->fname); - exit(1); + /* + * Archive streamer is currently reading a file that isn't the one + * asked for, but it's required in the future. It should be written to + * a temporary location for retrieval when needed. + */ + + /* Create a temporary file if one does not already exist */ + if (!entry->spilled) + { + write_fp = prepare_tmp_write(entry->fname); + entry->spilled = true; + } + + /* Flush data from the buffer to the file */ + perform_tmp_write(entry->fname, entry->buf, write_fp); + resetStringInfo(entry->buf); + + /* + * The change in the current segment entry indicates that the reading + * of this file has ended. + */ + if (entry != privateInfo->cur_file && write_fp != NULL) + { + fclose(write_fp); + write_fp = NULL; + } } /* Requested WAL segment not found */ @@ -454,7 +520,88 @@ read_archive_file(XLogDumpPrivate *privateInfo, Size count) } /* - * Create an astreamer that can read WAL from a tar file. + * Set up a temporary directory to temporarily store WAL segments. + */ +static void +setup_tmpwal_dir(const char *waldir) +{ + char *template; + + /* + * Use the directory specified by the TMPDIR environment variable. If it's + * not set, use the provided WAL directory to extract WAL file + * temporarily. + */ + template = psprintf("%s/waldump_tmp-XXXXXX", + getenv("TMPDIR") ? getenv("TMPDIR") : waldir); + TmpWalSegDir = mkdtemp(template); + + if (TmpWalSegDir == NULL) + pg_fatal("could not create directory \"%s\": %m", template); + + canonicalize_path(TmpWalSegDir); + + pg_log_debug("created directory \"%s\"", TmpWalSegDir); +} + +/* + * Remove temporary directory at exit, if any. + */ +static void +cleanup_tmpwal_dir_atexit(void) +{ + rmtree(TmpWalSegDir, true); +} + +/* + * Create an empty placeholder file and return its handle. + */ +static FILE * +prepare_tmp_write(const char *fname) +{ + char fpath[MAXPGPATH]; + FILE *file; + + snprintf(fpath, MAXPGPATH, "%s/%s", TmpWalSegDir, fname); + + /* Create an empty placeholder */ + file = fopen(fpath, PG_BINARY_W); + if (file == NULL) + pg_fatal("could not create file \"%s\": %m", fpath); + +#ifndef WIN32 + if (chmod(fpath, pg_file_create_mode)) + pg_fatal("could not set permissions on file \"%s\": %m", + fpath); +#endif + + pg_log_debug("spilling to temporary file \"%s\"", fpath); + + return file; +} + +/* + * Write buffer data to the given file handle. + */ +static void +perform_tmp_write(const char *fname, StringInfo buf, FILE *file) +{ + Assert(file); + + errno = 0; + if (buf->len > 0 && fwrite(buf->data, buf->len, 1, file) != 1) + { + /* + * If write didn't set errno, assume problem is no disk space + */ + if (errno == 0) + errno = ENOSPC; + pg_fatal("could not write to file \"%s\": %m", fname); + } +} + +/* + * Create an astreamer that can read WAL from tar file. */ static astreamer * astreamer_waldump_new(XLogDumpPrivate *privateInfo) @@ -538,6 +685,7 @@ astreamer_waldump_content(astreamer *streamer, astreamer_member *member, } entry->buf = makeStringInfo(); + entry->spilled = false; entry->read_len = 0; privateInfo->cur_file = entry; } diff --git a/src/bin/pg_waldump/pg_waldump.c b/src/bin/pg_waldump/pg_waldump.c index a18c56a7322..4b438b53ead 100644 --- a/src/bin/pg_waldump/pg_waldump.c +++ b/src/bin/pg_waldump/pg_waldump.c @@ -478,10 +478,14 @@ TarWALDumpReadPage(XLogReaderState *state, XLogRecPtr targetPagePtr, int reqLen, return -1; /* - * If the target page is in a different segment, free the buffer space - * occupied by the previous segment data. Since pg_waldump never requests - * the same WAL bytes twice, moving to a new segment implies the previous - * buffer's data and that segment will not be needed again. + * If the target page is in a different segment, free the buffer and/or + * temporary file disk space occupied by the previous segment's data. + * Since pg_waldump never requests the same WAL bytes twice, moving to a + * new segment implies the previous buffer's data and that segment will + * not be needed again. + * + * Afterward, check for the next required WAL segment's physical existence + * in the temporary directory first before invoking the archive streamer. */ curSegNo = state->seg.ws_segno; if (!XLByteInSeg(targetPagePtr, curSegNo, WalSegSz)) @@ -497,6 +501,13 @@ TarWALDumpReadPage(XLogReaderState *state, XLogRecPtr targetPagePtr, int reqLen, state->seg.ws_tli = private->timeline; state->seg.ws_segno = nextSegNo; + /* Close the WAL segment file if it is currently open */ + if (state->seg.ws_file >= 0) + { + close(state->seg.ws_file); + state->seg.ws_file = -1; + } + /* * If in pre-reading mode (prior to actual decoding), do not delete any * entries that might be requested again once the decoding loop starts. @@ -507,9 +518,20 @@ TarWALDumpReadPage(XLogReaderState *state, XLogRecPtr targetPagePtr, int reqLen, XLogFileName(fname, state->seg.ws_tli, curSegNo, WalSegSz); free_archive_wal_entry(fname, private); } + + /* + * If the next segment exists, open it and continue reading from there + */ + XLogFileName(fname, state->seg.ws_tli, nextSegNo, WalSegSz); + state->seg.ws_file = open_file_in_directory(TmpWalSegDir, fname); } - /* Read the WAL page from the archive streamer */ + /* Continue reading from the open WAL segment, if any */ + if (state->seg.ws_file >= 0) + return WALDumpReadPage(state, targetPagePtr, count, targetPtr, + readBuff); + + /* Otherwise, read the WAL page from the archive streamer */ return read_archive_wal_page(private, targetPagePtr, count, readBuff, WalSegSz); } diff --git a/src/bin/pg_waldump/pg_waldump.h b/src/bin/pg_waldump/pg_waldump.h index 54d54a8a718..6c242b7fcbc 100644 --- a/src/bin/pg_waldump/pg_waldump.h +++ b/src/bin/pg_waldump/pg_waldump.h @@ -18,6 +18,9 @@ struct ArchivedWALFile; struct ArchivedWAL_hash; +/* Temporary directory */ +extern char *TmpWalSegDir; + /* Contains the necessary information to drive WAL decoding */ typedef struct XLogDumpPrivate { diff --git a/src/bin/pg_waldump/t/001_basic.pl b/src/bin/pg_waldump/t/001_basic.pl index 6f8ce319841..6960bd46ba4 100644 --- a/src/bin/pg_waldump/t/001_basic.pl +++ b/src/bin/pg_waldump/t/001_basic.pl @@ -7,6 +7,7 @@ use Cwd; use PostgreSQL::Test::Cluster; use PostgreSQL::Test::Utils; use Test::More; +use List::Util qw(shuffle); my $tar = $ENV{TAR}; @@ -312,7 +313,7 @@ sub generate_archive } closedir $dh; - @files = sort @files; + @files = shuffle @files; # move into the WAL directory before archiving files my $cwd = getcwd; -- 2.47.1