Where are we on this?
---------------------------------------------------------------------------
On Mon, Jan 16, 2012 at 01:52:35AM +0000, Simon Riggs wrote:
> On Fri, Dec 16, 2011 at 3:01 PM, Simon Riggs <simon@2ndquadrant.com> wrote:
> > archive_command and restore_command describe how to ship WAL files
> > to/from an archive.
> >
> > When there is nothing to ship, we delay sending WAL files. When no WAL
> > files are sent, the standby has no information at all.
> >
> > To provide some form of keepalive on quiet systems the
> > archive_keepalive_command provides a generic hook to implement
> > keepalives. This is implemented as a separate command to avoid storing
> > keepalive messages in the archive, or at least allow overwrites using
> > a single filename like "keepalive".
> >
> > Examples
> > archive_keepalive_command = 'arch_cmd keepalive' # sends a file
> > called "keepalive" to archive, overwrites allowed
> > archive_keepalive_command = 'arch_cmd %f.%t.keepalive' # sends a file
> > like 000000010000000AB00000000FE.20111216143517.keepalive
> >
> > If there is no WAL file to send, then we send a keepalive file
> > instead. A keepalive is a small file that contains the same contents as a
> > streaming keepalive message (re: other patch on that).
> >
> > If no WAL file is available and we are attempting to restore in
> > standby_mode, then we execute restore_keepalive_command to see if a
> > keepalive file is available. It checks for a file in the specific
> > keepalive format and then uses that to update last received info from
> > the master.
> >
> > e.g.
> > restore_keepalive_command = 'restore_cmd keepalive' # gets a file
> > called "keepalive" from the archive, overwrites allowed
>
> Patch.
>
> --
> Simon Riggs http://www.2ndQuadrant.com/
> PostgreSQL Development, 24x7 Support, Training & Services
> diff --git a/src/backend/access/transam/recovery.conf.sample b/src/backend/access/transam/recovery.conf.sample
> index 5acfa57..fab288c 100644
> --- a/src/backend/access/transam/recovery.conf.sample
> +++ b/src/backend/access/transam/recovery.conf.sample
> @@ -43,6 +43,13 @@
> #
> #restore_command = '' # e.g. 'cp /mnt/server/archivedir/%f %p'
> #
> +# restore_keepalive_command
> +#
> +# specifies an optional shell command to download keepalive files
> +# e.g. archive_keepalive_command = 'cp -f %p $ARCHIVE/keepalive </dev/null'
> +# e.g. restore_keepalive_command = 'cp $ARCHIVE/keepalive %p'
> +#
> +#restore_keepalive_command = ''
> #
> # archive_cleanup_command
> #
> diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
> index ce659ec..2729141 100644
> --- a/src/backend/access/transam/xlog.c
> +++ b/src/backend/access/transam/xlog.c
> @@ -73,8 +73,10 @@ int CheckPointSegments = 3;
> int wal_keep_segments = 0;
> int XLOGbuffers = -1;
> int XLogArchiveTimeout = 0;
> +int XLogArchiveKeepaliveTimeout = 10; /* XXX set to 60 before commit */
> bool XLogArchiveMode = false;
> char *XLogArchiveCommand = NULL;
> +char *XLogArchiveKeepaliveCommand = NULL;
> bool EnableHotStandby = false;
> bool fullPageWrites = true;
> bool log_checkpoints = false;
> @@ -188,6 +190,7 @@ static bool restoredFromArchive = false;
>
> /* options taken from recovery.conf for archive recovery */
> static char *recoveryRestoreCommand = NULL;
> +static char *recoveryRestoreKeepaliveCommand = NULL;
> static char *recoveryEndCommand = NULL;
> static char *archiveCleanupCommand = NULL;
> static RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET;
> @@ -634,6 +637,7 @@ static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
> static void XLogFileClose(void);
> static bool RestoreArchivedFile(char *path, const char *xlogfname,
> const char *recovername, off_t expectedSize);
> +static void RestoreKeepaliveFile(void);
> static void ExecuteRecoveryCommand(char *command, char *commandName,
> bool failOnerror);
> static void PreallocXlogFiles(XLogRecPtr endptr);
> @@ -2718,7 +2722,10 @@ XLogFileRead(uint32 log, uint32 seg, int emode, TimeLineID tli,
> "RECOVERYXLOG",
> XLogSegSize);
> if (!restoredFromArchive)
> + {
> + RestoreKeepaliveFile();
> return -1;
> + }
> break;
>
> case XLOG_FROM_PG_XLOG:
> @@ -3179,6 +3186,192 @@ not_available:
> return false;
> }
>
> +static void
> +RestoreKeepaliveFile(void)
> +{
> + char keepalivepath[MAXPGPATH];
> + char keepaliveRestoreCmd[MAXPGPATH];
> + char *dp;
> + char *endp;
> + const char *sp;
> + int rc;
> + bool signaled;
> + struct stat stat_buf;
> +
> + /* In standby mode, restore_command might not be supplied */
> + if (recoveryRestoreKeepaliveCommand == NULL)
> + return;
> +
> + snprintf(keepalivepath, MAXPGPATH, XLOGDIR "/archive_status/KEEPALIVE");
> +
> + /*
> + * Make sure there is no existing file in keepalivepath
> + */
> + if (stat(keepalivepath, &stat_buf) == 0)
> + {
> + if (unlink(keepalivepath) != 0)
> + ereport(FATAL,
> + (errcode_for_file_access(),
> + errmsg("could not remove file \"%s\": %m",
> + keepalivepath)));
> + }
> +
> + /*
> + * construct the command to be executed
> + */
> + dp = keepaliveRestoreCmd;
> + endp = keepaliveRestoreCmd + MAXPGPATH - 1;
> + *endp = '\0';
> +
> + for (sp = recoveryRestoreKeepaliveCommand; *sp; sp++)
> + {
> + if (*sp == '%')
> + {
> + switch (sp[1])
> + {
> + case 'p':
> + /* %p: relative path of target file */
> + sp++;
> + StrNCpy(dp, keepalivepath, endp - dp);
> + make_native_path(dp);
> + dp += strlen(dp);
> + break;
> + case '%':
> + /* convert %% to a single % */
> + sp++;
> + if (dp < endp)
> + *dp++ = *sp;
> + break;
> + default:
> + /* otherwise treat the % as not special */
> + if (dp < endp)
> + *dp++ = *sp;
> + break;
> + }
> + }
> + else
> + {
> + if (dp < endp)
> + *dp++ = *sp;
> + }
> + }
> + *dp = '\0';
> +
> + ereport(DEBUG2,
> + (errmsg_internal("executing restore keepalive command \"%s\"",
> + keepaliveRestoreCmd)));
> +
> + /*
> + * Check signals before restore command and reset afterwards.
> + */
> + PreRestoreCommand();
> +
> + /*
> + * Copy keepalive from archival storage to archive_status dir
> + */
> + rc = system(keepaliveRestoreCmd);
> +
> + PostRestoreCommand();
> +
> + if (rc == 0)
> + {
> + /*
> + * command apparently succeeded, but let's check the file is there
> + */
> + if (stat(keepalivepath, &stat_buf) == 0)
> + {
> + char kptime[15];
> + char kptimezone[4];
> + char *kdata;
> + char ch;
> + int r;
> + FILE *fd;
> +
> + fd = AllocateFile(keepalivepath, "r");
> + if (!fd)
> + {
> + ereport(ERROR,
> + (errcode_for_file_access(),
> + errmsg("could not read file \"%s\": %m",
> + keepalivepath)));
> + }
> + kdata = palloc(stat_buf.st_size + 1);
> + r = fread(kdata, stat_buf.st_size, 1, fd);
> + kdata[stat_buf.st_size] = '\0';
> +
> + /*
> + * Close and remove the keepalive file
> + */
> + if (r != 1 || ferror(fd) || FreeFile(fd))
> + ereport(ERROR,
> + (errcode_for_file_access(),
> + errmsg("could not read file \"%s\": %m",
> + keepalivepath)));
> +
> + /*
> + * Parse the keepalive file
> + */
> + if (sscanf(kdata, "KEEPALIVE TIME: %14s%3s%c",
> + kptime, kptimezone, &ch) != 3 || ch != '\n')
> + ereport(ERROR,
> + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
> + errmsg("invalid data in file \"%s\"", keepalivepath)));
> + kptime[14] = '\0';
> + kptimezone[3] = '\0';
> +
> + ereport(DEBUG2,
> + (errmsg("restored keepalive from archive %s%s", kptime, kptimezone)));
> +
> + XLogReceiptSource = XLOG_FROM_ARCHIVE;
> + XLogReceiptTime = GetCurrentTimestamp();
> + SetCurrentChunkStartTime(XLogReceiptTime);
> +
> + if (unlink(keepalivepath) != 0)
> + ereport(ERROR,
> + (errcode_for_file_access(),
> + errmsg("could not remove file \"%s\": %m",
> + keepalivepath)));
> + return;
> + }
> + }
> +
> + /*
> + * Remember, we rollforward UNTIL the restore fails so failure here is
> + * just part of the process... that makes it difficult to determine
> + * whether the restore failed because there isn't an archive to restore,
> + * or because the administrator has specified the restore program
> + * incorrectly. We have to assume the former.
> + *
> + * However, if the failure was due to any sort of signal, it's best to
> + * punt and abort recovery. (If we "return false" here, upper levels will
> + * assume that recovery is complete and start up the database!) It's
> + * essential to abort on child SIGINT and SIGQUIT, because per spec
> + * system() ignores SIGINT and SIGQUIT while waiting; if we see one of
> + * those it's a good bet we should have gotten it too.
> + *
> + * On SIGTERM, assume we have received a fast shutdown request, and exit
> + * cleanly. It's pure chance whether we receive the SIGTERM first, or the
> + * child process. If we receive it first, the signal handler will call
> + * proc_exit, otherwise we do it here. If we or the child process received
> + * SIGTERM for any other reason than a fast shutdown request, postmaster
> + * will perform an immediate shutdown when it sees us exiting
> + * unexpectedly.
> + *
> + * Per the Single Unix Spec, shells report exit status > 128 when a called
> + * command died on a signal. Also, 126 and 127 are used to report
> + * problems such as an unfindable command; treat those as fatal errors
> + * too.
> + */
> + if (WIFSIGNALED(rc) && WTERMSIG(rc) == SIGTERM)
> + proc_exit(1);
> +
> + signaled = WIFSIGNALED(rc) || WEXITSTATUS(rc) > 125;
> +
> + ereport(signaled ? FATAL : DEBUG2,
> + (errmsg("could not restore keepalive file from archive: return code %d",
> + rc)));
> +}
> +
> /*
> * Attempt to execute an external shell command during recovery.
> *
> @@ -5304,6 +5497,13 @@ readRecoveryCommandFile(void)
> (errmsg_internal("restore_command = '%s'",
> recoveryRestoreCommand)));
> }
> + else if (strcmp(item->name, "restore_keepalive_command") == 0)
> + {
> + recoveryRestoreKeepaliveCommand = pstrdup(item->value);
> + ereport(DEBUG2,
> + (errmsg_internal("restore_keepalive_command = '%s'",
> + recoveryRestoreKeepaliveCommand)));
> + }
> else if (strcmp(item->name, "recovery_end_command") == 0)
> {
> recoveryEndCommand = pstrdup(item->value);
> @@ -10102,3 +10302,52 @@ WALWriterLatch(void)
> {
> return &XLogCtl->WALWriterLatch;
> }
> +
> +/*
> + * Write a keepalive and return the values of path and filename
> + */
> +void
> +XLogWriteKeepaliveFile(void)
> +{
> + char keepalivepath[MAXPGPATH];
> + char xlogfname[MAXFNAMELEN];
> + XLogRecPtr lastFlushRecPtr = GetFlushRecPtr();
> + pg_time_t stamp_time;
> + char strfbuf[128];
> + uint32 log;
> + uint32 seg;
> + FILE *fd;
> +
> + XLByteToSeg(lastFlushRecPtr, log, seg);
> + XLogFileName(xlogfname, ThisTimeLineID, log, seg);
> +
> + /* Use the log timezone here, not the session timezone */
> + stamp_time = (pg_time_t) time(NULL);
> + pg_strftime(strfbuf, sizeof(strfbuf),
> + "%Y%m%d%H%M%S%Z",
> + pg_localtime(&stamp_time, log_timezone));
> +
> + KeepaliveFilePath(keepalivepath, xlogfname, strfbuf);
> +
> + elog(DEBUG4, "keepalive %s", keepalivepath);
> +
> + fd = AllocateFile(keepalivepath, "w");
> + if (fd == NULL)
> + {
> + ereport(LOG,
> + (errcode_for_file_access(),
> + errmsg("could not create archive keepalive file \"%s\": %m",
> + keepalivepath)));
> + return;
> + }
> + fprintf(fd, "KEEPALIVE TIME: %s\n", strfbuf);
> + if (fflush(fd) || ferror(fd) || FreeFile(fd))
> + ereport(ERROR,
> + (errcode_for_file_access(),
> + errmsg("could not write file \"%s\": %m",
> + keepalivepath)));
> +
> + /* Notify archiver that it's got something to do */
> + if (IsUnderPostmaster)
> + SendPostmasterSignal(PMSIGNAL_WAKEN_ARCHIVER);
> +}
> diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c
> index 0b792d2..29882b1 100644
> --- a/src/backend/postmaster/checkpointer.c
> +++ b/src/backend/postmaster/checkpointer.c
> @@ -164,6 +164,7 @@ static double ckpt_cached_elapsed;
>
> static pg_time_t last_checkpoint_time;
> static pg_time_t last_xlog_switch_time;
> +static pg_time_t last_xlog_keepalive_time;
>
> /* Prototypes for private functions */
>
> @@ -241,7 +242,7 @@ CheckpointerMain(void)
> /*
> * Initialize so that first time-driven event happens at the correct time.
> */
> - last_checkpoint_time = last_xlog_switch_time = (pg_time_t) time(NULL);
> + last_xlog_keepalive_time = last_checkpoint_time = last_xlog_switch_time = (pg_time_t) time(NULL);
>
> /*
> * Create a resource owner to keep track of our resources (currently only
> @@ -546,6 +547,7 @@ CheckpointerMain(void)
>
> /*
> * CheckArchiveTimeout -- check for archive_timeout and switch xlog files
> + * or write keepalive files
> *
> * This will switch to a new WAL file and force an archive file write
> * if any activity is recorded in the current WAL file, including just
> @@ -556,47 +558,83 @@ CheckArchiveTimeout(void)
> {
> pg_time_t now;
> pg_time_t last_time;
> + bool switched = false;
>
> - if (XLogArchiveTimeout <= 0 || RecoveryInProgress())
> + if (RecoveryInProgress())
> return;
>
> now = (pg_time_t) time(NULL);
>
> + if (XLogArchiveTimeout > 0)
> + {
> + /* First we do a quick check using possibly-stale local state. */
> + if ((int) (now - last_xlog_switch_time) >= XLogArchiveTimeout)
> + {
> + /*
> + * Update local state ... note that last_xlog_switch_time is the last time
> + * a switch was performed *or requested*.
> + */
> + last_time = GetLastSegSwitchTime();
> +
> + last_xlog_switch_time = Max(last_xlog_switch_time, last_time);
> +
> + /* Now we can do the real check */
> + if ((int) (now - last_xlog_switch_time) >= XLogArchiveTimeout)
> + {
> + XLogRecPtr switchpoint;
> +
> + /* OK, it's time to switch */
> + switchpoint = RequestXLogSwitch();
> +
> + /*
> + * If the returned pointer points exactly to a segment boundary,
> + * assume nothing happened.
> + */
> + if ((switchpoint.xrecoff % XLogSegSize) != 0)
> + ereport(DEBUG1,
> + (errmsg("transaction log switch forced (archive_timeout=%d)",
> + XLogArchiveTimeout)));
> +
> + /*
> + * Update state in any case, so we don't retry constantly when the
> + * system is idle.
> + */
> + last_xlog_switch_time = now;
> + switched = true;
> + }
> + }
> + }
> +
> + if (switched || !XLogArchiveKeepaliveCommandSet())
> + return;
> +
> /* First we do a quick check using possibly-stale local state. */
> - if ((int) (now - last_xlog_switch_time) < XLogArchiveTimeout)
> + if ((int) (now - last_xlog_switch_time) < XLogArchiveKeepaliveTimeout)
> return;
>
> /*
> - * Update local state ... note that last_xlog_switch_time is the last time
> - * a switch was performed *or requested*.
> + * Update local state if we didn't do it already.
> */
> - last_time = GetLastSegSwitchTime();
> -
> - last_xlog_switch_time = Max(last_xlog_switch_time, last_time);
> + if (XLogArchiveTimeout <= 0)
> + last_time = GetLastSegSwitchTime();
>
> /* Now we can do the real check */
> - if ((int) (now - last_xlog_switch_time) >= XLogArchiveTimeout)
> - {
> - XLogRecPtr switchpoint;
> + if ((int) (now - last_xlog_switch_time) < XLogArchiveKeepaliveTimeout)
> + return;
>
> - /* OK, it's time to switch */
> - switchpoint = RequestXLogSwitch();
> + if ((int) (now - last_xlog_keepalive_time) < XLogArchiveKeepaliveTimeout)
> + return;
>
> - /*
> - * If the returned pointer points exactly to a segment boundary,
> - * assume nothing happened.
> - */
> - if ((switchpoint.xrecoff % XLogSegSize) != 0)
> - ereport(DEBUG1,
> - (errmsg("transaction log switch forced (archive_timeout=%d)",
> - XLogArchiveTimeout)));
> + /*
> + * Write a keepalive file for archive_keepalive_command
> + */
> + XLogWriteKeepaliveFile();
>
> - /*
> - * Update state in any case, so we don't retry constantly when the
> - * system is idle.
> - */
> - last_xlog_switch_time = now;
> - }
> + /*
> + * We don't log a message to say keepalive sent
> + */
> +
> + last_xlog_keepalive_time = now;
> }
>
> /*
> diff --git a/src/backend/postmaster/pgarch.c b/src/backend/postmaster/pgarch.c
> index 37fc735..e8c19bb 100644
> --- a/src/backend/postmaster/pgarch.c
> +++ b/src/backend/postmaster/pgarch.c
> @@ -51,7 +51,8 @@
> * Timer definitions.
> * ----------
> */
> -#define PGARCH_AUTOWAKE_INTERVAL 60 /* How often to force a poll of the
> +/* XXX change only for testing */
> +#define PGARCH_AUTOWAKE_INTERVAL 10 /* How often to force a poll of the
> * archive status directory; in
> * seconds. */
> #define PGARCH_RESTART_INTERVAL 10 /* How often to attempt to restart a
> @@ -108,10 +109,14 @@ static void ArchSigTermHandler(SIGNAL_ARGS);
> static void pgarch_waken(SIGNAL_ARGS);
> static void pgarch_waken_stop(SIGNAL_ARGS);
> static void pgarch_MainLoop(void);
> -static void pgarch_ArchiverCopyLoop(void);
> +static void pgarch_ArchiverCopyLoop(bool timedout);
> static bool pgarch_archiveXlog(char *xlog);
> +static void pgarch_archiveKeepalive(void);
> static bool pgarch_readyXlog(char *xlog);
> static void pgarch_archiveDone(char *xlog);
> +static void constructArchiveCommand(char *archcmd, const char *archcmdtemplate,
> + const char *filepath, const char *filename);
> +static bool executeArchiveCommand(const char *archcmd, const char *description);
>
>
> /* ------------------------------------------------------------
> @@ -351,6 +356,7 @@ pgarch_MainLoop(void)
> {
> pg_time_t last_copy_time = 0;
> bool time_to_stop;
> + bool timedout = false;
>
> /*
> * We run the copy loop immediately upon entry, in case there are
> @@ -401,7 +407,8 @@ pgarch_MainLoop(void)
> if (wakened || time_to_stop)
> {
> wakened = false;
> - pgarch_ArchiverCopyLoop();
> + pgarch_ArchiverCopyLoop(timedout);
> + timedout = false;
> last_copy_time = time(NULL);
> }
>
> @@ -424,7 +431,10 @@ pgarch_MainLoop(void)
> WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
> timeout * 1000L);
> if (rc & WL_TIMEOUT)
> + {
> + timedout = true;
> wakened = true;
> + }
> }
> else
> wakened = true;
> @@ -444,9 +454,10 @@ pgarch_MainLoop(void)
> * Archives all outstanding xlogs then returns
> */
> static void
> -pgarch_ArchiverCopyLoop(void)
> +pgarch_ArchiverCopyLoop(bool timedout)
> {
> char xlog[MAX_XFN_CHARS + 1];
> + bool sentfile = false;
>
> /*
> * loop through all xlogs with archive_status of .ready and archive
> @@ -486,6 +497,8 @@ pgarch_ArchiverCopyLoop(void)
> {
> ereport(WARNING,
> (errmsg("archive_mode enabled, yet archive_command is not set")));
> + if (!sentfile && timedout)
> + pgarch_archiveKeepalive();
> return;
> }
>
> @@ -493,6 +506,7 @@ pgarch_ArchiverCopyLoop(void)
> {
> /* successful */
> pgarch_archiveDone(xlog);
> + sentfile = true;
> break; /* out of inner retry loop */
> }
> else
> @@ -508,151 +522,117 @@ pgarch_ArchiverCopyLoop(void)
> }
> }
> }
> +
> + if (!sentfile && timedout)
> + pgarch_archiveKeepalive();
> }
>
> /*
> - * pgarch_archiveXlog
> - *
> - * Invokes system(3) to copy one archive file to wherever it should go
> - *
> - * Returns true if successful
> + * pgarch_archiveXlog - executes archive_command for latest WAL file
> */
> static bool
> pgarch_archiveXlog(char *xlog)
> {
> char xlogarchcmd[MAXPGPATH];
> - char pathname[MAXPGPATH];
> char activitymsg[MAXFNAMELEN + 16];
> - char *dp;
> - char *endp;
> - const char *sp;
> - int rc;
> + char xlogfilepath[MAXPGPATH];
> +
> + snprintf(xlogfilepath, MAXPGPATH, XLOGDIR "/%s", xlog);
> +
> + constructArchiveCommand(xlogarchcmd, XLogArchiveCommand,
> + xlogfilepath, xlog);
> +
> + /* Report archive activity in PS display */
> + snprintf(activitymsg, sizeof(activitymsg), "archiving %s", xlog);
> + set_ps_display(activitymsg, false);
> +
> + if (!executeArchiveCommand(xlogarchcmd, "archive command"))
> + return false;
> +
> + ereport(DEBUG1,
> + (errmsg("archived transaction log file \"%s\"", xlog)));
> +
> + snprintf(activitymsg, sizeof(activitymsg), "last was %s", xlog);
> + set_ps_display(activitymsg, false);
> +
> + return true;
> +}
> +
> +/*
> + * pgarch_archiveKeepalive - executes archive_keepalive_command
> + */
> +static void
> +pgarch_archiveKeepalive(void)
> +{
> +#define LENGTH_DOT_KEEPALIVE 10
> + char keepalivearchcmd[MAXPGPATH];
> + char keepalivepath[MAXPGPATH];
> + char XLogArchiveStatusDir[MAXPGPATH];
> + char keepalive[MAX_XFN_CHARS + LENGTH_DOT_KEEPALIVE + 1];
> + DIR *rldir;
> + struct dirent *rlde;
> + bool found = false;
>
> - snprintf(pathname, MAXPGPATH, XLOGDIR "/%s", xlog);
> + if (!XLogArchiveKeepaliveCommandSet())
> + return;
>
> /*
> - * construct the command to be executed
> + * open xlog status directory and read through list of keepalives,
> + * looking for latest file. It is possible to optimise this code
> + * though only a single file is expected on the vast majority
> + * of calls, so....
> */
> - dp = xlogarchcmd;
> - endp = xlogarchcmd + MAXPGPATH - 1;
> - *endp = '\0';
>
> - for (sp = XLogArchiveCommand; *sp; sp++)
> + snprintf(XLogArchiveStatusDir, MAXPGPATH, XLOGDIR "/archive_status");
> + rldir = AllocateDir(XLogArchiveStatusDir);
> + if (rldir == NULL)
> + ereport(ERROR,
> + (errcode_for_file_access(),
> + errmsg("could not open archive status directory \"%s\": %m",
> + XLogArchiveStatusDir)));
> +
> + while ((rlde = ReadDir(rldir, XLogArchiveStatusDir)) != NULL)
> {
> - if (*sp == '%')
> + int basenamelen = (int) strlen(rlde->d_name) - LENGTH_DOT_KEEPALIVE;
> +
> + if (strcmp(rlde->d_name + basenamelen, ".keepalive") == 0)
> {
> - switch (sp[1])
> + if (!found)
> {
> - case 'p':
> - /* %p: relative path of source file */
> - sp++;
> - strlcpy(dp, pathname, endp - dp);
> - make_native_path(dp);
> - dp += strlen(dp);
> - break;
> - case 'f':
> - /* %f: filename of source file */
> - sp++;
> - strlcpy(dp, xlog, endp - dp);
> - dp += strlen(dp);
> - break;
> - case '%':
> - /* convert %% to a single % */
> - sp++;
> - if (dp < endp)
> - *dp++ = *sp;
> - break;
> - default:
> - /* otherwise treat the % as not special */
> - if (dp < endp)
> - *dp++ = *sp;
> - break;
> + strcpy(keepalive, rlde->d_name);
> + found = true;
> + }
> + else
> + {
> + if (strcmp(rlde->d_name, keepalive) > 0)
> + {
> + sprintf(keepalivepath, "%s/%s", XLogArchiveStatusDir, keepalive);
> + unlink(keepalivepath);
> + strcpy(keepalive, rlde->d_name);
> + }
> + else
> + {
> + sprintf(keepalivepath, "%s/%s", XLogArchiveStatusDir, rlde->d_name);
> + unlink(keepalivepath);
> + }
> }
> - }
> - else
> - {
> - if (dp < endp)
> - *dp++ = *sp;
> }
> }
> - *dp = '\0';
> -
> - ereport(DEBUG3,
> - (errmsg_internal("executing archive command \"%s\"",
> - xlogarchcmd)));
> -
> - /* Report archive activity in PS display */
> - snprintf(activitymsg, sizeof(activitymsg), "archiving %s", xlog);
> - set_ps_display(activitymsg, false);
> + FreeDir(rldir);
>
> - rc = system(xlogarchcmd);
> - if (rc != 0)
> - {
> - /*
> - * If either the shell itself, or a called command, died on a signal,
> - * abort the archiver. We do this because system() ignores SIGINT and
> - * SIGQUIT while waiting; so a signal is very likely something that
> - * should have interrupted us too. If we overreact it's no big deal,
> - * the postmaster will just start the archiver again.
> - *
> - * Per the Single Unix Spec, shells report exit status > 128 when a
> - * called command died on a signal.
> - */
> - int lev = (WIFSIGNALED(rc) || WEXITSTATUS(rc) > 128) ? FATAL : LOG;
> + if (!found)
> + return;
>
> - if (WIFEXITED(rc))
> - {
> - ereport(lev,
> - (errmsg("archive command failed with exit code %d",
> - WEXITSTATUS(rc)),
> - errdetail("The failed archive command was: %s",
> - xlogarchcmd)));
> - }
> - else if (WIFSIGNALED(rc))
> - {
> -#if defined(WIN32)
> - ereport(lev,
> - (errmsg("archive command was terminated by exception 0x%X",
> - WTERMSIG(rc)),
> - errhint("See C include file \"ntstatus.h\" for a description of the hexadecimal value."),
> - errdetail("The failed archive command was: %s",
> - xlogarchcmd)));
> -#elif defined(HAVE_DECL_SYS_SIGLIST) && HAVE_DECL_SYS_SIGLIST
> - ereport(lev,
> - (errmsg("archive command was terminated by signal %d: %s",
> - WTERMSIG(rc),
> - WTERMSIG(rc) < NSIG ? sys_siglist[WTERMSIG(rc)] : "(unknown)"),
> - errdetail("The failed archive command was: %s",
> - xlogarchcmd)));
> -#else
> - ereport(lev,
> - (errmsg("archive command was terminated by signal %d",
> - WTERMSIG(rc)),
> - errdetail("The failed archive command was: %s",
> - xlogarchcmd)));
> -#endif
> - }
> - else
> - {
> - ereport(lev,
> - (errmsg("archive command exited with unrecognized status %d",
> - rc),
> - errdetail("The failed archive command was: %s",
> - xlogarchcmd)));
> - }
> + sprintf(keepalivepath, "%s/%s", XLogArchiveStatusDir, keepalive);
> + constructArchiveCommand(keepalivearchcmd, XLogArchiveKeepaliveCommand,
> + keepalivepath, keepalive);
> + if (!executeArchiveCommand(keepalivearchcmd, "archive keepalive command"))
> + return;
>
> - snprintf(activitymsg, sizeof(activitymsg), "failed on %s", xlog);
> - set_ps_display(activitymsg, false);
> + unlink(keepalivepath);
>
> - return false;
> - }
> ereport(DEBUG1,
> - (errmsg("archived transaction log file \"%s\"", xlog)));
> -
> - snprintf(activitymsg, sizeof(activitymsg), "last was %s", xlog);
> - set_ps_display(activitymsg, false);
> -
> - return true;
> + (errmsg("archived keepalive file \"%s\"", keepalive)));
> }
>
> /*
> @@ -753,3 +733,138 @@ pgarch_archiveDone(char *xlog)
> errmsg("could not rename file \"%s\" to \"%s\": %m",
> rlogready, rlogdone)));
> }
> +
> +/*
> + * Constructs the executable archive command from a template for a given file
> + */
> +static void
> +constructArchiveCommand(char *archcmd, const char *archcmdtemplate,
> + const char *filepath, const char *filename)
> +{
> + char *dp;
> + char *endp;
> + const char *sp;
> +
> + /*
> + * construct the command to be executed
> + */
> + dp = archcmd;
> + endp = archcmd + MAXPGPATH - 1;
> + *endp = '\0';
> +
> + for (sp = archcmdtemplate; *sp; sp++)
> + {
> + if (*sp == '%')
> + {
> + switch (sp[1])
> + {
> + case 'p':
> + /* %p: relative path of source file */
> + sp++;
> + strlcpy(dp, filepath, endp - dp);
> + make_native_path(dp);
> + dp += strlen(dp);
> + break;
> + case 'f':
> + /* %f: filename of source file */
> + sp++;
> + strlcpy(dp, filename, endp - dp);
> + dp += strlen(dp);
> + break;
> + case '%':
> + /* convert %% to a single % */
> + sp++;
> + if (dp < endp)
> + *dp++ = *sp;
> + break;
> + default:
> + /* otherwise treat the % as not special */
> + if (dp < endp)
> + *dp++ = *sp;
> + break;
> + }
> + }
> + else
> + {
> + if (dp < endp)
> + *dp++ = *sp;
> + }
> + }
> + *dp = '\0';
> +}
> +
> +/*
> + * Invokes system(3) to execute the supplied archive command
> + *
> + * Returns true if successful
> + */
> +static bool
> +executeArchiveCommand(const char *archcmd, const char *description)
> +{
> + int rc;
> +
> + ereport(DEBUG3,
> + (errmsg_internal("executing %s \"%s\"",
> + description, archcmd)));
> +
> + rc = system(archcmd);
> + if (rc != 0)
> + {
> + /*
> + * If either the shell itself, or a called command, died on a signal,
> + * abort the archiver. We do this because system() ignores SIGINT and
> + * SIGQUIT while waiting; so a signal is very likely something that
> + * should have interrupted us too. If we overreact it's no big deal,
> + * the postmaster will just start the archiver again.
> + *
> + * Per the Single Unix Spec, shells report exit status > 128 when a
> + * called command died on a signal.
> + */
> + int lev = (WIFSIGNALED(rc) || WEXITSTATUS(rc) > 128) ? FATAL : LOG;
> +
> + if (WIFEXITED(rc))
> + {
> + ereport(lev,
> + (errmsg("%s failed with exit code %d",
> + description, WEXITSTATUS(rc)),
> + errdetail("The failed archive command was: %s",
> + archcmd)));
> + }
> + else if (WIFSIGNALED(rc))
> + {
> +#if defined(WIN32)
> + ereport(lev,
> + (errmsg("%s was terminated by exception 0x%X",
> + description, WTERMSIG(rc)),
> + errhint("See C include file \"ntstatus.h\" for a description of the hexadecimal value."),
> + errdetail("The failed archive command was: %s",
> + archcmd)));
> +#elif defined(HAVE_DECL_SYS_SIGLIST) && HAVE_DECL_SYS_SIGLIST
> + ereport(lev,
> + (errmsg("%s was terminated by signal %d: %s",
> + description, WTERMSIG(rc),
> + WTERMSIG(rc) < NSIG ? sys_siglist[WTERMSIG(rc)] : "(unknown)"),
> + errdetail("The failed archive command was: %s",
> + archcmd)));
> +#else
> + ereport(lev,
> + (errmsg("%s was terminated by signal %d",
> + description, WTERMSIG(rc)),
> + errdetail("The failed archive command was: %s",
> + archcmd)));
> +#endif
> + }
> + else
> + {
> + ereport(lev,
> + (errmsg("%s exited with unrecognized status %d",
> + description, rc),
> + errdetail("The failed archive command was: %s",
> + archcmd)));
> + }
> +
> + return false;
> + }
> +
> + return true;
> +}
> diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
> index 5c910dd..16bd77f 100644
> --- a/src/backend/utils/misc/guc.c
> +++ b/src/backend/utils/misc/guc.c
> @@ -189,6 +189,7 @@ static bool check_timezone_abbreviations(char **newval, void **extra, GucSource
> static void assign_timezone_abbreviations(const char *newval, void *extra);
> static void pg_timezone_abbrev_initialize(void);
> static const char *show_archive_command(void);
> +static const char *show_archive_keepalive_command(void);
> static void assign_tcp_keepalives_idle(int newval, void *extra);
> static void assign_tcp_keepalives_interval(int newval, void *extra);
> static void assign_tcp_keepalives_count(int newval, void *extra);
> @@ -2531,6 +2532,16 @@ static struct config_string ConfigureNamesString[] =
> },
>
> {
> + {"archive_keepalive_command", PGC_SIGHUP, WAL_ARCHIVING,
> + gettext_noop("Sets the shell command that will be called to send a keepalive file."),
> + NULL
> + },
> + &XLogArchiveKeepaliveCommand,
> + "",
> + NULL, NULL, show_archive_keepalive_command
> + },
> +
> + {
> {"client_encoding", PGC_USERSET, CLIENT_CONN_LOCALE,
> gettext_noop("Sets the client's character set encoding."),
> NULL,
> @@ -8490,6 +8501,15 @@ show_archive_command(void)
> return "(disabled)";
> }
>
> +static const char *
> +show_archive_keepalive_command(void)
> +{
> + if (XLogArchivingActive())
> + return XLogArchiveKeepaliveCommand;
> + else
> + return "(disabled)";
> +}
> +
> static void
> assign_tcp_keepalives_idle(int newval, void *extra)
> {
> diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
> index 315db46..085d5bb 100644
> --- a/src/backend/utils/misc/postgresql.conf.sample
> +++ b/src/backend/utils/misc/postgresql.conf.sample
> @@ -189,6 +189,10 @@
> # placeholders: %p = path of file to archive
> # %f = file name only
> # e.g. 'test ! -f /mnt/server/archivedir/%f && cp %p /mnt/server/archivedir/%f'
> +#archive_keepalive_command = '' # command to use to archive keepalive message files
> + # placeholders: %p = path of keepalive file
> + # %f = keepalive file name only
> + # e.g. 'cp %p /mnt/server/archivedir/%f'
> #archive_timeout = 0 # force a logfile segment switch after this
> # number of seconds; 0 disables
>
> diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
> index 1ddf4bf..63174c5 100644
> --- a/src/include/access/xlog.h
> +++ b/src/include/access/xlog.h
> @@ -191,6 +191,8 @@ extern int XLOGbuffers;
> extern int XLogArchiveTimeout;
> extern bool XLogArchiveMode;
> extern char *XLogArchiveCommand;
> +extern char *XLogArchiveKeepaliveCommand;
> +extern int XLogArchiveKeepaliveTimeout;
> extern bool EnableHotStandby;
> extern bool log_checkpoints;
>
> @@ -205,6 +207,7 @@ extern int wal_level;
>
> #define XLogArchivingActive() (XLogArchiveMode && wal_level >= WAL_LEVEL_ARCHIVE)
> #define XLogArchiveCommandSet() (XLogArchiveCommand[0] != '\0')
> +#define XLogArchiveKeepaliveCommandSet() (XLogArchiveKeepaliveCommand[0] != '\0')
>
> /*
> * Is WAL-logging necessary for archival or log-shipping, or can we skip
> diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h
> index db6380f..51e6558 100644
> --- a/src/include/access/xlog_internal.h
> +++ b/src/include/access/xlog_internal.h
> @@ -233,6 +233,9 @@ typedef XLogLongPageHeaderData *XLogLongPageHeader;
> #define StatusFilePath(path, xlog, suffix) \
> snprintf(path, MAXPGPATH, XLOGDIR "/archive_status/%s%s", xlog, suffix)
>
> +#define KeepaliveFilePath(path, kfname, timestr) \
> + snprintf(path, MAXPGPATH, XLOGDIR "/archive_status/%s.%s.keepalive", kfname, timestr)
> +
> #define BackupHistoryFileName(fname, tli, log, seg, offset) \
> snprintf(fname, MAXFNAMELEN, "%08X%08X%08X.%08X.backup", tli, log, seg, offset)
>
> @@ -258,6 +261,11 @@ typedef struct RmgrData
> extern const RmgrData RmgrTable[];
>
> /*
> + * Exported to support writing keepalives from archiver
> + */
> +extern void XLogWriteKeepaliveFile(void);
> +
> +/*
> * Exported to support xlog switching from checkpointer
> */
> extern pg_time_t GetLastSegSwitchTime(void);
>
> --
> Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
> To make changes to your subscription:
> http://www.postgresql.org/mailpref/pgsql-hackers
-- Bruce Momjian <bruce@momjian.us> http://momjian.us EnterpriseDB
http://enterprisedb.com
+ It's impossible for everything to be true. +