diff --git a/doc/src/sgml/ref/pg_rewind.sgml b/doc/src/sgml/ref/pg_rewind.sgml new file mode 100644 index ef0cc28..d5fdaea *** a/doc/src/sgml/ref/pg_rewind.sgml --- b/doc/src/sgml/ref/pg_rewind.sgml *************** PostgreSQL documentation *** 61,73 **** pg_rewind examines the timeline histories of the source and target clusters to determine the point where they diverged, and expects to find WAL in the target cluster's pg_xlog directory ! reaching all the way back to the point of divergence. In the typical ! failover scenario where the target cluster was shut down soon after the ! divergence, that is not a problem, but if the target cluster had run for a ! long time after the divergence, the old WAL files might not be present ! anymore. In that case, they can be manually copied from the WAL archive to ! the pg_xlog directory. Fetching missing files from a WAL ! archive automatically is currently not supported. --- 61,77 ---- pg_rewind examines the timeline histories of the source and target clusters to determine the point where they diverged, and expects to find WAL in the target cluster's pg_xlog directory ! reaching all the way back to the point of divergence. The point of divergence ! can be found on the target timeline, the source timeline, or their common ! ancestor. In the typical failover scenario where the target cluster was ! shut down soon after the divergence, that is not a problem, but if the ! target cluster had run for a long time after the divergence, the old WAL ! files might not be present anymore. In that case, they can be manually ! copied from the WAL archive to the pg_xlog directory. Fetching ! missing files from a WAL archive automatically is currently not supported. ! Besides, pg_rewind use cases are not limited to failover. ! For instance, a standby server could be promoted, run some writes, and ! then be returned as a standby. 
diff --git a/src/bin/pg_rewind/Makefile b/src/bin/pg_rewind/Makefile new file mode 100644 index 92b5d20..48dc770 *** a/src/bin/pg_rewind/Makefile --- b/src/bin/pg_rewind/Makefile *************** *** 8,14 **** # #------------------------------------------------------------------------- ! PGFILEDESC = "pg_rewind - repurpose an old master server as standby" PGAPPICON = win32 subdir = src/bin/pg_rewind --- 8,14 ---- # #------------------------------------------------------------------------- ! PGFILEDESC = "pg_rewind - synchronize a data directory with another one forked from" PGAPPICON = win32 subdir = src/bin/pg_rewind diff --git a/src/bin/pg_rewind/parsexlog.c b/src/bin/pg_rewind/parsexlog.c new file mode 100644 index 2081cf8..d69eafb *** a/src/bin/pg_rewind/parsexlog.c --- b/src/bin/pg_rewind/parsexlog.c *************** static char xlogfpath[MAXPGPATH]; *** 45,51 **** typedef struct XLogPageReadPrivate { const char *datadir; ! TimeLineID tli; } XLogPageReadPrivate; static int SimpleXLogPageRead(XLogReaderState *xlogreader, --- 45,51 ---- typedef struct XLogPageReadPrivate { const char *datadir; ! int tliIndex; } XLogPageReadPrivate; static int SimpleXLogPageRead(XLogReaderState *xlogreader, *************** static int SimpleXLogPageRead(XLogReader *** 55,65 **** /* * Read WAL from the datadir/pg_xlog, starting from 'startpoint' on timeline ! * 'tli', until 'endpoint'. Make note of the data blocks touched by the WAL ! * records, and return them in a page map. */ void ! extractPageMap(const char *datadir, XLogRecPtr startpoint, TimeLineID tli, XLogRecPtr endpoint) { XLogRecord *record; --- 55,65 ---- /* * Read WAL from the datadir/pg_xlog, starting from 'startpoint' on timeline ! * index 'tliIndex' in target timeline history, until 'endpoint'. Make note of ! * the data blocks touched by the WAL records, and return them in a page map. */ void ! 
extractPageMap(const char *datadir, XLogRecPtr startpoint, int tliIndex, XLogRecPtr endpoint) { XLogRecord *record; *************** extractPageMap(const char *datadir, XLog *** 68,74 **** XLogPageReadPrivate private; private.datadir = datadir; ! private.tli = tli; xlogreader = XLogReaderAllocate(&SimpleXLogPageRead, &private); if (xlogreader == NULL) pg_fatal("out of memory\n"); --- 68,74 ---- XLogPageReadPrivate private; private.datadir = datadir; ! private.tliIndex = tliIndex; xlogreader = XLogReaderAllocate(&SimpleXLogPageRead, &private); if (xlogreader == NULL) pg_fatal("out of memory\n"); *************** extractPageMap(const char *datadir, XLog *** 112,118 **** * doing anything with the record itself. */ XLogRecPtr ! readOneRecord(const char *datadir, XLogRecPtr ptr, TimeLineID tli) { XLogRecord *record; XLogReaderState *xlogreader; --- 112,118 ---- * doing anything with the record itself. */ XLogRecPtr ! readOneRecord(const char *datadir, XLogRecPtr ptr, int tliIndex) { XLogRecord *record; XLogReaderState *xlogreader; *************** readOneRecord(const char *datadir, XLogR *** 121,127 **** XLogRecPtr endptr; private.datadir = datadir; ! private.tli = tli; xlogreader = XLogReaderAllocate(&SimpleXLogPageRead, &private); if (xlogreader == NULL) pg_fatal("out of memory\n"); --- 121,127 ---- XLogRecPtr endptr; private.datadir = datadir; ! private.tliIndex = tliIndex; xlogreader = XLogReaderAllocate(&SimpleXLogPageRead, &private); if (xlogreader == NULL) pg_fatal("out of memory\n"); *************** readOneRecord(const char *datadir, XLogR *** 152,158 **** * Find the previous checkpoint preceding given WAL position. */ void ! findLastCheckpoint(const char *datadir, XLogRecPtr forkptr, TimeLineID tli, XLogRecPtr *lastchkptrec, TimeLineID *lastchkpttli, XLogRecPtr *lastchkptredo) { --- 152,158 ---- * Find the previous checkpoint preceding given WAL position. */ void ! 
findLastCheckpoint(const char *datadir, XLogRecPtr forkptr, int tliIndex, XLogRecPtr *lastchkptrec, TimeLineID *lastchkpttli, XLogRecPtr *lastchkptredo) { *************** findLastCheckpoint(const char *datadir, *** 173,179 **** forkptr += (forkptr % XLogSegSize == 0) ? SizeOfXLogLongPHD : SizeOfXLogShortPHD; private.datadir = datadir; ! private.tli = tli; xlogreader = XLogReaderAllocate(&SimpleXLogPageRead, &private); if (xlogreader == NULL) pg_fatal("out of memory\n"); --- 173,179 ---- forkptr += (forkptr % XLogSegSize == 0) ? SizeOfXLogLongPHD : SizeOfXLogShortPHD; private.datadir = datadir; ! private.tliIndex = tliIndex; xlogreader = XLogReaderAllocate(&SimpleXLogPageRead, &private); if (xlogreader == NULL) pg_fatal("out of memory\n"); *************** SimpleXLogPageRead(XLogReaderState *xlog *** 236,244 **** { XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data; uint32 targetPageOff; ! XLogSegNo targetSegNo PG_USED_FOR_ASSERTS_ONLY; XLByteToSeg(targetPagePtr, targetSegNo); targetPageOff = targetPagePtr % XLogSegSize; /* --- 236,246 ---- { XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data; uint32 targetPageOff; ! XLogRecPtr targetSegEnd; ! XLogSegNo targetSegNo; XLByteToSeg(targetPagePtr, targetSegNo); + XLogSegNoOffsetToRecPtr(targetSegNo + 1, 0, targetSegEnd); targetPageOff = targetPagePtr % XLogSegSize; /* *************** SimpleXLogPageRead(XLogReaderState *xlog *** 257,263 **** { char xlogfname[MAXFNAMELEN]; ! XLogFileName(xlogfname, private->tli, xlogreadsegno); snprintf(xlogfpath, MAXPGPATH, "%s/" XLOGDIR "/%s", private->datadir, xlogfname); --- 259,278 ---- { char xlogfname[MAXFNAMELEN]; ! /* ! * Since incomplete segments are copied into next timelines, switch to ! * the timeline holding the required segment. Assuming this scan can be ! * done both forward and backward, consider also switching timeline ! * accordingly. ! */ ! while (private->tliIndex < targetNentries - 1 && ! 
targetHistory[private->tliIndex].end < targetSegEnd) ! private->tliIndex++; ! while (private->tliIndex > 0 && ! targetHistory[private->tliIndex].begin >= targetSegEnd) ! private->tliIndex--; ! ! XLogFileName(xlogfname, targetHistory[private->tliIndex].tli, xlogreadsegno); snprintf(xlogfpath, MAXPGPATH, "%s/" XLOGDIR "/%s", private->datadir, xlogfname); *************** SimpleXLogPageRead(XLogReaderState *xlog *** 293,299 **** Assert(targetSegNo == xlogreadsegno); ! *pageTLI = private->tli; return XLOG_BLCKSZ; } --- 308,314 ---- Assert(targetSegNo == xlogreadsegno); ! *pageTLI = targetHistory[private->tliIndex].tli; return XLOG_BLCKSZ; } diff --git a/src/bin/pg_rewind/pg_rewind.c b/src/bin/pg_rewind/pg_rewind.c new file mode 100644 index a2d9ca3..1ab82f0 *** a/src/bin/pg_rewind/pg_rewind.c --- b/src/bin/pg_rewind/pg_rewind.c *************** *** 1,7 **** /*------------------------------------------------------------------------- * * pg_rewind.c ! * Synchronizes an old master server to a new timeline * * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group * --- 1,7 ---- /*------------------------------------------------------------------------- * * pg_rewind.c ! * Synchronizes a PostgreSQL data directory to a new timeline * * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group * *************** static void digestControlFile(ControlFil *** 37,43 **** size_t size); static void updateControlFile(ControlFileData *ControlFile); static void sanityChecks(void); ! static void findCommonAncestorTimeline(XLogRecPtr *recptr, TimeLineID *tli); static ControlFileData ControlFile_target; static ControlFileData ControlFile_source; --- 37,43 ---- size_t size); static void updateControlFile(ControlFileData *ControlFile); static void sanityChecks(void); ! 
static void findCommonAncestorTimeline(XLogRecPtr *recptr, int *tliIndex); static ControlFileData ControlFile_target; static ControlFileData ControlFile_source; *************** bool debug = false; *** 53,58 **** --- 53,62 ---- bool showprogress = false; bool dry_run = false; + /* Target history */ + TimeLineHistoryEntry *targetHistory; + int targetNentries; + static void usage(const char *progname) { *************** main(int argc, char **argv) *** 88,94 **** int option_index; int c; XLogRecPtr divergerec; ! TimeLineID lastcommontli; XLogRecPtr chkptrec; TimeLineID chkpttli; XLogRecPtr chkptredo; --- 92,98 ---- int option_index; int c; XLogRecPtr divergerec; ! int lastcommontliIndex; XLogRecPtr chkptrec; TimeLineID chkpttli; XLogRecPtr chkptredo; *************** main(int argc, char **argv) *** 214,222 **** if (ControlFile_target.checkPointCopy.ThisTimeLineID == ControlFile_source.checkPointCopy.ThisTimeLineID) pg_fatal("source and target cluster are on the same timeline\n"); ! findCommonAncestorTimeline(&divergerec, &lastcommontli); printf(_("servers diverged at WAL position %X/%X on timeline %u\n"), ! (uint32) (divergerec >> 32), (uint32) divergerec, lastcommontli); /* * Check for the possibility that the target is in fact a direct ancestor --- 218,227 ---- if (ControlFile_target.checkPointCopy.ThisTimeLineID == ControlFile_source.checkPointCopy.ThisTimeLineID) pg_fatal("source and target cluster are on the same timeline\n"); ! findCommonAncestorTimeline(&divergerec, &lastcommontliIndex); printf(_("servers diverged at WAL position %X/%X on timeline %u\n"), ! (uint32) (divergerec >> 32), (uint32) divergerec, ! targetHistory[lastcommontliIndex].tli); /* * Check for the possibility that the target is in fact a direct ancestor *************** main(int argc, char **argv) *** 234,240 **** /* Read the checkpoint record on the target to see where it ends. */ chkptendrec = readOneRecord(datadir_target, ControlFile_target.checkPoint, ! 
ControlFile_target.checkPointCopy.ThisTimeLineID); /* * If the histories diverged exactly at the end of the shutdown --- 239,245 ---- /* Read the checkpoint record on the target to see where it ends. */ chkptendrec = readOneRecord(datadir_target, ControlFile_target.checkPoint, ! targetNentries - 1); /* * If the histories diverged exactly at the end of the shutdown *************** main(int argc, char **argv) *** 254,260 **** exit(0); } ! findLastCheckpoint(datadir_target, divergerec, lastcommontli, &chkptrec, &chkpttli, &chkptredo); printf(_("rewinding from last common checkpoint at %X/%X on timeline %u\n"), (uint32) (chkptrec >> 32), (uint32) chkptrec, --- 259,266 ---- exit(0); } ! findLastCheckpoint(datadir_target, divergerec, ! lastcommontliIndex, &chkptrec, &chkpttli, &chkptredo); printf(_("rewinding from last common checkpoint at %X/%X on timeline %u\n"), (uint32) (chkptrec >> 32), (uint32) chkptrec, *************** main(int argc, char **argv) *** 277,283 **** * we would need to replay until the end of WAL here. */ pg_log(PG_PROGRESS, "reading WAL in target\n"); ! extractPageMap(datadir_target, chkptrec, lastcommontli, ControlFile_target.checkPoint); filemap_finalize(); --- 283,289 ---- * we would need to replay until the end of WAL here. */ pg_log(PG_PROGRESS, "reading WAL in target\n"); ! extractPageMap(datadir_target, chkptrec, lastcommontliIndex, ControlFile_target.checkPoint); filemap_finalize(); *************** sanityChecks(void) *** 374,383 **** /* * Target cluster better not be running. This doesn't guard against * someone starting the cluster concurrently. Also, this is probably more ! * strict than necessary; it's OK if the master was not shut down cleanly, ! * as long as it isn't running at the moment. */ ! if (ControlFile_target.state != DB_SHUTDOWNED) pg_fatal("target server must be shut down cleanly\n"); /* --- 380,390 ---- /* * Target cluster better not be running. This doesn't guard against * someone starting the cluster concurrently. 
Also, this is probably more ! * strict than necessary; it's OK if the target node was not shut down ! * cleanly, as long as it isn't running at the moment. */ ! if (ControlFile_target.state != DB_SHUTDOWNED && ! ControlFile_target.state != DB_SHUTDOWNED_IN_RECOVERY) pg_fatal("target server must be shut down cleanly\n"); /* *************** sanityChecks(void) *** 385,459 **** * server is shut down. There isn't any very strong reason for this * limitation, but better safe than sorry. */ ! if (datadir_source && ControlFile_source.state != DB_SHUTDOWNED) pg_fatal("source data directory must be shut down cleanly\n"); } /* ! * Determine the TLI of the last common timeline in the histories of the two ! * clusters. *tli is set to the last common timeline, and *recptr is set to ! * the position where the histories diverged (ie. the first WAL record that's ! * not the same in both clusters). ! * ! * Control files of both clusters must be read into ControlFile_target/source ! * before calling this. */ ! static void ! findCommonAncestorTimeline(XLogRecPtr *recptr, TimeLineID *tli) { ! TimeLineID targettli; ! TimeLineHistoryEntry *sourceHistory; ! int nentries; ! int i; ! TimeLineID sourcetli; ! targettli = ControlFile_target.checkPointCopy.ThisTimeLineID; ! sourcetli = ControlFile_source.checkPointCopy.ThisTimeLineID; ! /* Timeline 1 does not have a history file, so no need to check */ ! if (sourcetli == 1) { ! sourceHistory = (TimeLineHistoryEntry *) pg_malloc(sizeof(TimeLineHistoryEntry)); ! sourceHistory->tli = sourcetli; ! sourceHistory->begin = sourceHistory->end = InvalidXLogRecPtr; ! nentries = 1; } else { char path[MAXPGPATH]; char *histfile; ! TLHistoryFilePath(path, sourcetli); ! histfile = fetchFile(path, NULL); ! sourceHistory = rewind_parseTimeLineHistory(histfile, ! ControlFile_source.checkPointCopy.ThisTimeLineID, ! &nentries); pg_free(histfile); } ! /* ! * Trace the history backwards, until we hit the target timeline. ! * ! 
* TODO: This assumes that there are no timeline switches on the target ! * cluster after the fork. ! */ ! for (i = nentries - 1; i >= 0; i--) { ! TimeLineHistoryEntry *entry = &sourceHistory[i]; ! if (entry->tli == targettli) { ! /* found it */ ! *recptr = entry->end; ! *tli = entry->tli; ! pg_free(sourceHistory); ! return; } } ! pg_fatal("could not find common ancestor of the source and target cluster's timelines\n"); } --- 392,540 ---- * server is shut down. There isn't any very strong reason for this * limitation, but better safe than sorry. */ ! if (datadir_source && ! ControlFile_source.state != DB_SHUTDOWNED && ! ControlFile_source.state != DB_SHUTDOWNED_IN_RECOVERY) pg_fatal("source data directory must be shut down cleanly\n"); } /* ! * Find minimum from two XLOG positions assuming InvalidXLogRecPtr means ! * infinity as src/include/access/timeline.h states. This routine should ! * be used only when comparing XLOG positions related to history files. */ ! static XLogRecPtr ! MinXLogRecPtr(XLogRecPtr a, XLogRecPtr b) { ! if (XLogRecPtrIsInvalid(a)) ! return b; ! else if (XLogRecPtrIsInvalid(b)) ! return a; ! else ! return Min(a, b); ! } ! /* ! * Retrieve timeline history for given control file which should behold ! * either source or target. ! */ ! static TimeLineHistoryEntry * ! getTimelineHistory(ControlFileData *controlFile, int *nentries) ! { ! TimeLineHistoryEntry *history; ! TimeLineID tli; ! tli = controlFile->checkPointCopy.ThisTimeLineID; ! ! /* ! * Timeline 1 does not have a history file, so there is no need to check and ! * fake an entry with infinite start and end positions. ! */ ! if (tli == 1) { ! history = (TimeLineHistoryEntry *) pg_malloc(sizeof(TimeLineHistoryEntry)); ! history->tli = tli; ! history->begin = history->end = InvalidXLogRecPtr; ! *nentries = 1; } else { char path[MAXPGPATH]; char *histfile; ! TLHistoryFilePath(path, tli); ! /* Get history file from appropriate source */ ! if (controlFile == &ControlFile_source) ! 
histfile = fetchFile(path, NULL); ! else if (controlFile == &ControlFile_target) ! histfile = slurpFile(datadir_target, path, NULL); ! else ! pg_fatal("Invalid control file\n"); ! ! history = rewind_parseTimeLineHistory(histfile, tli, nentries); pg_free(histfile); } ! if (debug) { ! int i; ! if (controlFile == &ControlFile_source) ! printf("Source timeline history:\n"); ! else if (controlFile == &ControlFile_target) ! printf("Target timeline history:\n"); ! else ! Assert(false); ! ! /* ! * Print the history just parsed; its length is *nentries, not targetNentries. ! */ ! for (i = 0; i < *nentries; i++) { ! TimeLineHistoryEntry *entry; ! entry = &history[i]; ! printf("%u: %X/%X - %X/%X\n", entry->tli, ! (uint32) (entry->begin >> 32), (uint32) (entry->begin), ! (uint32) (entry->end >> 32), (uint32) (entry->end)); } } ! return history; ! } ! ! /* ! * Determine the TLI of the last common timeline in the timeline history of the ! * two clusters. targetHistory is filled with target timeline history and ! * targetNentries is number of items in targetHistory. *tliIndex is set to the ! * index of last common timeline in targetHistory array, and *recptr is set to ! * the position where the timeline history diverged (ie. the first WAL record ! * that's not the same in both clusters). ! * ! * Control files of both clusters must be read into ControlFile_target/source ! * before calling this routine. ! */ ! static void ! findCommonAncestorTimeline(XLogRecPtr *recptr, int *tliIndex) ! { ! TimeLineHistoryEntry *sourceHistory; ! int sourceNentries; ! int i, n; ! ! /* Retrieve timelines for both source and target */ ! sourceHistory = getTimelineHistory(&ControlFile_source, &sourceNentries); ! targetHistory = getTimelineHistory(&ControlFile_target, &targetNentries); ! ! /* ! * Trace the history forward, until we hit the point of divergence. It may ! * still be possible that the source and target nodes used the same ! * timeline number in their history but with different start position ! 
* depending on the history files that each node has fetched in previous ! * recovery processes. Hence check the start position of the new timeline ! * as well and move down by one extra timeline entry if they do not match. ! */ ! n = Min(sourceNentries, targetNentries); ! for (i = 0; i < n; i++) ! { ! if (sourceHistory[i].tli != targetHistory[i].tli || ! sourceHistory[i].begin != targetHistory[i].begin) ! break; ! } ! ! if (i > 0) ! { ! i--; ! *recptr = MinXLogRecPtr(sourceHistory[i].end, targetHistory[i].end); ! *tliIndex = i; ! ! pg_free(sourceHistory); ! return; ! } ! else ! { ! pg_fatal("could not find common ancestor of the source and target cluster's timelines\n"); ! } } diff --git a/src/bin/pg_rewind/pg_rewind.h b/src/bin/pg_rewind/pg_rewind.h new file mode 100644 index e281369..4826dde *** a/src/bin/pg_rewind/pg_rewind.h --- b/src/bin/pg_rewind/pg_rewind.h *************** extern bool debug; *** 27,41 **** extern bool showprogress; extern bool dry_run; /* in parsexlog.c */ extern void extractPageMap(const char *datadir, XLogRecPtr startpoint, ! TimeLineID tli, XLogRecPtr endpoint); extern void findLastCheckpoint(const char *datadir, XLogRecPtr searchptr, ! TimeLineID tli, XLogRecPtr *lastchkptrec, TimeLineID *lastchkpttli, XLogRecPtr *lastchkptredo); extern XLogRecPtr readOneRecord(const char *datadir, XLogRecPtr ptr, ! TimeLineID tli); /* in timeline.c */ extern TimeLineHistoryEntry *rewind_parseTimeLineHistory(char *buffer, --- 27,45 ---- extern bool showprogress; extern bool dry_run; + /* Target history */ + extern TimeLineHistoryEntry *targetHistory; + extern int targetNentries; + /* in parsexlog.c */ extern void extractPageMap(const char *datadir, XLogRecPtr startpoint, ! int tliIndex, XLogRecPtr endpoint); extern void findLastCheckpoint(const char *datadir, XLogRecPtr searchptr, ! int tliIndex, XLogRecPtr *lastchkptrec, TimeLineID *lastchkpttli, XLogRecPtr *lastchkptredo); extern XLogRecPtr readOneRecord(const char *datadir, XLogRecPtr ptr, ! 
int tliIndex); /* in timeline.c */ extern TimeLineHistoryEntry *rewind_parseTimeLineHistory(char *buffer,