*** doc/src/sgml/config.sgml
--- doc/src/sgml/config.sgml
***************
*** 5335,5340 **** plruby.use_strict = true # generates error: unknown class name
--- 5335,5366 ----
+
+ trace_recovery_messages (string)
+
+ trace_recovery_messages configuration parameter
+
+
+
+ Controls which message levels are written to the server log
+ for system modules needed for recovery processing. This allows
+ the user to override the normal setting of log_min_messages,
+ but only for specific messages. This is intended for use in
+ debugging Hot Standby.
+ Valid values are DEBUG5, DEBUG4,
+ DEBUG3, DEBUG2, DEBUG1,
+ INFO, NOTICE, WARNING,
+ ERROR, LOG, FATAL, and
+ PANIC. Each level includes all the levels that
+ follow it. The later the level, the fewer messages are sent
+ to the log. The default is WARNING. Note that
+ LOG has a different rank here than in
+ client_min_messages.
+ This parameter can only be set in the postgresql.conf file.
+
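+
+ For example, to raise the logging detail for recovery modules only
+ (a sketch; pick whichever DEBUG level gives the detail wanted):
+
+ trace_recovery_messages = debug2   # in postgresql.conf
+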
+
+
+
zero_damaged_pages (boolean)
*** doc/src/sgml/func.sgml
--- doc/src/sgml/func.sgml
***************
*** 12893,12898 **** postgres=# select * from pg_xlogfile_name_offset(pg_stop_backup());
--- 12893,13089 ----
.
+
+ pg_is_in_recovery
+
+
+ pg_last_completed_xact_timestamp
+
+
+ pg_last_completed_xid
+
+
+ pg_recovery_pause
+
+
+ pg_recovery_continue
+
+
+ pg_recovery_pause_cleanup
+
+
+ pg_recovery_pause_xid
+
+
+ pg_recovery_pause_time
+
+
+ pg_recovery_stop
+
+
+
+ The functions shown in assist in archive recovery.
+ Except for the first three functions, these are restricted to superusers.
+ All of these functions can only be executed during recovery.
+
+
+
+ Recovery Control Functions
+
+
+ Name Return Type Description
+
+
+
+
+
+
+ pg_is_in_recovery()
+
+ bool
+ True if recovery is still in progress.
+
+
+
+ pg_last_completed_xact_timestamp()
+
+ timestamp with time zone
+ Returns the original completion timestamp, with time zone, of the
+ last completed transaction in the current recovery.
+
+
+
+
+ pg_last_completed_xid()
+
+ integer
+ Returns the transaction id (32-bit) of the last completed transaction
+ in the current recovery. Later-numbered transaction ids may already have
+ completed. This is unrelated to transactions on the source server.
+
+
+
+
+
+ pg_recovery_pause()
+
+ void
+ Pause recovery processing, unconditionally.
+
+
+
+ pg_recovery_continue()
+
+ void
+ If recovery is paused, continue processing.
+
+
+
+ pg_recovery_stop()
+
+ void
+ End recovery and begin normal processing.
+
+
+
+ pg_recovery_pause_xid()
+
+ void
+ Continue recovery until the specified xid completes, if it is ever
+ seen, then pause recovery.
+
+
+
+
+ pg_recovery_pause_time()
+
+ void
+ Continue recovery until a transaction with the specified timestamp
+ completes, if one is ever seen, then pause recovery.
+
+
+
+
+ pg_recovery_pause_cleanup()
+
+ void
+ Continue recovery until the next cleanup record, then pause.
+
+
+
+ pg_recovery_pause_advance()
+
+ void
+ Advance recovery by the specified number of records, then pause.
+
+
+
+
+
+
+ pg_recovery_pause and pg_recovery_continue allow
+ a superuser to control the progress of recovery on the database server.
+ While recovery is paused, queries can be executed to determine how far
+ forwards recovery should progress. Recovery can never go backwards,
+ because previous values are overwritten. If the superuser wishes recovery
+ to complete and normal processing mode to start, execute
+ pg_recovery_stop.
+
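+
+ For example, a minimal pause-and-inspect session using only the
+ functions documented above:
+
+ select pg_recovery_pause();
+ select pg_is_in_recovery(), pg_last_completed_xact_timestamp();
+ select pg_recovery_continue();
+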
+
+
+ Variations of the pause function exist, mainly to allow PITR to dynamically
+ control where it should progress to. pg_recovery_pause_xid and
+ pg_recovery_pause_time allow the specification of a trial
+ recovery target, similarly to .
+ Recovery will then progress to the specified point and then pause, rather
+ than stopping permanently, allowing assessment of whether this is the
+ desired stopping point for recovery.
+
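+
+ For example (a sketch; the argument types, an xid and a timestamp with
+ time zone, are assumptions based on the descriptions above):
+
+ select pg_recovery_pause_xid(1234567);
+ select pg_recovery_pause_time('2008-10-01 12:00:00+00');
+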
+
+
+ pg_recovery_pause_cleanup allows recovery to progress only
+ as far as the next cleanup record. This is useful when a long-running
+ query needs to access the database in a consistent state and it is
+ more important that the query executes than it is that we keep processing
+ new WAL records. This can be used as shown:
+
+ select pg_recovery_pause_cleanup();
+
+ -- run very important query
+ select ...
+ from big_table1 join big_table2
+ on ...
+ where ...
+
+ select pg_recovery_continue();
+
+
+
+
+ pg_recovery_pause_advance allows recovery to progress record by
+ record, for very careful analysis or debugging. The step size can be 1 or
+ more records. If recovery is not yet paused then pg_recovery_pause_advance
+ will process the specified number of records and then pause. If recovery
+ is already paused, recovery will continue for another N records before
+ pausing again.
+
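+
+ For example (a sketch, assuming the step size is passed as an integer
+ argument):
+
+ select pg_recovery_pause_advance(1);    -- replay exactly one record
+ select pg_recovery_pause_advance(100);  -- then advance 100 more
+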
+
+
+ If you pause recovery while the server is waiting for a WAL file when
+ operating in standby mode, the pause will apparently have no effect until
+ the file arrives. Once the server begins processing WAL records again it
+ will notice the pause request and will act upon it. This is not a bug.
+
+
+
+ Pausing recovery will also prevent restartpoints from starting, since they
+ are triggered by events in the WAL stream. In all other ways processing
+ will continue; for example, the background writer will continue to clean
+ shared_buffers while recovery is paused.
+
+
The functions shown in calculate
the actual disk space usage of database objects.
*** src/backend/access/heap/heapam.c
--- src/backend/access/heap/heapam.c
***************
*** 3814,3832 **** heap_restrpos(HeapScanDesc scan)
}
/*
* Perform XLogInsert for a heap-clean operation. Caller must already
* have modified the buffer and marked it dirty.
*
* Note: prior to Postgres 8.3, the entries in the nowunused[] array were
* zero-based tuple indexes. Now they are one-based like other uses
* of OffsetNumber.
*/
XLogRecPtr
log_heap_clean(Relation reln, Buffer buffer,
OffsetNumber *redirected, int nredirected,
OffsetNumber *nowdead, int ndead,
OffsetNumber *nowunused, int nunused,
! bool redirect_move)
{
xl_heap_clean xlrec;
uint8 info;
--- 3814,3891 ----
}
/*
+ * Update the latestRemovedXid for the current VACUUM. This gets called
+ * only rarely, since we probably already removed rows earlier.
+ * See comments for vacuum_log_cleanup_info().
+ */
+ void
+ HeapTupleHeaderAdvanceLatestRemovedXid(HeapTupleHeader tuple,
+ TransactionId *latestRemovedXid)
+ {
+ TransactionId xmin = HeapTupleHeaderGetXmin(tuple);
+ TransactionId xmax = HeapTupleHeaderGetXmax(tuple);
+ TransactionId xvac = HeapTupleHeaderGetXvac(tuple);
+
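+ /*
+ * Tuples moved by VACUUM FULL carry an xvac that can be newer than
+ * both xmin and xmax, so it must be considered as well.
+ */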
+ if (tuple->t_infomask & HEAP_MOVED_OFF ||
+ tuple->t_infomask & HEAP_MOVED_IN)
+ {
+ if (TransactionIdPrecedes(*latestRemovedXid, xvac))
+ *latestRemovedXid = xvac;
+ }
+
+ if (TransactionIdPrecedes(*latestRemovedXid, xmax))
+ *latestRemovedXid = xmax;
+
+ if (TransactionIdPrecedes(*latestRemovedXid, xmin))
+ *latestRemovedXid = xmin;
+
+ Assert(TransactionIdIsValid(*latestRemovedXid));
+ }
+
+ /*
+ * Perform XLogInsert to register a heap cleanup info message. These
+ * messages are sent once per VACUUM and are required because
+ * of the phasing of removal operations during a lazy VACUUM.
+ * See comments for vacuum_log_cleanup_info().
+ */
+ XLogRecPtr
+ log_heap_cleanup_info(RelFileNode rnode, TransactionId latestRemovedXid)
+ {
+ xl_heap_cleanup_info xlrec;
+ XLogRecPtr recptr;
+ XLogRecData rdata;
+
+ xlrec.node = rnode;
+ xlrec.latestRemovedXid = latestRemovedXid;
+
+ rdata.data = (char *) &xlrec;
+ rdata.len = SizeOfHeapCleanupInfo;
+ rdata.buffer = InvalidBuffer;
+ rdata.next = NULL;
+
+ recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_CLEANUP_INFO, &rdata);
+
+ return recptr;
+ }
+
+ /*
* Perform XLogInsert for a heap-clean operation. Caller must already
* have modified the buffer and marked it dirty.
*
* Note: prior to Postgres 8.3, the entries in the nowunused[] array were
* zero-based tuple indexes. Now they are one-based like other uses
* of OffsetNumber.
+ *
+ * For 8.4 we also include the latestRemovedXid, which allows recovery
+ * processing to abort standby queries whose results could otherwise
+ * be changed by applying these changes.
*/
XLogRecPtr
log_heap_clean(Relation reln, Buffer buffer,
OffsetNumber *redirected, int nredirected,
OffsetNumber *nowdead, int ndead,
OffsetNumber *nowunused, int nunused,
! TransactionId latestRemovedXid, bool redirect_move)
{
xl_heap_clean xlrec;
uint8 info;
***************
*** 3838,3843 **** log_heap_clean(Relation reln, Buffer buffer,
--- 3897,3903 ----
xlrec.node = reln->rd_node;
xlrec.block = BufferGetBlockNumber(buffer);
+ xlrec.latestRemovedXid = latestRemovedXid;
xlrec.nredirected = nredirected;
xlrec.ndead = ndead;
***************
*** 4129,4135 **** heap_xlog_clean(XLogRecPtr lsn, XLogRecord *record, bool clean_move)
if (record->xl_info & XLR_BKP_BLOCK_1)
return;
! buffer = XLogReadBuffer(xlrec->node, xlrec->block, false);
if (!BufferIsValid(buffer))
return;
page = (Page) BufferGetPage(buffer);
--- 4189,4195 ----
if (record->xl_info & XLR_BKP_BLOCK_1)
return;
! buffer = XLogReadBufferForCleanup(xlrec->node, xlrec->block, false);
if (!BufferIsValid(buffer))
return;
page = (Page) BufferGetPage(buffer);
***************
*** 4189,4195 **** heap_xlog_freeze(XLogRecPtr lsn, XLogRecord *record)
if (record->xl_info & XLR_BKP_BLOCK_1)
return;
! buffer = XLogReadBuffer(xlrec->node, xlrec->block, false);
if (!BufferIsValid(buffer))
return;
page = (Page) BufferGetPage(buffer);
--- 4249,4255 ----
if (record->xl_info & XLR_BKP_BLOCK_1)
return;
! buffer = XLogReadBufferForCleanup(xlrec->node, xlrec->block, false);
if (!BufferIsValid(buffer))
return;
page = (Page) BufferGetPage(buffer);
***************
*** 4824,4829 **** heap2_redo(XLogRecPtr lsn, XLogRecord *record)
--- 4884,4897 ----
case XLOG_HEAP2_CLEAN_MOVE:
heap_xlog_clean(lsn, record, true);
break;
+ case XLOG_HEAP2_CLEANUP_INFO:
+ /*
+ * Actual operation is a no-op. Record type exists to
+ * provide information to recovery record pre-processing,
+ * so the actions for this record have already been taken.
+ * See XactResolveRecoveryConflicts().
+ */
+ break;
default:
elog(PANIC, "heap2_redo: unknown op code %u", info);
}
***************
*** 4953,4969 **** heap2_desc(StringInfo buf, uint8 xl_info, char *rec)
{
xl_heap_clean *xlrec = (xl_heap_clean *) rec;
! appendStringInfo(buf, "clean: rel %u/%u/%u; blk %u",
xlrec->node.spcNode, xlrec->node.dbNode,
! xlrec->node.relNode, xlrec->block);
}
else if (info == XLOG_HEAP2_CLEAN_MOVE)
{
xl_heap_clean *xlrec = (xl_heap_clean *) rec;
! appendStringInfo(buf, "clean_move: rel %u/%u/%u; blk %u",
xlrec->node.spcNode, xlrec->node.dbNode,
! xlrec->node.relNode, xlrec->block);
}
else
appendStringInfo(buf, "UNKNOWN");
--- 5021,5046 ----
{
xl_heap_clean *xlrec = (xl_heap_clean *) rec;
! appendStringInfo(buf, "clean: rel %u/%u/%u; blk %u remxid %u",
xlrec->node.spcNode, xlrec->node.dbNode,
! xlrec->node.relNode, xlrec->block,
! xlrec->latestRemovedXid);
}
else if (info == XLOG_HEAP2_CLEAN_MOVE)
{
xl_heap_clean *xlrec = (xl_heap_clean *) rec;
! appendStringInfo(buf, "clean_move: rel %u/%u/%u; blk %u remxid %u",
xlrec->node.spcNode, xlrec->node.dbNode,
! xlrec->node.relNode, xlrec->block,
! xlrec->latestRemovedXid);
! }
! else if (info == XLOG_HEAP2_CLEANUP_INFO)
! {
! xl_heap_cleanup_info *xlrec = (xl_heap_cleanup_info *) rec;
!
! appendStringInfo(buf, "cleanup info: remxid %u",
! xlrec->latestRemovedXid);
}
else
appendStringInfo(buf, "UNKNOWN");
*** src/backend/access/heap/pruneheap.c
--- src/backend/access/heap/pruneheap.c
***************
*** 30,35 ****
--- 30,36 ----
typedef struct
{
TransactionId new_prune_xid; /* new prune hint value for page */
+ TransactionId latestRemovedXid; /* latest xid to be removed by this prune */
int nredirected; /* numbers of entries in arrays below */
int ndead;
int nunused;
***************
*** 85,90 **** heap_page_prune_opt(Relation relation, Buffer buffer, TransactionId OldestXmin)
--- 86,99 ----
return;
/*
+ * We can't write WAL in recovery mode, so there's no point trying to
+ * clean the page. The master will likely issue a cleaning WAL record
+ * soon anyway, so this is no particular loss.
+ */
+ if (IsRecoveryProcessingMode())
+ return;
+
+ /*
* We prune when a previous UPDATE failed to find enough space on the page
* for a new tuple version, or when free space falls below the relation's
* fill-factor target (but not less than 10%).
***************
*** 176,181 **** heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin,
--- 185,191 ----
* Also initialize the rest of our working state.
*/
prstate.new_prune_xid = InvalidTransactionId;
+ prstate.latestRemovedXid = InvalidTransactionId;
prstate.nredirected = prstate.ndead = prstate.nunused = 0;
memset(prstate.marked, 0, sizeof(prstate.marked));
***************
*** 258,264 **** heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin,
prstate.redirected, prstate.nredirected,
prstate.nowdead, prstate.ndead,
prstate.nowunused, prstate.nunused,
! redirect_move);
PageSetLSN(BufferGetPage(buffer), recptr);
PageSetTLI(BufferGetPage(buffer), ThisTimeLineID);
--- 268,274 ----
prstate.redirected, prstate.nredirected,
prstate.nowdead, prstate.ndead,
prstate.nowunused, prstate.nunused,
! prstate.latestRemovedXid, redirect_move);
PageSetLSN(BufferGetPage(buffer), recptr);
PageSetTLI(BufferGetPage(buffer), ThisTimeLineID);
***************
*** 396,401 **** heap_prune_chain(Relation relation, Buffer buffer, OffsetNumber rootoffnum,
--- 406,413 ----
== HEAPTUPLE_DEAD && !HeapTupleHeaderIsHotUpdated(htup))
{
heap_prune_record_unused(prstate, rootoffnum);
+ HeapTupleHeaderAdvanceLatestRemovedXid(htup,
+ &prstate->latestRemovedXid);
ndeleted++;
}
***************
*** 521,527 **** heap_prune_chain(Relation relation, Buffer buffer, OffsetNumber rootoffnum,
--- 533,543 ----
* find another DEAD tuple is a fairly unusual corner case.)
*/
if (tupdead)
+ {
latestdead = offnum;
+ HeapTupleHeaderAdvanceLatestRemovedXid(htup,
+ &prstate->latestRemovedXid);
+ }
else if (!recent_dead)
break;
*** src/backend/access/index/genam.c
--- src/backend/access/index/genam.c
***************
*** 89,96 **** RelationGetIndexScan(Relation indexRelation,
else
scan->keyData = NULL;
scan->kill_prior_tuple = false;
! scan->ignore_killed_tuples = true; /* default setting */
scan->opaque = NULL;
--- 89,104 ----
else
scan->keyData = NULL;
+ /*
+ * During recovery we ignore killed tuples and don't bother to kill them
+ * either. We do this because the xmin on the primary node could easily
+ * be later than the xmin on the standby node, so that what the primary
+ * thinks is killed is supposed to be visible on standby. So for correct
+ * MVCC for queries during recovery we must ignore these hints and check
+ * all tuples.
+ */
scan->kill_prior_tuple = false;
! scan->ignore_killed_tuples = !IsRecoveryProcessingMode();
scan->opaque = NULL;
*** src/backend/access/index/indexam.c
--- src/backend/access/index/indexam.c
***************
*** 455,463 **** index_getnext(IndexScanDesc scan, ScanDirection direction)
/*
* If we scanned a whole HOT chain and found only dead tuples,
! * tell index AM to kill its entry for that TID.
*/
! scan->kill_prior_tuple = scan->xs_hot_dead;
/*
* The AM's gettuple proc finds the next index entry matching the
--- 455,466 ----
/*
* If we scanned a whole HOT chain and found only dead tuples,
! * tell index AM to kill its entry for that TID. We do not do
! * this when in recovery because it may violate MVCC to do so.
! * See comments in RelationGetIndexScan().
*/
! if (!IsRecoveryProcessingMode())
! scan->kill_prior_tuple = scan->xs_hot_dead;
/*
* The AM's gettuple proc finds the next index entry matching the
*** src/backend/access/nbtree/nbtinsert.c
--- src/backend/access/nbtree/nbtinsert.c
***************
*** 1924,1930 **** _bt_vacuum_one_page(Relation rel, Buffer buffer)
}
if (ndeletable > 0)
! _bt_delitems(rel, buffer, deletable, ndeletable);
/*
* Note: if we didn't find any LP_DEAD items, then the page's
--- 1924,1930 ----
}
if (ndeletable > 0)
! _bt_delitems(rel, buffer, deletable, ndeletable, false, 0);
/*
* Note: if we didn't find any LP_DEAD items, then the page's
*** src/backend/access/nbtree/nbtpage.c
--- src/backend/access/nbtree/nbtpage.c
***************
*** 652,658 **** _bt_page_recyclable(Page page)
*/
void
_bt_delitems(Relation rel, Buffer buf,
! OffsetNumber *itemnos, int nitems)
{
Page page = BufferGetPage(buf);
BTPageOpaque opaque;
--- 652,659 ----
*/
void
_bt_delitems(Relation rel, Buffer buf,
! OffsetNumber *itemnos, int nitems, bool isVacuum,
! BlockNumber lastBlockVacuumed)
{
Page page = BufferGetPage(buf);
BTPageOpaque opaque;
***************
*** 684,698 **** _bt_delitems(Relation rel, Buffer buf,
/* XLOG stuff */
if (!rel->rd_istemp)
{
- xl_btree_delete xlrec;
XLogRecPtr recptr;
XLogRecData rdata[2];
! xlrec.node = rel->rd_node;
! xlrec.block = BufferGetBlockNumber(buf);
- rdata[0].data = (char *) &xlrec;
- rdata[0].len = SizeOfBtreeDelete;
rdata[0].buffer = InvalidBuffer;
rdata[0].next = &(rdata[1]);
--- 685,721 ----
/* XLOG stuff */
if (!rel->rd_istemp)
{
XLogRecPtr recptr;
XLogRecData rdata[2];
! /* We don't need both, but it simplifies the code to have both here */
! xl_btree_delete xlrec_delete;
! xl_btree_vacuum xlrec_vacuum;
!
! if (isVacuum)
! {
! xlrec_vacuum.node = rel->rd_node;
! xlrec_vacuum.block = BufferGetBlockNumber(buf);
!
! xlrec_vacuum.lastBlockVacuumed = lastBlockVacuumed;
! rdata[0].data = (char *) &xlrec_vacuum;
! rdata[0].len = SizeOfBtreeVacuum;
! }
! else
! {
! xlrec_delete.node = rel->rd_node;
! xlrec_delete.block = BufferGetBlockNumber(buf);
!
! /*
! * We would like to set an accurate latestRemovedXid, but there
! * is no easy way of obtaining a useful value. So we use the
! * probably far too conservative value of RecentGlobalXmin instead.
! */
! xlrec_delete.latestRemovedXid = RecentGlobalXmin;
! rdata[0].data = (char *) &xlrec_delete;
! rdata[0].len = SizeOfBtreeDelete;
! }
rdata[0].buffer = InvalidBuffer;
rdata[0].next = &(rdata[1]);
***************
*** 715,721 **** _bt_delitems(Relation rel, Buffer buf,
rdata[1].buffer_std = true;
rdata[1].next = NULL;
! recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE, rdata);
PageSetLSN(page, recptr);
PageSetTLI(page, ThisTimeLineID);
--- 738,747 ----
rdata[1].buffer_std = true;
rdata[1].next = NULL;
! if (isVacuum)
! recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM, rdata);
! else
! recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE, rdata);
PageSetLSN(page, recptr);
PageSetTLI(page, ThisTimeLineID);
*** src/backend/access/nbtree/nbtree.c
--- src/backend/access/nbtree/nbtree.c
***************
*** 58,64 **** typedef struct
IndexBulkDeleteCallback callback;
void *callback_state;
BTCycleId cycleid;
! BlockNumber lastUsedPage;
BlockNumber totFreePages; /* true total # of free pages */
MemoryContext pagedelcontext;
} BTVacState;
--- 58,65 ----
IndexBulkDeleteCallback callback;
void *callback_state;
BTCycleId cycleid;
! BlockNumber lastBlockVacuumed; /* last blkno reached by Vacuum scan */
! BlockNumber lastUsedPage; /* blkno of last page that is in use */
BlockNumber totFreePages; /* true total # of free pages */
MemoryContext pagedelcontext;
} BTVacState;
***************
*** 626,631 **** btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
--- 627,633 ----
vstate.callback = callback;
vstate.callback_state = callback_state;
vstate.cycleid = cycleid;
+ vstate.lastBlockVacuumed = BTREE_METAPAGE; /* Initialise at first block */
vstate.lastUsedPage = BTREE_METAPAGE;
vstate.totFreePages = 0;
***************
*** 855,861 **** restart:
*/
if (ndeletable > 0)
{
! _bt_delitems(rel, buf, deletable, ndeletable);
stats->tuples_removed += ndeletable;
/* must recompute maxoff */
maxoff = PageGetMaxOffsetNumber(page);
--- 857,875 ----
*/
if (ndeletable > 0)
{
! BlockNumber lastBlockVacuumed = BufferGetBlockNumber(buf);
!
! _bt_delitems(rel, buf, deletable, ndeletable, true, vstate->lastBlockVacuumed);
!
! /*
! * Keep track of the block number of the lastBlockVacuumed, so
! * we can scan those blocks as well during WAL replay. This then
! * provides concurrency protection and allows btrees to be used
! * while in recovery.
! */
! if (lastBlockVacuumed > vstate->lastBlockVacuumed)
! vstate->lastBlockVacuumed = lastBlockVacuumed;
!
stats->tuples_removed += ndeletable;
/* must recompute maxoff */
maxoff = PageGetMaxOffsetNumber(page);
*** src/backend/access/nbtree/nbtxlog.c
--- src/backend/access/nbtree/nbtxlog.c
***************
*** 459,464 **** btree_xlog_split(bool onleft, bool isroot,
--- 459,534 ----
}
static void
+ btree_xlog_vacuum(XLogRecPtr lsn, XLogRecord *record)
+ {
+ xl_btree_vacuum *xlrec;
+ Buffer buffer;
+ Page page;
+ BTPageOpaque opaque;
+
+ if (record->xl_info & XLR_BKP_BLOCK_1)
+ return;
+
+ xlrec = (xl_btree_vacuum *) XLogRecGetData(record);
+
+ /*
+ * We need to ensure every block is pinned between the
+ * lastBlockVacuumed and the current block, if there are any.
+ * This ensures that every block in the index is touched during
+ * VACUUM as required to ensure scans work correctly.
+ */
+ if ((xlrec->lastBlockVacuumed + 1) != xlrec->block)
+ {
+ BlockNumber blkno = xlrec->lastBlockVacuumed + 1;
+
+ for (; blkno < xlrec->block; blkno++)
+ {
+ buffer = XLogReadBufferForCleanup(xlrec->node, blkno, false);
+ if (BufferIsValid(buffer))
+ UnlockReleaseBuffer(buffer);
+ }
+ }
+
+ /*
+ * We need to take a cleanup lock to apply these changes.
+ * See nbtree/README for details.
+ */
+ buffer = XLogReadBufferForCleanup(xlrec->node, xlrec->block, false);
+ if (!BufferIsValid(buffer))
+ return;
+ page = (Page) BufferGetPage(buffer);
+
+ if (XLByteLE(lsn, PageGetLSN(page)))
+ {
+ UnlockReleaseBuffer(buffer);
+ return;
+ }
+
+ if (record->xl_len > SizeOfBtreeVacuum)
+ {
+ OffsetNumber *unused;
+ OffsetNumber *unend;
+
+ unused = (OffsetNumber *) ((char *) xlrec + SizeOfBtreeVacuum);
+ unend = (OffsetNumber *) ((char *) xlrec + record->xl_len);
+
+ PageIndexMultiDelete(page, unused, unend - unused);
+ }
+
+ /*
+ * Mark the page as not containing any LP_DEAD items --- see comments in
+ * _bt_delitems().
+ */
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
+
+ PageSetLSN(page, lsn);
+ PageSetTLI(page, ThisTimeLineID);
+ MarkBufferDirty(buffer);
+ UnlockReleaseBuffer(buffer);
+ }
+
+ static void
btree_xlog_delete(XLogRecPtr lsn, XLogRecord *record)
{
xl_btree_delete *xlrec;
***************
*** 470,475 **** btree_xlog_delete(XLogRecPtr lsn, XLogRecord *record)
--- 540,550 ----
return;
xlrec = (xl_btree_delete *) XLogRecGetData(record);
+
+ /*
+ * We don't need to take a cleanup lock to apply these changes.
+ * See nbtree/README for details.
+ */
buffer = XLogReadBuffer(xlrec->node, xlrec->block, false);
if (!BufferIsValid(buffer))
return;
***************
*** 737,742 **** btree_redo(XLogRecPtr lsn, XLogRecord *record)
--- 812,819 ----
case XLOG_BTREE_SPLIT_R_ROOT:
btree_xlog_split(false, true, lsn, record);
break;
+ case XLOG_BTREE_VACUUM:
+ btree_xlog_vacuum(lsn, record);
+ break;
case XLOG_BTREE_DELETE:
btree_xlog_delete(lsn, record);
break;
***************
*** 753,758 **** btree_redo(XLogRecPtr lsn, XLogRecord *record)
--- 830,899 ----
}
}
+ bool
+ btree_is_cleanup_record(uint8 info)
+ {
+ switch (info)
+ {
+ case XLOG_BTREE_VACUUM:
+ case XLOG_BTREE_DELETE:
+ return true;
+ break;
+
+ case XLOG_BTREE_INSERT_LEAF:
+ case XLOG_BTREE_INSERT_UPPER:
+ case XLOG_BTREE_INSERT_META:
+ case XLOG_BTREE_SPLIT_L:
+ case XLOG_BTREE_SPLIT_R:
+ case XLOG_BTREE_SPLIT_L_ROOT:
+ case XLOG_BTREE_SPLIT_R_ROOT:
+ case XLOG_BTREE_DELETE_PAGE:
+ case XLOG_BTREE_DELETE_PAGE_META:
+ case XLOG_BTREE_DELETE_PAGE_HALF:
+ case XLOG_BTREE_NEWROOT:
+ return false;
+ break;
+
+ default:
+ elog(PANIC, "btree_is_cleanup_record: unknown op code %u", info);
+ }
+
+ /* never reached */
+ return false;
+ }
+
+ bool
+ btree_needs_cleanup_lock(uint8 info)
+ {
+ switch (info)
+ {
+ case XLOG_BTREE_VACUUM:
+ return true;
+ break;
+
+ case XLOG_BTREE_INSERT_LEAF:
+ case XLOG_BTREE_INSERT_UPPER:
+ case XLOG_BTREE_INSERT_META:
+ case XLOG_BTREE_SPLIT_L:
+ case XLOG_BTREE_SPLIT_R:
+ case XLOG_BTREE_SPLIT_L_ROOT:
+ case XLOG_BTREE_SPLIT_R_ROOT:
+ case XLOG_BTREE_DELETE:
+ case XLOG_BTREE_DELETE_PAGE:
+ case XLOG_BTREE_DELETE_PAGE_META:
+ case XLOG_BTREE_DELETE_PAGE_HALF:
+ case XLOG_BTREE_NEWROOT:
+ return false;
+ break;
+
+ default:
+ elog(PANIC, "btree_needs_cleanup_lock: unknown op code %u", info);
+ }
+
+ /* never reached */
+ return false;
+ }
+
static void
out_target(StringInfo buf, xl_btreetid *target)
{
***************
*** 841,853 **** btree_desc(StringInfo buf, uint8 xl_info, char *rec)
xlrec->level, xlrec->firstright);
break;
}
case XLOG_BTREE_DELETE:
{
xl_btree_delete *xlrec = (xl_btree_delete *) rec;
! appendStringInfo(buf, "delete: rel %u/%u/%u; blk %u",
xlrec->node.spcNode, xlrec->node.dbNode,
! xlrec->node.relNode, xlrec->block);
break;
}
case XLOG_BTREE_DELETE_PAGE:
--- 982,1005 ----
xlrec->level, xlrec->firstright);
break;
}
+ case XLOG_BTREE_VACUUM:
+ {
+ xl_btree_vacuum *xlrec = (xl_btree_vacuum *) rec;
+
+ appendStringInfo(buf, "vacuum: rel %u/%u/%u; blk %u, lastBlockVacuumed %u",
+ xlrec->node.spcNode, xlrec->node.dbNode,
+ xlrec->node.relNode, xlrec->block,
+ xlrec->lastBlockVacuumed);
+ break;
+ }
case XLOG_BTREE_DELETE:
{
xl_btree_delete *xlrec = (xl_btree_delete *) rec;
! appendStringInfo(buf, "delete: rel %u/%u/%u; blk %u, latestRemovedXid %u",
xlrec->node.spcNode, xlrec->node.dbNode,
! xlrec->node.relNode, xlrec->block,
! xlrec->latestRemovedXid);
break;
}
case XLOG_BTREE_DELETE_PAGE:
*** src/backend/access/transam/clog.c
--- src/backend/access/transam/clog.c
***************
*** 475,480 **** ZeroCLOGPage(int pageno, bool writeXlog)
--- 475,483 ----
/*
* This must be called ONCE during postmaster or standalone-backend startup,
* after StartupXLOG has initialized ShmemVariableCache->nextXid.
+ *
+ * We access just a single clog page, so this action is atomic and safe
+ * for use if other processes are active during recovery.
*/
void
StartupCLOG(void)
*** src/backend/access/transam/multixact.c
--- src/backend/access/transam/multixact.c
***************
*** 1413,1420 **** ZeroMultiXactMemberPage(int pageno, bool writeXlog)
* MultiXactSetNextMXact and/or MultiXactAdvanceNextMXact. Note that we
* may already have replayed WAL data into the SLRU files.
*
! * We don't need any locks here, really; the SLRU locks are taken
! * only because slru.c expects to be called with locks held.
*/
void
StartupMultiXact(void)
--- 1413,1423 ----
* MultiXactSetNextMXact and/or MultiXactAdvanceNextMXact. Note that we
* may already have replayed WAL data into the SLRU files.
*
! * We want this operation to be atomic to ensure that other processes can
! * use MultiXact while we complete recovery. We access one page only from the
! * offset and members buffers, so once locks are acquired they will not be
! * dropped and re-acquired by SLRU code. So we take both locks at start, then
! * hold them all the way to the end.
*/
void
StartupMultiXact(void)
***************
*** 1426,1431 **** StartupMultiXact(void)
--- 1429,1435 ----
/* Clean up offsets state */
LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE);
+ LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE);
/*
* Initialize our idea of the latest page number.
***************
*** 1452,1461 **** StartupMultiXact(void)
MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
}
- LWLockRelease(MultiXactOffsetControlLock);
-
/* And the same for members */
- LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE);
/*
* Initialize our idea of the latest page number.
--- 1456,1462 ----
***************
*** 1483,1488 **** StartupMultiXact(void)
--- 1484,1490 ----
}
LWLockRelease(MultiXactMemberControlLock);
+ LWLockRelease(MultiXactOffsetControlLock);
/*
* Initialize lastTruncationPoint to invalid, ensuring that the first
***************
*** 1542,1549 **** CheckPointMultiXact(void)
* isn't valid (because StartupMultiXact hasn't been called yet) and so
* SimpleLruTruncate would get confused. It seems best not to risk
* removing any data during recovery anyway, so don't truncate.
*/
! if (!InRecovery)
TruncateMultiXact();
TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(true);
--- 1544,1552 ----
* isn't valid (because StartupMultiXact hasn't been called yet) and so
* SimpleLruTruncate would get confused. It seems best not to risk
* removing any data during recovery anyway, so don't truncate.
+ * We are executing in the bgwriter, so we must access shared status.
*/
! if (!IsRecoveryProcessingMode())
TruncateMultiXact();
TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(true);
*** src/backend/access/transam/rmgr.c
--- src/backend/access/transam/rmgr.c
***************
*** 20,25 ****
--- 20,26 ----
#include "commands/dbcommands.h"
#include "commands/sequence.h"
#include "commands/tablespace.h"
+ #include "storage/sinval.h"
#include "storage/freespace.h"
***************
*** 32,38 **** const RmgrData RmgrTable[RM_MAX_ID + 1] = {
{"Tablespace", tblspc_redo, tblspc_desc, NULL, NULL, NULL},
{"MultiXact", multixact_redo, multixact_desc, NULL, NULL, NULL},
{"Reserved 7", NULL, NULL, NULL, NULL, NULL},
! {"Reserved 8", NULL, NULL, NULL, NULL, NULL},
{"Heap2", heap2_redo, heap2_desc, NULL, NULL, NULL},
{"Heap", heap_redo, heap_desc, NULL, NULL, NULL},
{"Btree", btree_redo, btree_desc, btree_xlog_startup, btree_xlog_cleanup, btree_safe_restartpoint},
--- 33,39 ----
{"Tablespace", tblspc_redo, tblspc_desc, NULL, NULL, NULL},
{"MultiXact", multixact_redo, multixact_desc, NULL, NULL, NULL},
{"Reserved 7", NULL, NULL, NULL, NULL, NULL},
! {"Relation", relation_redo, relation_desc, NULL, NULL, NULL},
{"Heap2", heap2_redo, heap2_desc, NULL, NULL, NULL},
{"Heap", heap_redo, heap_desc, NULL, NULL, NULL},
{"Btree", btree_redo, btree_desc, btree_xlog_startup, btree_xlog_cleanup, btree_safe_restartpoint},
*** src/backend/access/transam/slru.c
--- src/backend/access/transam/slru.c
***************
*** 598,604 **** SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno)
* commands to set the commit status of transactions whose bits are in
* already-truncated segments of the commit log (see notes in
* SlruPhysicalWritePage). Hence, if we are InRecovery, allow the case
! * where the file doesn't exist, and return zeroes instead.
*/
fd = BasicOpenFile(path, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
if (fd < 0)
--- 598,605 ----
* commands to set the commit status of transactions whose bits are in
* already-truncated segments of the commit log (see notes in
* SlruPhysicalWritePage). Hence, if we are InRecovery, allow the case
! * where the file doesn't exist, and return zeroes instead. We also
! return a zeroed page when a seek or read fails, for example when
! the file exists but is shorter than expected.
*/
fd = BasicOpenFile(path, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
if (fd < 0)
***************
*** 619,624 **** SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno)
--- 620,633 ----
if (lseek(fd, (off_t) offset, SEEK_SET) < 0)
{
+ if (InRecovery)
+ {
+ ereport(LOG,
+ (errmsg("file \"%s\" doesn't exist, reading as zeroes",
+ path)));
+ MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
+ return true;
+ }
slru_errcause = SLRU_SEEK_FAILED;
slru_errno = errno;
close(fd);
***************
*** 628,633 **** SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno)
--- 637,650 ----
errno = 0;
if (read(fd, shared->page_buffer[slotno], BLCKSZ) != BLCKSZ)
{
+ if (InRecovery)
+ {
+ ereport(LOG,
+ (errmsg("file \"%s\" doesn't exist, reading as zeroes",
+ path)));
+ MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
+ return true;
+ }
slru_errcause = SLRU_READ_FAILED;
slru_errno = errno;
close(fd);
*** src/backend/access/transam/subtrans.c
--- src/backend/access/transam/subtrans.c
***************
*** 223,255 **** ZeroSUBTRANSPage(int pageno)
/*
* This must be called ONCE during postmaster or standalone-backend startup,
* after StartupXLOG has initialized ShmemVariableCache->nextXid.
- *
- * oldestActiveXID is the oldest XID of any prepared transaction, or nextXid
- * if there are none.
*/
void
StartupSUBTRANS(TransactionId oldestActiveXID)
{
! int startPage;
! int endPage;
- /*
- * Since we don't expect pg_subtrans to be valid across crashes, we
- * initialize the currently-active page(s) to zeroes during startup.
- * Whenever we advance into a new page, ExtendSUBTRANS will likewise zero
- * the new page without regard to whatever was previously on disk.
- */
LWLockAcquire(SubtransControlLock, LW_EXCLUSIVE);
! startPage = TransactionIdToPage(oldestActiveXID);
! endPage = TransactionIdToPage(ShmemVariableCache->nextXid);
!
! while (startPage != endPage)
! {
! (void) ZeroSUBTRANSPage(startPage);
! startPage++;
! }
! (void) ZeroSUBTRANSPage(startPage);
LWLockRelease(SubtransControlLock);
}
--- 223,241 ----
/*
* This must be called ONCE during postmaster or standalone-backend startup,
* after StartupXLOG has initialized ShmemVariableCache->nextXid.
*/
void
StartupSUBTRANS(TransactionId oldestActiveXID)
{
! TransactionId xid = ShmemVariableCache->nextXid;
! int pageno = TransactionIdToPage(xid);
LWLockAcquire(SubtransControlLock, LW_EXCLUSIVE);
! /*
! * Initialize our idea of the latest page number. Note that we no
! * longer zero the currently-active pages here; pages are zeroed
! * when they are first used.
! */
! SubTransCtl->shared->latest_page_number = pageno;
LWLockRelease(SubtransControlLock);
}
*** src/backend/access/transam/twophase.c
--- src/backend/access/transam/twophase.c
***************
*** 1719,1724 **** RecordTransactionCommitPrepared(TransactionId xid,
--- 1719,1725 ----
/* Emit the XLOG commit record */
xlrec.xid = xid;
xlrec.crec.xact_time = GetCurrentTimestamp();
+ xlrec.crec.xinfo = 0;
xlrec.crec.nrels = nrels;
xlrec.crec.nsubxacts = nchildren;
rdata[0].data = (char *) (&xlrec);
***************
*** 1797,1802 **** RecordTransactionAbortPrepared(TransactionId xid,
--- 1798,1804 ----
/* Emit the XLOG abort record */
xlrec.xid = xid;
xlrec.arec.xact_time = GetCurrentTimestamp();
+ xlrec.arec.xinfo = 0;
xlrec.arec.nrels = nrels;
xlrec.arec.nsubxacts = nchildren;
rdata[0].data = (char *) (&xlrec);
*** src/backend/access/transam/xact.c
--- src/backend/access/transam/xact.c
***************
*** 40,45 ****
--- 40,46 ----
#include "storage/fd.h"
#include "storage/lmgr.h"
#include "storage/procarray.h"
+ #include "storage/sinval.h"
#include "storage/sinvaladt.h"
#include "storage/smgr.h"
#include "utils/combocid.h"
***************
*** 141,146 **** typedef struct TransactionStateData
--- 142,149 ----
Oid prevUser; /* previous CurrentUserId setting */
bool prevSecDefCxt; /* previous SecurityDefinerContext setting */
bool prevXactReadOnly; /* entry-time xact r/o state */
+ bool xidMarkedInWAL; /* is this xid present in WAL yet? */
+ bool hasUnMarkedSubXids; /* had unmarked subxids */
struct TransactionStateData *parent; /* back link to parent */
} TransactionStateData;
***************
*** 169,174 **** static TransactionStateData TopTransactionStateData = {
--- 172,179 ----
InvalidOid, /* previous CurrentUserId setting */
false, /* previous SecurityDefinerContext setting */
false, /* entry-time xact r/o state */
+ false, /* initial state for xidMarkedInWAL */
+ false, /* hasUnMarkedSubXids */
NULL /* link to parent state block */
};
***************
*** 212,217 **** static bool forceSyncCommit = false;
--- 217,232 ----
static MemoryContext TransactionAbortContext = NULL;
/*
+ * Bookkeeping for tracking emulated transactions in Recovery Procs.
+ */
+ static TransactionId latestObservedXid = InvalidTransactionId;
+
+ /*
+ * Local state to optimise XactResolveRecoveryConflicts()
+ */
+ static TransactionId localLatestRemovedXid = InvalidTransactionId;
+
+ /*
* List of add-on start- and end-of-xact callbacks
*/
typedef struct XactCallbackItem
***************
*** 237,243 **** static SubXactCallbackItem *SubXact_callbacks = NULL;
/* local function prototypes */
! static void AssignTransactionId(TransactionState s);
static void AbortTransaction(void);
static void AtAbort_Memory(void);
static void AtCleanup_Memory(void);
--- 252,258 ----
/* local function prototypes */
! static void AssignTransactionId(TransactionState s, int recursion_level);
static void AbortTransaction(void);
static void AtAbort_Memory(void);
static void AtCleanup_Memory(void);
***************
*** 331,337 **** TransactionId
GetTopTransactionId(void)
{
if (!TransactionIdIsValid(TopTransactionStateData.transactionId))
! AssignTransactionId(&TopTransactionStateData);
return TopTransactionStateData.transactionId;
}
--- 346,352 ----
GetTopTransactionId(void)
{
if (!TransactionIdIsValid(TopTransactionStateData.transactionId))
! AssignTransactionId(&TopTransactionStateData, 0);
return TopTransactionStateData.transactionId;
}
***************
*** 361,367 **** GetCurrentTransactionId(void)
TransactionState s = CurrentTransactionState;
if (!TransactionIdIsValid(s->transactionId))
! AssignTransactionId(s);
return s->transactionId;
}
--- 376,382 ----
TransactionState s = CurrentTransactionState;
if (!TransactionIdIsValid(s->transactionId))
! AssignTransactionId(s, 0);
return s->transactionId;
}
***************
*** 389,399 **** GetCurrentTransactionIdIfAny(void)
* following its parent's.
*/
static void
! AssignTransactionId(TransactionState s)
{
bool isSubXact = (s->parent != NULL);
ResourceOwner currentOwner;
/* Assert that caller didn't screw up */
Assert(!TransactionIdIsValid(s->transactionId));
Assert(s->state == TRANS_INPROGRESS);
--- 404,417 ----
* following its parent's.
*/
static void
! AssignTransactionId(TransactionState s, int recursion_level)
{
bool isSubXact = (s->parent != NULL);
ResourceOwner currentOwner;
+ if (IsRecoveryProcessingMode())
+ elog(FATAL, "cannot assign TransactionIds during recovery");
+
/* Assert that caller didn't screw up */
Assert(!TransactionIdIsValid(s->transactionId));
Assert(s->state == TRANS_INPROGRESS);
***************
*** 403,409 **** AssignTransactionId(TransactionState s)
* than its parent.
*/
if (isSubXact && !TransactionIdIsValid(s->parent->transactionId))
! AssignTransactionId(s->parent);
/*
* Generate a new Xid and record it in PG_PROC and pg_subtrans.
--- 421,427 ----
* than its parent.
*/
if (isSubXact && !TransactionIdIsValid(s->parent->transactionId))
! AssignTransactionId(s->parent, recursion_level + 1);
/*
* Generate a new Xid and record it in PG_PROC and pg_subtrans.
***************
*** 415,421 **** AssignTransactionId(TransactionState s)
*/
s->transactionId = GetNewTransactionId(isSubXact);
! if (isSubXact)
SubTransSetParent(s->transactionId, s->parent->transactionId);
/*
--- 433,446 ----
*/
s->transactionId = GetNewTransactionId(isSubXact);
! /*
! * If we have overflowed the subxid cache then we must mark subtrans
! * with the parent xid. Prior to 8.4 we marked subtrans for each
! * subtransaction, though that is no longer necessary because the
! * way snapshots are searched in XidInMVCCSnapshot() has changed to
! * allow searching of both subxid cache and subtrans, not either/or.
! */
! if (isSubXact && MyProc->subxids.overflowed)
SubTransSetParent(s->transactionId, s->parent->transactionId);
/*
***************
*** 437,444 **** AssignTransactionId(TransactionState s)
}
PG_END_TRY();
CurrentResourceOwner = currentOwner;
- }
/*
* GetCurrentSubTransactionId
--- 462,534 ----
}
PG_END_TRY();
CurrentResourceOwner = currentOwner;
+ elog(trace_recovery(DEBUG2),
+ "AssignXactId xid %d nest %d recursion %d xidMarkedInWAL %s hasParent %s",
+ s->transactionId,
+ GetCurrentTransactionNestLevel(),
+ recursion_level,
+ s->xidMarkedInWAL ? "t" : "f",
+ s->parent ? "t" : "f");
+
+ /*
+ * Recovery environment needs to know when a transaction first starts
+ * making changes to the database. We could issue an assignment WAL
+ * record for every transaction and subtransaction but that would be
+ * a large performance hit. So we go to some trouble to optimise this
+ * by marking the first WAL record with additional information, so we can
+ * piggyback on the normal flow of processing. There are still some cases
+ * where we need to write xid assignment WAL records, though these cases
+ * are rare in most applications.
+ *
+ * SO, if needed, WAL log this assignment. We can mark an xid and its
+ * immediate parent on a single WAL record, so if we recursively assign
+ * more than two xids at the same time we need to write some assignment
+ * log records.
+ */
+ if (recursion_level > 1 || (recursion_level == 1 && isSubXact))
+ {
+ XLogRecData rdata;
+ xl_xact_assignment xlrec;
+
+ xlrec.xassign = s->transactionId;
+ xlrec.isSubXact = (s->parent != NULL);
+
+ if (xlrec.isSubXact)
+ xlrec.xparent = s->parent->transactionId;
+ else
+ xlrec.xparent = InvalidTransactionId;
+
+ START_CRIT_SECTION();
+
+ rdata.data = (char *) (&xlrec);
+ rdata.len = sizeof(xl_xact_assignment);
+ rdata.buffer = InvalidBuffer;
+ rdata.next = NULL;
+
+ /*
+ * These WAL records look like no other. We are assigning a
+ * TransactionId to upper levels of the transaction stack. The
+ * transaction level we are looking at may *not* be the *current*
+ * transaction. We have not yet assigned the xid for the current
+ * transaction, so the xid of this WAL record will be
+ * InvalidTransactionId, even though we are in a transaction.
+ * Got that?
+ *
+ * So we stuff the newly assigned xid into the WAL record and
+ * let WAL replay sort it out later.
+ */
+ (void) XLogInsert(RM_XACT_ID, XLOG_XACT_ASSIGNMENT, &rdata);
+
+ END_CRIT_SECTION();
+
+ /*
+ * Mark this transaction level, so we can avoid issuing WAL records
+ * for later subtransactions also.
+ */
+ s->xidMarkedInWAL = true;
+ }
+ }
/*
* GetCurrentSubTransactionId
***************
*** 824,834 **** RecordTransactionCommit(void)
bool haveNonTemp;
int nchildren;
TransactionId *children;
/* Get data needed for commit record */
nrels = smgrGetPendingDeletes(true, &rels, &haveNonTemp);
nchildren = xactGetCommittedChildren(&children);
!
/*
* If we haven't been assigned an XID yet, we neither can, nor do we want
* to write a COMMIT record.
--- 914,928 ----
bool haveNonTemp;
int nchildren;
TransactionId *children;
+ int nmsgs;
+ SharedInvalidationMessage *invalidationMessages = NULL;
+ bool RelcacheInitFileInval;
/* Get data needed for commit record */
nrels = smgrGetPendingDeletes(true, &rels, &haveNonTemp);
nchildren = xactGetCommittedChildren(&children);
! nmsgs = xactGetCommittedInvalidationMessages(&invalidationMessages,
! &RelcacheInitFileInval);
/*
* If we haven't been assigned an XID yet, we neither can, nor do we want
* to write a COMMIT record.
***************
*** 862,868 **** RecordTransactionCommit(void)
/*
* Begin commit critical section and insert the commit XLOG record.
*/
! XLogRecData rdata[3];
int lastrdata = 0;
xl_xact_commit xlrec;
--- 956,962 ----
/*
* Begin commit critical section and insert the commit XLOG record.
*/
! XLogRecData rdata[4];
int lastrdata = 0;
xl_xact_commit xlrec;
***************
*** 870,875 **** RecordTransactionCommit(void)
--- 964,984 ----
BufmgrCommit();
/*
+ * Set flags required for recovery processing of commits.
+ * Nothing here is so critical that it needs to be inside the
+ * critical section that follows.
+ */
+ xlrec.xinfo = 0;
+ if (CurrentTransactionState->hasUnMarkedSubXids)
+ xlrec.xinfo |= XACT_COMPLETION_UNMARKED_SUBXIDS;
+ if (AtEOXact_Database_FlatFile_Update_Needed())
+ xlrec.xinfo |= XACT_COMPLETION_UPDATE_DB_FILE;
+ if (AtEOXact_Auth_FlatFile_Update_Needed())
+ xlrec.xinfo |= XACT_COMPLETION_UPDATE_AUTH_FILE;
+ if (RelcacheInitFileInval)
+ xlrec.xinfo |= XACT_COMPLETION_UPDATE_RELCACHE_FILE;
+
+ /*
* Mark ourselves as within our "commit critical section". This
* forces any concurrent checkpoint to wait until we've updated
* pg_clog. Without this, it is possible for the checkpoint to set
***************
*** 893,898 **** RecordTransactionCommit(void)
--- 1002,1009 ----
xlrec.xact_time = xactStopTimestamp;
xlrec.nrels = nrels;
xlrec.nsubxacts = nchildren;
+ xlrec.nmsgs = nmsgs;
+
rdata[0].data = (char *) (&xlrec);
rdata[0].len = MinSizeOfXactCommit;
rdata[0].buffer = InvalidBuffer;
***************
*** 914,919 **** RecordTransactionCommit(void)
--- 1025,1039 ----
rdata[2].buffer = InvalidBuffer;
lastrdata = 2;
}
+ /* dump shared cache invalidation messages */
+ if (nmsgs > 0)
+ {
+ rdata[lastrdata].next = &(rdata[3]);
+ rdata[3].data = (char *) invalidationMessages;
+ rdata[3].len = nmsgs * sizeof(SharedInvalidationMessage);
+ rdata[3].buffer = InvalidBuffer;
+ lastrdata = 3;
+ }
rdata[lastrdata].next = NULL;
(void) XLogInsert(RM_XACT_ID, XLOG_XACT_COMMIT, rdata);
***************
*** 1219,1224 **** RecordTransactionAbort(bool isSubXact)
--- 1339,1347 ----
}
xlrec.nrels = nrels;
xlrec.nsubxacts = nchildren;
+ xlrec.xinfo = 0;
+ if (CurrentTransactionState->hasUnMarkedSubXids)
+ xlrec.xinfo |= XACT_COMPLETION_UNMARKED_SUBXIDS;
rdata[0].data = (char *) (&xlrec);
rdata[0].len = MinSizeOfXactAbort;
rdata[0].buffer = InvalidBuffer;
***************
*** 1525,1530 **** StartTransaction(void)
--- 1648,1655 ----
s->childXids = NULL;
s->nChildXids = 0;
s->maxChildXids = 0;
+ s->xidMarkedInWAL = false;
+ s->hasUnMarkedSubXids = false;
GetUserIdAndContext(&s->prevUser, &s->prevSecDefCxt);
/* SecurityDefinerContext should never be set outside a transaction */
Assert(!s->prevSecDefCxt);
***************
*** 1637,1643 **** CommitTransaction(void)
* must be done _before_ releasing locks we hold and _after_
* RecordTransactionCommit.
*/
! ProcArrayEndTransaction(MyProc, latestXid);
/*
* This is all post-commit cleanup. Note that if an error is raised here,
--- 1762,1768 ----
* must be done _before_ releasing locks we hold and _after_
* RecordTransactionCommit.
*/
! ProcArrayEndTransaction(MyProc, latestXid, 0, NULL);
/*
* This is all post-commit cleanup. Note that if an error is raised here,
***************
*** 2055,2061 **** AbortTransaction(void)
* must be done _before_ releasing locks we hold and _after_
* RecordTransactionAbort.
*/
! ProcArrayEndTransaction(MyProc, latestXid);
/*
* Post-abort cleanup. See notes in CommitTransaction() concerning
--- 2180,2186 ----
* must be done _before_ releasing locks we hold and _after_
* RecordTransactionAbort.
*/
! ProcArrayEndTransaction(MyProc, latestXid, 0, NULL);
/*
* Post-abort cleanup. See notes in CommitTransaction() concerning
***************
*** 3753,3758 **** CommitSubTransaction(void)
--- 3878,3889 ----
/* Must CCI to ensure commands of subtransaction are seen as done */
CommandCounterIncrement();
+ /*
+ * Make sure we keep tracking subxids that haven't yet been marked in WAL.
+ */
+ if (!s->xidMarkedInWAL || s->hasUnMarkedSubXids)
+ s->parent->hasUnMarkedSubXids = true;
+
/*
* Prior to 8.4 we marked subcommit in clog at this point. We now only
* perform that step, if required, as part of the atomic update of the
***************
*** 3872,3877 **** AbortSubTransaction(void)
--- 4003,4014 ----
s->state = TRANS_ABORT;
/*
+ * Make sure we keep tracking subxids that haven't yet been marked in WAL.
+ */
+ if (!s->xidMarkedInWAL || s->hasUnMarkedSubXids)
+ s->parent->hasUnMarkedSubXids = true;
+
+ /*
* Reset user ID which might have been changed transiently. (See notes
* in AbortTransaction.)
*/
***************
*** 4214,4244 **** xactGetCommittedChildren(TransactionId **ptr)
}
/*
* XLOG support routines
*/
static void
! xact_redo_commit(xl_xact_commit *xlrec, TransactionId xid)
{
TransactionId *sub_xids;
TransactionId max_xid;
int i;
- /* Mark the transaction committed in pg_clog */
- sub_xids = (TransactionId *) &(xlrec->xnodes[xlrec->nrels]);
- TransactionIdCommitTree(xid, xlrec->nsubxacts, sub_xids);
-
/* Make sure nextXid is beyond any XID mentioned in the record */
max_xid = xid;
for (i = 0; i < xlrec->nsubxacts; i++)
{
if (TransactionIdPrecedes(max_xid, sub_xids[i]))
max_xid = sub_xids[i];
}
if (TransactionIdFollowsOrEquals(max_xid,
ShmemVariableCache->nextXid))
{
ShmemVariableCache->nextXid = max_xid;
TransactionIdAdvance(ShmemVariableCache->nextXid);
}
--- 4351,4852 ----
}
/*
+ * Record an enhanced snapshot of running transactions into WAL.
+ */
+ void
+ LogCurrentRunningXacts(void)
+ {
+ RunningTransactions CurrRunningXacts = GetRunningTransactionData();
+ xl_xact_running_xacts xlrec;
+ XLogRecData rdata[3];
+ int lastrdata = 0;
+ XLogRecPtr recptr;
+
+ xlrec.xcnt = CurrRunningXacts->xcnt;
+ xlrec.subxcnt = CurrRunningXacts->subxcnt;
+ xlrec.latestRunningXid = CurrRunningXacts->latestRunningXid;
+ xlrec.latestCompletedXid = CurrRunningXacts->latestCompletedXid;
+
+ /* Header */
+ rdata[0].data = (char *) (&xlrec);
+ rdata[0].len = MinSizeOfXactRunningXacts;
+ rdata[0].buffer = InvalidBuffer;
+
+ /* array of RunningXact */
+ if (xlrec.xcnt > 0)
+ {
+ rdata[0].next = &(rdata[1]);
+ rdata[1].data = (char *) CurrRunningXacts->xrun;
+ rdata[1].len = xlrec.xcnt * sizeof(RunningXact);
+ rdata[1].buffer = InvalidBuffer;
+ lastrdata = 1;
+ }
+
+ /* array of subtransaction TransactionIds */
+ if (xlrec.subxcnt > 0)
+ {
+ rdata[lastrdata].next = &(rdata[2]);
+ rdata[2].data = (char *) CurrRunningXacts->subxip;
+ rdata[2].len = xlrec.subxcnt * sizeof(TransactionId);
+ rdata[2].buffer = InvalidBuffer;
+ lastrdata = 2;
+ }
+
+ rdata[lastrdata].next = NULL;
+
+ START_CRIT_SECTION();
+
+ recptr = XLogInsert(RM_XACT_ID, XLOG_XACT_RUNNING_XACTS, rdata);
+
+ END_CRIT_SECTION();
+
+ elog(trace_recovery(DEBUG2), "captured snapshot of running xacts %X/%X", recptr.xlogid, recptr.xrecoff);
+ }
+
+ /*
+ * Is the data available to allow valid snapshots?
+ */
+ bool
+ IsRunningXactDataValid(void)
+ {
+ if (TransactionIdIsValid(latestObservedXid))
+ return true;
+
+ return false;
+ }
+
+ /*
+ * We need to issue shared invalidations and hold locks. Holding locks
+ * means others may want to wait on us, so we need our lock table
+ * inserts to appear to belong to a transaction. We could create and delete
+ * lock table entries for each transaction but it's simpler just to create
+ * one permanent entry and leave it there all the time. Locks are then
+ * acquired and released as needed. Yes, this means you can see the
+ * Startup process in pg_locks once we have run this.
+ */
+ void
+ InitRecoveryTransactionEnvironment(void)
+ {
+ VirtualTransactionId vxid;
+
+ /*
+ * Initialise shared invalidation management for Startup process,
+ * being careful to register ourselves as a sendOnly process so
+ * we don't need to read messages, nor will we get signalled
+ * when the queue starts filling up.
+ */
+ SharedInvalBackendInit(true);
+
+ /*
+ * Additional initialisation tasks. Most of this was performed
+ * during initial stages of startup.
+ */
+ ProcArrayInitRecoveryEnvironment();
+
+ /*
+ * Lock a virtual transaction id for Startup process.
+ *
+ * We need to do GetNextLocalTransactionId() because
+ * SharedInvalBackendInit() leaves localTransactionId invalid and
+ * the lock manager doesn't like that at all.
+ *
+ * Note that we don't need to run XactLockTableInsert() because nobody
+ * needs to wait on xids. That sounds a little strange, but table locks
+ * are held by vxids and row level locks are held by xids. All queries
+ * hold AccessShareLocks so never block while we write or lock new rows.
+ */
+ vxid.backendId = MyBackendId;
+ vxid.localTransactionId = GetNextLocalTransactionId();
+ VirtualXactLockTableInsert(vxid);
+
+ /*
+ * Now that the database is consistent we can create a valid copy of
+ * the flat files required for connection and authentication. This
+ * may already have been executed at appropriate commit points, but
+ * we cannot trust that those executions were correct, so force it
+ * again now just to be safe.
+ */
+ BuildFlatFiles(false);
+ }
+
+ /*
+ * Called during archive recovery when we already know the WAL record is
+ * a cleanup record that might remove data that should be visible to
+ * some currently active snapshot.
+ *
+ * * First pull the latestRemovedXid and databaseId out of WAL record.
+ * * Get all virtual xids whose xmin is earlier than latestRemovedXid
+ * and who are in the same database
+ * * Check/Wait until we either give up waiting or vxids end
+ * * Blow away any backend we gave up waiting for it to complete
+ */
+ void
+ XactResolveRecoveryConflicts(TransactionId latestRemovedXid, Oid recDatabaseOid)
+ {
+ VirtualTransactionId *old_snapshots;
+
+ /*
+ * Don't bother checking for conflicts for cleanup records earlier than
+ * we have already tested for.
+ */
+ if (TransactionIdIsValid(localLatestRemovedXid) &&
+ TransactionIdFollowsOrEquals(localLatestRemovedXid, latestRemovedXid))
+ return;
+
+ old_snapshots = GetCurrentVirtualXIDs(latestRemovedXid,
+ recDatabaseOid,
+ 0 /* no need to exclude vacuum */);
+
+ ResolveRecoveryConflictWithVirtualXIDs(old_snapshots,
+ "cleanup redo");
+
+ /*
+ * Remember how far we've cleaned to avoid some checks in the future,
+ * since ResolveRecoveryConflictWithVirtualXIDs() accesses the ProcArray
+ * and is relatively expensive.
+ */
+ localLatestRemovedXid = latestRemovedXid;
+ }
+
+ /*
+ * During recovery we maintain ProcArray with incoming xids
+ * when we first observe them in use. Uses local variables, so
+ * should only be called by Startup process.
+ *
+ * We record all xids that we know have been assigned. That includes
+ * all the xids on the WAL record, plus all unobserved xids that
+ * we can deduce have been assigned. We can deduce the existence of
+ * unobserved xids because we know xids are in sequence, with no gaps.
+ *
+ * XXX Be careful of what happens when we use pg_resetxlogs.
+ */
+ void
+ RecordKnownAssignedTransactionIds(XLogRecPtr lsn, XLogRecord *record)
+ {
+ uint8 info = record->xl_info & ~XLR_INFO_MASK;
+ TransactionId xid,
+ child_xid,
+ top_xid;
+ PGPROC *proc;
+ bool first_seen;
+ bool mark_subtrans = false;
+
+ if (!IsRunningXactDataValid())
+ return;
+
+ /*
+ * If it is an assignment record, we need to extract the data from
+ * the body of the record, rather than taking the header values. This
+ * is because an assignment record can be issued when
+ * GetCurrentTransactionIdIfAny() returns InvalidTransactionId.
+ */
+ if (record->xl_rmid == RM_XACT_ID && info == XLOG_XACT_ASSIGNMENT)
+ {
+ xl_xact_assignment *xlrec = (xl_xact_assignment *) XLogRecGetData(record);
+
+ child_xid = xlrec->xassign;
+ top_xid = xlrec->xparent;
+ }
+ else
+ {
+ child_xid = record->xl_xid;
+ top_xid = record->xl_parentxid;
+ }
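+
+ /*
+ * Normalise: top_xid is always the top-level xid; child_xid remains
+ * valid only when the record was emitted by a subtransaction of it.
+ */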
+ xid = child_xid;
+ if (child_xid == top_xid)
+ child_xid = InvalidTransactionId;
+
+ if (!TransactionIdIsValid(top_xid))
+ return;
+
+ /*
+ elog(trace_recovery(DEBUG4), "RecordKnown xid %d parent %d"
+ " latestObsvXid %d firstXid %s firstSubXid %s markSubtrans %s",
+ xid, parent_xid, latestObservedXid,
+ XLogRecIsFirstXidRecord(record) ? "t" : "f",
+ XLogRecIsFirstSubXidRecord(record) ? "t" : "f",
+ XLogRecMustMarkSubtrans(record) ? "t" : "f");
+ */
+ /*
+ * Identify the recovery proc that holds replay info for this xid.
+ *
+ * XXX: This gets called for every WAL record (with XID). I think we'll
+ * need a faster version of BackendXidGetProc, using a hash table or
+ * something. FWIW, the hash table wouldn't need to be in shared memory,
+ * because the startup process is the only one doing this.
+ */
+ proc = BackendXidGetProc(top_xid);
+
+ elog(trace_recovery(DEBUG4),
+ "start recovery top_xid = %u child_xid = %u lsn = %X/%X",
+ top_xid, child_xid, lsn.xlogid, lsn.xrecoff);
+
+ if (proc == NULL)
+ {
+ proc = InitRecoveryProcess();
+ proc->xid = top_xid;
+ ProcArrayAdd(proc);
+ first_seen = true;
+ }
+ else
+ first_seen = false;
+
+ /*
+ * Currently, we choose to take ProcArrayLock every time. We don't
+ * need to do this for every case, since if we know there are no
+ * UnobservedXids we could just call ProcArrayStartRecoveryTransaction()
+ * without locks, just as is done during normal running. For now, be safe.
+ * See GetNewTransactionId(). XXX this comment needs updating, there's
+ * no ProcArrayStartRecoveryTransaction() anymore, for starters.
+ */
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
+ /*
+ * Use volatile pointer to prevent code rearrangement; other backends
+ * could be examining my subxids info concurrently, and we don't want
+ * them to see an invalid intermediate state, such as incrementing
+ * nxids before filling the array entry. Note we are assuming that
+ * TransactionId and int fetch/store are atomic.
+ *
+ * XXX Is that a concern when we hold ProcArrayLock?
+ */
+ {
+ volatile PGPROC *myproc = proc;
+
+ myproc->lsn = lsn;
+
+ if (TransactionIdIsValid(child_xid))
+ {
+ int nxids = myproc->subxids.nxids;
+
+ if (nxids < PGPROC_MAX_CACHED_SUBXIDS)
+ {
+ /* XXX: Can we assume that subxids are seen in xid order? */
+ if (nxids == 0 || TransactionIdPrecedes(myproc->subxids.xids[nxids - 1], child_xid))
+ {
+ myproc->subxids.xids[nxids] = child_xid;
+ myproc->subxids.nxids = nxids + 1;
+ }
+ }
+ else
+ {
+ myproc->subxids.overflowed = true;
+ mark_subtrans = true;
+ }
+ }
+ }
+
+ /*
+ * When a newly observed xid arrives, it is frequently the case
+ * that it is *not* the next xid in sequence. When this occurs, we
+ * must treat the intervening xids as running also. So we maintain
+ * a special list of these UnobservedXids, so that snapshots can
+ * see the missing xids as in-progress.
+ *
+ * We maintain both recovery Procs *and* UnobservedXids because we
+ * need them both. Recovery procs allow us to store top-level xids
+ * and subtransactions separately, otherwise we wouldn't know
+ * when to overflow the subxid cache. UnobservedXids allow us to
+ * make sense of the out-of-order arrival of xids.
+ *
+ * Some examples:
+ * 1) latestObservedXid = 647
+ * next xid observed in WAL = 651 (a top-level transaction)
+ * so we add 648, 649, 650 to UnobservedXids
+ *
+ * 2) latestObservedXid = 769
+ * next xid observed in WAL = 771 (a subtransaction)
+ * so we add 770 to UnobservedXids
+ *
+ * 3) latestObservedXid = 769
+ * next xid observed in WAL = 810 (a subtransaction)
+ * 810's parent had not yet recorded WAL = 807
+ * so we add 770 thru 809 inclusive to UnobservedXids
+ * then remove 807
+ *
+ * 4) latestObservedXid = 769
+ * next xid observed in WAL = 771 (a subtransaction)
+ * 771's parent had not yet recorded WAL = 770
+ * so do nothing
+ *
+ * 5) latestObservedXid = 7747
+ * next xid observed in WAL = 7748 (a subtransaction)
+ * 7748's parent had not yet recorded WAL = 7742
+ * so we add 7748 and remove 7742
+ */
+
+ /*
+ * Just remember when reading this logic that by definition we have
+ * Assert(TransactionIdPrecedes(parent_xid, xid))
+ */
+
+ /*
+ * Just have one xid to process, so fairly simple
+ */
+
+ for (xid = top_xid; TransactionIdIsValid(xid); xid = child_xid)
+ {
+ TransactionId next_expected_xid = latestObservedXid;
+ TransactionIdAdvance(next_expected_xid);
+
+ if (next_expected_xid == xid)
+ {
+ Assert(!XidInUnobservedTransactions(xid));
+ /* XXX Assert(!XLogRecIsFirstSubXidRecord(record) ||
+ !XidInUnobservedTransactions(top_xid)); */
+ latestObservedXid = xid;
+ }
+ else if (TransactionIdPrecedes(next_expected_xid, xid))
+ {
+ UnobservedTransactionsAddXids(next_expected_xid, xid);
+ latestObservedXid = xid;
+ }
+ else if (first_seen)
+ UnobservedTransactionsRemoveXid(xid, true);
+
+ if (xid == child_xid)
+ break;
+ }
+
+ LWLockRelease(ProcArrayLock);
+
+ /*
+ * Now we've updated the proc we can update subtrans, if appropriate.
+ * We must do this step last to avoid race conditions. See comments
+ * and code for AssignTransactionId().
+ */
+ if (mark_subtrans)
+ {
+ /* Assert(XLogRecIsFirstSubXidRecord(record)); */
+ elog(trace_recovery(DEBUG2),
+ "subtrans setting parent %d for xid %d", top_xid, child_xid);
+ SubTransSetParent(child_xid, top_xid);
+ }
+ }
+
+ /*
* XLOG support routines
*/
+ /*
+ * Before 8.4 this was a fairly short function, but now it performs many
+ * actions for which the order of execution is critical.
+ */
static void
! xact_redo_commit(xl_xact_commit *xlrec, TransactionId xid, bool preparedXact)
{
TransactionId *sub_xids;
TransactionId max_xid;
+ PGPROC *proc;
int i;
/* Make sure nextXid is beyond any XID mentioned in the record */
max_xid = xid;
+ sub_xids = (TransactionId *) &(xlrec->xnodes[xlrec->nrels]);
+
+ /*
+ * Find the highest xid and remove unobserved xids if required.
+ */
for (i = 0; i < xlrec->nsubxacts; i++)
{
if (TransactionIdPrecedes(max_xid, sub_xids[i]))
max_xid = sub_xids[i];
}
+
+ if (InArchiveRecovery)
+ {
+ /*
+ * If we've just observed some new xids on the commit record,
+ * make sure they're visible before we update clog.
+ */
+ if (XactCompletionHasUnMarkedSubxids(xlrec))
+ {
+ if (!IsRunningXactDataValid())
+ latestObservedXid = xid;
+
+ if (TransactionIdPrecedes(latestObservedXid, max_xid))
+ {
+ TransactionId next_expected_xid = latestObservedXid;
+
+ TransactionIdAdvance(next_expected_xid);
+ if (TransactionIdPrecedes(next_expected_xid, max_xid))
+ {
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+ UnobservedTransactionsAddXids(next_expected_xid, max_xid);
+ LWLockRelease(ProcArrayLock);
+ }
+ latestObservedXid = max_xid;
+ }
+ }
+ }
+
+ /* Mark the transaction committed in pg_clog */
+ TransactionIdCommitTree(xid, xlrec->nsubxacts, sub_xids);
+
+ if (InArchiveRecovery && (proc = BackendXidGetProc(xid)) != NULL)
+ {
+ /*
+ * We must mark clog before we update the ProcArray. Only update
+ * if we have already initialised the state and we have previously
+ * added an xid to the proc. We need no lock to check xid since it
+ * is controlled by Startup process. It's possible for xids to
+ * appear that haven't been seen before. We don't need to check
+ * UnobservedXids because in the normal case this will already have
+ * happened, but there are cases where they might sneak through.
+ * Leave these for the periodic cleanup by XLOG_XACT_RUNNING_XACTS records.
+ */
+ if (IsRunningXactDataValid() && !preparedXact)
+ {
+ if (XactCompletionHasUnMarkedSubxids(xlrec))
+ ProcArrayEndTransaction(proc, max_xid, xlrec->nsubxacts, sub_xids);
+ else
+ ProcArrayEndTransaction(proc, max_xid, 0, NULL);
+ ProcArrayRemove(proc, InvalidTransactionId);
+ FreeRecoveryProcess(proc);
+ }
+
+ /*
+ * If requested, update the flat files for DB and Auth Files by
+ * reading the catalog tables. Needs to be the first action taken
+ * after marking transaction complete to minimise race conditions.
+ * This is the opposite way round to the original actions, which
+ * update the files and then mark committed, so there is a race
+ * condition in both places.
+ */
+ if (XactCompletionUpdateDBFile(xlrec))
+ {
+ if (XactCompletionUpdateAuthFile(xlrec))
+ BuildFlatFiles(false);
+ else
+ BuildFlatFiles(true);
+ }
+
+ /*
+ * Send any cache invalidations attached to the commit. We must
+ * maintain the same order of invalidation then release locks
+ * as occurs in RecordTransactionCommit.
+ */
+ if (xlrec->nmsgs > 0)
+ {
+ int offset = OffsetSharedInvalInXactCommit();
+ SharedInvalidationMessage *msgs = (SharedInvalidationMessage *)
+ (((char *) xlrec) + offset);
+
+ SendSharedInvalidMessages(msgs, xlrec->nmsgs);
+ }
+
+ /*
+ * Release locks, if any.
+ */
+ RelationReleaseRecoveryLocks(xid);
+ }
+
+ /* Make sure nextXid is beyond any XID mentioned in the record */
if (TransactionIdFollowsOrEquals(max_xid,
ShmemVariableCache->nextXid))
{
ShmemVariableCache->nextXid = max_xid;
+ ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
TransactionIdAdvance(ShmemVariableCache->nextXid);
}
***************
*** 4260,4287 **** xact_redo_commit(xl_xact_commit *xlrec, TransactionId xid)
}
}
static void
! xact_redo_abort(xl_xact_abort *xlrec, TransactionId xid)
{
TransactionId *sub_xids;
TransactionId max_xid;
int i;
- /* Mark the transaction aborted in pg_clog */
- sub_xids = (TransactionId *) &(xlrec->xnodes[xlrec->nrels]);
- TransactionIdAbortTree(xid, xlrec->nsubxacts, sub_xids);
-
/* Make sure nextXid is beyond any XID mentioned in the record */
max_xid = xid;
for (i = 0; i < xlrec->nsubxacts; i++)
{
if (TransactionIdPrecedes(max_xid, sub_xids[i]))
max_xid = sub_xids[i];
}
if (TransactionIdFollowsOrEquals(max_xid,
ShmemVariableCache->nextXid))
{
ShmemVariableCache->nextXid = max_xid;
TransactionIdAdvance(ShmemVariableCache->nextXid);
}
--- 4868,4963 ----
}
}
+ /*
+ * Be careful with the order of execution, as with xact_redo_commit().
+ * The two functions are similar but differ in key places.
+ */
static void
! xact_redo_abort(xl_xact_abort *xlrec, TransactionId xid, bool preparedXact)
{
+ PGPROC *proc = NULL;
TransactionId *sub_xids;
TransactionId max_xid;
int i;
/* Make sure nextXid is beyond any XID mentioned in the record */
max_xid = xid;
+ sub_xids = (TransactionId *) &(xlrec->xnodes[xlrec->nrels]);
+
+ /*
+ * Find the highest xid and remove unobserved xids if required.
+ */
for (i = 0; i < xlrec->nsubxacts; i++)
{
if (TransactionIdPrecedes(max_xid, sub_xids[i]))
max_xid = sub_xids[i];
}
+
+ if (InArchiveRecovery)
+ {
+ /*
+ * If we've just observed some new xids on the abort record,
+ * make sure they're visible before we update clog.
+ */
+ if (XactCompletionHasUnMarkedSubxids(xlrec))
+ {
+ if (!IsRunningXactDataValid())
+ latestObservedXid = xid;
+
+ if (TransactionIdPrecedes(latestObservedXid, max_xid))
+ {
+ TransactionId next_expected_xid = latestObservedXid;
+
+ TransactionIdAdvance(next_expected_xid);
+ if (TransactionIdPrecedes(next_expected_xid, max_xid))
+ {
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+ UnobservedTransactionsAddXids(next_expected_xid, max_xid);
+ LWLockRelease(ProcArrayLock);
+ }
+ latestObservedXid = max_xid;
+ }
+ }
+ }
+
+ /* Mark the transaction aborted in pg_clog */
+ TransactionIdAbortTree(xid, xlrec->nsubxacts, sub_xids);
+
+ if (InArchiveRecovery && (proc = BackendXidGetProc(xid)) != NULL)
+ {
+ /*
+ * We must mark clog before we update the ProcArray. Only update
+ * if we have already initialised the state and we have previously
+ * added an xid to the proc. We need no lock to check xid since it
+ * is controlled by Startup process. It's possible for xids to
+ * appear that haven't been seen before. We don't need to check
+ * UnobservedXids because in the normal case this will already have
+ * happened, but there are cases where they might sneak through.
+ * Leave these for the periodic cleanup by XLOG_XACT_RUNNING_XACTS records.
+ */
+ if (IsRunningXactDataValid() &&
+ TransactionIdIsValid(proc->xid) && !preparedXact)
+ {
+ if (XactCompletionHasUnMarkedSubxids(xlrec))
+ ProcArrayEndTransaction(proc, max_xid, xlrec->nsubxacts, sub_xids);
+ else
+ ProcArrayEndTransaction(proc, max_xid, 0, NULL);
+ ProcArrayRemove(proc, InvalidTransactionId);
+ FreeRecoveryProcess(proc);
+ }
+
+ /*
+ * Release locks, if any. There are no invalidations to send.
+ */
+ RelationReleaseRecoveryLocks(xid);
+ }
+
+ /* Make sure nextXid is beyond any XID mentioned in the record */
if (TransactionIdFollowsOrEquals(max_xid,
ShmemVariableCache->nextXid))
{
ShmemVariableCache->nextXid = max_xid;
+ ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
TransactionIdAdvance(ShmemVariableCache->nextXid);
}
***************
*** 4312,4324 **** xact_redo(XLogRecPtr lsn, XLogRecord *record)
{
xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
! xact_redo_commit(xlrec, record->xl_xid);
}
else if (info == XLOG_XACT_ABORT)
{
xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
! xact_redo_abort(xlrec, record->xl_xid);
}
else if (info == XLOG_XACT_PREPARE)
{
--- 4988,5004 ----
{
xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
! xact_redo_commit(xlrec, record->xl_xid, false);
}
else if (info == XLOG_XACT_ABORT)
{
xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
! Assert(!XactCompletionUpdateDBFile(xlrec) &&
! !XactCompletionUpdateAuthFile(xlrec) &&
! !XactCompletionRelcacheInitFileInval(xlrec));
!
! xact_redo_abort(xlrec, record->xl_xid, false);
}
else if (info == XLOG_XACT_PREPARE)
{
***************
*** 4330,4345 **** xact_redo(XLogRecPtr lsn, XLogRecord *record)
{
xl_xact_commit_prepared *xlrec = (xl_xact_commit_prepared *) XLogRecGetData(record);
! xact_redo_commit(&xlrec->crec, xlrec->xid);
RemoveTwoPhaseFile(xlrec->xid, false);
}
else if (info == XLOG_XACT_ABORT_PREPARED)
{
xl_xact_abort_prepared *xlrec = (xl_xact_abort_prepared *) XLogRecGetData(record);
! xact_redo_abort(&xlrec->arec, xlrec->xid);
RemoveTwoPhaseFile(xlrec->xid, false);
}
else
elog(PANIC, "xact_redo: unknown op code %u", info);
}
--- 5010,5053 ----
{
xl_xact_commit_prepared *xlrec = (xl_xact_commit_prepared *) XLogRecGetData(record);
! xact_redo_commit(&xlrec->crec, xlrec->xid, true);
RemoveTwoPhaseFile(xlrec->xid, false);
}
else if (info == XLOG_XACT_ABORT_PREPARED)
{
xl_xact_abort_prepared *xlrec = (xl_xact_abort_prepared *) XLogRecGetData(record);
! xact_redo_abort(&xlrec->arec, xlrec->xid, true);
RemoveTwoPhaseFile(xlrec->xid, false);
}
+ else if (info == XLOG_XACT_ASSIGNMENT)
+ {
+ /*
+ * This is a no-op since RecordKnownAssignedTransactionIds()
+ * already did all the work on this record for us.
+ */
+ return;
+ }
+ else if (info == XLOG_XACT_RUNNING_XACTS)
+ {
+ xl_xact_running_xacts *xlrec = (xl_xact_running_xacts *) XLogRecGetData(record);
+
+ /*
+ * Initialise if we have a valid snapshot to work with
+ */
+ if (TransactionIdIsValid(xlrec->latestRunningXid) &&
+ (!IsRunningXactDataValid() ||
+ TransactionIdPrecedes(latestObservedXid, xlrec->latestRunningXid)))
+ {
+ latestObservedXid = xlrec->latestRunningXid;
+ ShmemVariableCache->latestCompletedXid = xlrec->latestCompletedXid;
+ elog(trace_recovery(DEBUG1),
+ "initial snapshot created; latestObservedXid = %d latestCompletedXid = %d",
+ latestObservedXid, xlrec->latestCompletedXid);
+ }
+
+ ProcArrayUpdateRecoveryTransactions(lsn, xlrec);
+ }
else
elog(PANIC, "xact_redo: unknown op code %u", info);
}
***************
*** 4349,4358 **** xact_desc_commit(StringInfo buf, xl_xact_commit *xlrec)
{
int i;
appendStringInfoString(buf, timestamptz_to_str(xlrec->xact_time));
if (xlrec->nrels > 0)
{
! appendStringInfo(buf, "; rels:");
for (i = 0; i < xlrec->nrels; i++)
{
char *path = relpath(xlrec->xnodes[i], MAIN_FORKNUM);
--- 5057,5075 ----
{
int i;
+ if (XactCompletionUpdateDBFile(xlrec))
+ appendStringInfo(buf, "; update db file");
+
+ if (XactCompletionUpdateAuthFile(xlrec))
+ appendStringInfo(buf, "; update auth file");
+
+ if (XactCompletionRelcacheInitFileInval(xlrec))
+ appendStringInfo(buf, "; relcache init file inval");
+
appendStringInfoString(buf, timestamptz_to_str(xlrec->xact_time));
if (xlrec->nrels > 0)
{
! appendStringInfo(buf, "; %d rels:", xlrec->nrels);
for (i = 0; i < xlrec->nrels; i++)
{
char *path = relpath(xlrec->xnodes[i], MAIN_FORKNUM);
***************
*** 4363,4374 **** xact_desc_commit(StringInfo buf, xl_xact_commit *xlrec)
if (xlrec->nsubxacts > 0)
{
TransactionId *xacts = (TransactionId *)
! &xlrec->xnodes[xlrec->nrels];
!
! appendStringInfo(buf, "; subxacts:");
for (i = 0; i < xlrec->nsubxacts; i++)
appendStringInfo(buf, " %u", xacts[i]);
}
}
static void
--- 5080,5113 ----
if (xlrec->nsubxacts > 0)
{
TransactionId *xacts = (TransactionId *)
! &xlrec->xnodes[xlrec->nrels];
! appendStringInfo(buf, "; %d subxacts:", xlrec->nsubxacts);
for (i = 0; i < xlrec->nsubxacts; i++)
appendStringInfo(buf, " %u", xacts[i]);
}
+ if (xlrec->nmsgs > 0)
+ {
+ /*
+ * The invalidation messages are the third variable length array
+ * from the start of the record. The record header has everything
+ * we need to calculate where that starts.
+ */
+ int offset = OffsetSharedInvalInXactCommit();
+ SharedInvalidationMessage *msgs = (SharedInvalidationMessage *)
+ (((char *) xlrec) + offset);
+ appendStringInfo(buf, "; %d inval msgs:", xlrec->nmsgs);
+ for (i = 0; i < xlrec->nmsgs; i++)
+ {
+ SharedInvalidationMessage *msg = msgs + i;
+
+ if (msg->id >= 0)
+ appendStringInfo(buf, "catcache id%d ", msg->id);
+ else if (msg->id == SHAREDINVALRELCACHE_ID)
+ appendStringInfo(buf, "relcache ");
+ else if (msg->id == SHAREDINVALSMGR_ID)
+ appendStringInfo(buf, "smgr ");
+ }
+ }
}
static void
***************
*** 4398,4403 **** xact_desc_abort(StringInfo buf, xl_xact_abort *xlrec)
--- 5137,5179 ----
}
}
+ static void
+ xact_desc_running_xacts(StringInfo buf, xl_xact_running_xacts *xlrec)
+ {
+ int xid_index,
+ subxid_index;
+ TransactionId *subxip = (TransactionId *) &(xlrec->xrun[xlrec->xcnt]);
+
+ appendStringInfo(buf, "nxids %u nsubxids %u latestRunningXid %d",
+ xlrec->xcnt,
+ xlrec->subxcnt,
+ xlrec->latestRunningXid);
+
+ for (xid_index = 0; xid_index < xlrec->xcnt; xid_index++)
+ {
+ RunningXact *rxact = (RunningXact *) xlrec->xrun;
+
+ appendStringInfo(buf, "; xid %d pid %d db %d role %d "
+ "vacflag %u nsubxids %u offset %d overflowed %s",
+ rxact[xid_index].xid,
+ rxact[xid_index].pid,
+ rxact[xid_index].databaseId,
+ rxact[xid_index].roleId,
+ rxact[xid_index].vacuumFlags,
+ rxact[xid_index].nsubxids,
+ rxact[xid_index].subx_offset,
+ (rxact[xid_index].overflowed ? "t" : "f"));
+
+ if (rxact[xid_index].nsubxids > 0)
+ {
+ appendStringInfo(buf, "; subxacts: ");
+ for (subxid_index = 0; subxid_index < rxact[xid_index].nsubxids; subxid_index++)
+ appendStringInfo(buf, " %u",
+ subxip[subxid_index + rxact[xid_index].subx_offset]);
+ }
+ }
+ }
+
void
xact_desc(StringInfo buf, uint8 xl_info, char *rec)
{
***************
*** 4435,4440 **** xact_desc(StringInfo buf, uint8 xl_info, char *rec)
--- 5211,5231 ----
appendStringInfo(buf, "abort %u: ", xlrec->xid);
xact_desc_abort(buf, &xlrec->arec);
}
+ else if (info == XLOG_XACT_ASSIGNMENT)
+ {
+ xl_xact_assignment *xlrec = (xl_xact_assignment *) rec;
+
+ /* ignore the main xid, it may be Invalid and misleading */
+ appendStringInfo(buf, "assignment: xid %u",
+ xlrec->xassign);
+ }
+ else if (info == XLOG_XACT_RUNNING_XACTS)
+ {
+ xl_xact_running_xacts *xlrec = (xl_xact_running_xacts *) rec;
+
+ appendStringInfo(buf, "running xacts: ");
+ xact_desc_running_xacts(buf, xlrec);
+ }
else
appendStringInfo(buf, "UNKNOWN");
}
*** src/backend/access/transam/xlog.c
--- src/backend/access/transam/xlog.c
***************
*** 24,29 ****
--- 24,30 ----
#include "access/clog.h"
#include "access/multixact.h"
+ #include "access/nbtree.h"
#include "access/subtrans.h"
#include "access/transam.h"
#include "access/tuptoaster.h"
***************
*** 43,48 ****
--- 44,50 ----
#include "storage/ipc.h"
#include "storage/pmsignal.h"
#include "storage/procarray.h"
+ #include "storage/sinval.h"
#include "storage/smgr.h"
#include "storage/spin.h"
#include "utils/builtins.h"
***************
*** 50,55 ****
--- 52,58 ----
#include "utils/ps_status.h"
#include "pg_trace.h"
+ #define WAL_DEBUG
/* File path names (all relative to $PGDATA) */
#define BACKUP_LABEL_FILE "backup_label"
***************
*** 69,75 **** bool log_checkpoints = false;
int sync_method = DEFAULT_SYNC_METHOD;
#ifdef WAL_DEBUG
! bool XLOG_DEBUG = false;
#endif
/*
--- 72,80 ----
int sync_method = DEFAULT_SYNC_METHOD;
#ifdef WAL_DEBUG
! bool XLOG_DEBUG_FLUSH = false;
! bool XLOG_DEBUG_BGFLUSH = false;
! bool XLOG_DEBUG_REDO = true;
#endif
/*
***************
*** 114,120 **** CheckpointStatsData CheckpointStats;
/*
* ThisTimeLineID will be same in all backends --- it identifies current
! * WAL timeline for the database system.
*/
TimeLineID ThisTimeLineID = 0;
--- 119,126 ----
/*
* ThisTimeLineID will be same in all backends --- it identifies current
! * WAL timeline for the database system. Zero is never a valid timeline,
! * so we initialize with it to make any errors easier to spot.
*/
TimeLineID ThisTimeLineID = 0;
***************
*** 122,141 **** TimeLineID ThisTimeLineID = 0;
bool InRecovery = false;
/* Are we recovering using offline XLOG archives? */
! static bool InArchiveRecovery = false;
/* Was the last xlog file restored from archive, or local? */
static bool restoredFromArchive = false;
/* options taken from recovery.conf */
static char *recoveryRestoreCommand = NULL;
- static bool recoveryTarget = false;
static bool recoveryTargetExact = false;
static bool recoveryTargetInclusive = true;
static bool recoveryLogRestartpoints = false;
static TransactionId recoveryTargetXid;
static TimestampTz recoveryTargetTime;
static TimestampTz recoveryLastXTime = 0;
/* if recoveryStopsHere returns true, it saves actual stop xid/time here */
static TransactionId recoveryStopXid;
--- 128,171 ----
bool InRecovery = false;
/* Are we recovering using offline XLOG archives? */
! bool InArchiveRecovery = false;
!
! /* Local copy of shared RecoveryProcessingMode state */
! static bool LocalRecoveryProcessingMode = true;
! static bool knownProcessingMode = false;
!
! /* is the database proven consistent yet? */
! bool reachedSafeStartPoint = false;
/* Was the last xlog file restored from archive, or local? */
static bool restoredFromArchive = false;
/* options taken from recovery.conf */
static char *recoveryRestoreCommand = NULL;
static bool recoveryTargetExact = false;
static bool recoveryTargetInclusive = true;
static bool recoveryLogRestartpoints = false;
static TransactionId recoveryTargetXid;
static TimestampTz recoveryTargetTime;
+ static int recoveryTargetAdvance = 0;
+
+ /* recovery target modes */
+ #define RECOVERY_TARGET_NONE 0
+ #define RECOVERY_TARGET_PAUSE_ALL 1
+ #define RECOVERY_TARGET_PAUSE_CLEANUP 2
+ #define RECOVERY_TARGET_PAUSE_XID 3
+ #define RECOVERY_TARGET_PAUSE_TIME 4
+ #define RECOVERY_TARGET_ADVANCE 5
+ #define RECOVERY_TARGET_STOP_IMMEDIATE 6
+ #define RECOVERY_TARGET_STOP_XID 7
+ #define RECOVERY_TARGET_STOP_TIME 8
+ static int recoveryTargetMode = RECOVERY_TARGET_NONE;
+
+ #define DEFAULT_MAX_STANDBY_DELAY 300
+ int maxStandbyDelay = DEFAULT_MAX_STANDBY_DELAY;
+
static TimestampTz recoveryLastXTime = 0;
+ static TransactionId recoveryLastXid = InvalidTransactionId;
/* if recoveryStopsHere returns true, it saves actual stop xid/time here */
static TransactionId recoveryStopXid;
***************
*** 241,250 **** static XLogRecPtr RedoRecPtr;
* ControlFileLock: must be held to read/update control file or create
* new log file.
*
! * CheckpointLock: must be held to do a checkpoint (ensures only one
! * checkpointer at a time; currently, with all checkpoints done by the
! * bgwriter, this is just pro forma).
*
*----------
*/
--- 271,300 ----
* ControlFileLock: must be held to read/update control file or create
* new log file.
*
! * CheckpointLock: must be held to do a checkpoint or restartpoint, ensuring
! * we get just one of those at any time. In 8.4+ recovery, both startup and
! * bgwriter processes may take restartpoints, so this locking must be strict
! * to ensure there are no mistakes.
*
+ * In 8.4 we progress through a number of states at startup. Initially, the
+ * postmaster is in PM_STARTUP state and spawns the Startup process. We then
+ * progress until the database is in a consistent state; then, if
+ * InArchiveRecovery is set, we go into PM_RECOVERY state. The bgwriter then starts
+ * up and takes over responsibility for performing restartpoints. We then
+ * progress until the end of recovery when we enter PM_RUN state upon
+ * termination of the Startup process. In summary:
+ *
+ * PM_STARTUP state: Startup process performs restartpoints
+ * PM_RECOVERY state: bgwriter process performs restartpoints
+ * PM_RUN state: bgwriter process performs checkpoints
+ *
+ * These transitions are fairly delicate, with many things that need to
+ * happen at the same time in order to change state successfully throughout
+ * the system. Changing PM_STARTUP to PM_RECOVERY occurs only when we can
+ * prove the database is in a consistent state. Changing from PM_RECOVERY
+ * to PM_RUN happens whenever recovery ends, which could be forced upon us
+ * externally or it can occur because of damage or termination of the WAL
+ * sequence.
*----------
*/
***************
*** 286,296 **** typedef struct XLogCtlWrite
--- 336,353 ----
/*
* Total shared-memory state for XLOG.
+ *
+ * This small structure is accessed by many backends, so we take care to
+ * pad out the parts of the structure so they can be accessed by separate
+ * CPUs without causing false sharing cache flushes. Padding is generous
+ * to allow for a wide variety of CPU architectures.
*/
+ #define XLOGCTL_BUFFER_SPACING 128
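+
+ /*
+ * Illustrative only (sizes here are assumptions; actual values vary by
+ * platform and build): if sizeof(XLogCtlInsert) were 48 bytes, then
+ * InsertPadding below would be 128 - 48 = 80 bytes, so the fields
+ * protected by info_lck would start in a different 128-byte region and
+ * WAL insertions could not invalidate the cache lines read alongside
+ * LogwrtRqst.
+ */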
typedef struct XLogCtlData
{
/* Protected by WALInsertLock: */
XLogCtlInsert Insert;
+ char InsertPadding[XLOGCTL_BUFFER_SPACING - sizeof(XLogCtlInsert)];
/* Protected by info_lck: */
XLogwrtRqst LogwrtRqst;
***************
*** 298,306 **** typedef struct XLogCtlData
--- 355,370 ----
uint32 ckptXidEpoch; /* nextXID & epoch of latest checkpoint */
TransactionId ckptXid;
XLogRecPtr asyncCommitLSN; /* LSN of newest async commit */
+ /* add data structure padding for above info_lck declarations */
+ char InfoPadding[XLOGCTL_BUFFER_SPACING - sizeof(XLogwrtRqst)
+ - sizeof(XLogwrtResult)
+ - sizeof(uint32)
+ - sizeof(TransactionId)
+ - sizeof(XLogRecPtr)];
/* Protected by WALWriteLock: */
XLogCtlWrite Write;
+ char WritePadding[XLOGCTL_BUFFER_SPACING - sizeof(XLogCtlWrite)];
/*
* These values do not change after startup, although the pointed-to pages
***************
*** 312,317 **** typedef struct XLogCtlData
--- 376,412 ----
int XLogCacheBlck; /* highest allocated xlog buffer index */
TimeLineID ThisTimeLineID;
+ /*
+ * IsRecoveryProcessingMode shows whether the postmaster is in a
+ * postmaster state earlier than PM_RUN, or not. This is a globally
+ * accessible state to allow EXEC_BACKEND case.
+ *
+ * We also retain a local state variable InRecovery. InRecovery=true
+ * means the code is being executed by Startup process and therefore
+ * always during Recovery Processing Mode. This allows us to identify
+ * code executed *during* Recovery Processing Mode but not necessarily
+ * by Startup process itself.
+ *
+ * Protected by mode_lck
+ */
+ bool SharedRecoveryProcessingMode;
+ slock_t mode_lck;
+
+ /*
+ * recovery target control information
+ *
+ * Protected by info_lck
+ */
+ int recoveryTargetMode;
+ TransactionId recoveryTargetXid;
+ TimestampTz recoveryTargetTime;
+ int recoveryTargetAdvance;
+
+ TimestampTz recoveryLastXTime;
+ TransactionId recoveryLastXid;
+
+ char InfoLockPadding[XLOGCTL_BUFFER_SPACING];
+
slock_t info_lck; /* locks shared variables shown above */
} XLogCtlData;
***************
*** 398,405 **** static void XLogArchiveCleanup(const char *xlog);
--- 493,502 ----
static void readRecoveryCommandFile(void);
static void exitArchiveRecovery(TimeLineID endTLI,
uint32 endLogId, uint32 endLogSeg);
+ static void exitRecovery(void);
static bool recoveryStopsHere(XLogRecord *record, bool *includeThis);
static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
+ static XLogRecPtr GetRedoLocationForCheckpoint(void);
static bool XLogCheckBuffer(XLogRecData *rdata, bool doPageWrites,
XLogRecPtr *lsn, BkpBlock *bkpb);
***************
*** 482,487 **** XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
--- 579,592 ----
bool updrqst;
bool doPageWrites;
bool isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH);
+ bool isRecoveryEnd = (rmid == RM_XLOG_ID &&
+ (info == XLOG_RECOVERY_END ||
+ info == XLOG_CHECKPOINT_ONLINE));
+
+ /* cross-check on whether we should be here or not */
+ if (IsRecoveryProcessingMode() && !isRecoveryEnd)
+ elog(FATAL, "cannot make new WAL entries during recovery "
+ "(RMgrId = %d info = %d)", rmid, info);
/* info's high bits are reserved for use by me */
if (info & XLR_INFO_MASK)
***************
*** 820,825 **** begin:;
--- 925,931 ----
record->xl_len = len; /* doesn't include backup blocks */
record->xl_info = info;
record->xl_rmid = rmid;
+ record->xl_parentxid = GetTopTransactionIdIfAny();
/* Now we can finish computing the record's CRC */
COMP_CRC32(rdata_crc, (char *) record + sizeof(pg_crc32),
***************
*** 827,851 **** begin:;
FIN_CRC32(rdata_crc);
record->xl_crc = rdata_crc;
- #ifdef WAL_DEBUG
- if (XLOG_DEBUG)
- {
- StringInfoData buf;
-
- initStringInfo(&buf);
- appendStringInfo(&buf, "INSERT @ %X/%X: ",
- RecPtr.xlogid, RecPtr.xrecoff);
- xlog_outrec(&buf, record);
- if (rdata->data != NULL)
- {
- appendStringInfo(&buf, " - ");
- RmgrTable[record->xl_rmid].rm_desc(&buf, record->xl_info, rdata->data);
- }
- elog(LOG, "%s", buf.data);
- pfree(buf.data);
- }
- #endif
-
/* Record begin of record in appropriate places */
ProcLastRecPtr = RecPtr;
Insert->PrevRecord = RecPtr;
--- 933,938 ----
***************
*** 1728,1735 **** XLogFlush(XLogRecPtr record)
XLogRecPtr WriteRqstPtr;
XLogwrtRqst WriteRqst;
! /* Disabled during REDO */
! if (InRedo)
return;
/* Quick exit if already known flushed */
--- 1815,1821 ----
XLogRecPtr WriteRqstPtr;
XLogwrtRqst WriteRqst;
! if (IsRecoveryProcessingMode())
return;
/* Quick exit if already known flushed */
***************
*** 1737,1743 **** XLogFlush(XLogRecPtr record)
return;
#ifdef WAL_DEBUG
! if (XLOG_DEBUG)
elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X",
record.xlogid, record.xrecoff,
LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
--- 1823,1829 ----
return;
#ifdef WAL_DEBUG
! if (XLOG_DEBUG_FLUSH)
elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X",
record.xlogid, record.xrecoff,
LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
***************
*** 1817,1825 **** XLogFlush(XLogRecPtr record)
* the bad page is encountered again during recovery then we would be
* unable to restart the database at all! (This scenario has actually
* happened in the field several times with 7.1 releases. Note that we
! * cannot get here while InRedo is true, but if the bad page is brought in
! * and marked dirty during recovery then CreateCheckPoint will try to
! * flush it at the end of recovery.)
*
* The current approach is to ERROR under normal conditions, but only
* WARNING during recovery, so that the system can be brought up even if
--- 1903,1911 ----
* the bad page is encountered again during recovery then we would be
* unable to restart the database at all! (This scenario has actually
* happened in the field several times with 7.1 releases. Note that we
! * cannot get here while IsRecoveryProcessingMode() is true, but if the bad
! * page is brought in and marked dirty during recovery, a checkpoint
! * performed at the end of recovery will try to flush it.
*
* The current approach is to ERROR under normal conditions, but only
* WARNING during recovery, so that the system can be brought up even if
***************
*** 1829,1835 **** XLogFlush(XLogRecPtr record)
* and so we will not force a restart for a bad LSN on a data page.
*/
if (XLByteLT(LogwrtResult.Flush, record))
! elog(InRecovery ? WARNING : ERROR,
"xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
record.xlogid, record.xrecoff,
LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
--- 1915,1921 ----
* and so we will not force a restart for a bad LSN on a data page.
*/
if (XLByteLT(LogwrtResult.Flush, record))
! elog(ERROR,
"xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
record.xlogid, record.xrecoff,
LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
***************
*** 1887,1893 **** XLogBackgroundFlush(void)
return;
#ifdef WAL_DEBUG
! if (XLOG_DEBUG)
elog(LOG, "xlog bg flush request %X/%X; write %X/%X; flush %X/%X",
WriteRqstPtr.xlogid, WriteRqstPtr.xrecoff,
LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
--- 1973,1979 ----
return;
#ifdef WAL_DEBUG
! if (XLOG_DEBUG_BGFLUSH)
elog(LOG, "xlog bg flush request %X/%X; write %X/%X; flush %X/%X",
WriteRqstPtr.xlogid, WriteRqstPtr.xrecoff,
LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
***************
*** 2102,2108 **** XLogFileInit(uint32 log, uint32 seg,
unlink(tmppath);
}
! elog(DEBUG2, "done creating and filling new WAL file");
/* Set flag to tell caller there was no existent file */
*use_existent = false;
--- 2188,2195 ----
unlink(tmppath);
}
! XLogFileName(tmppath, ThisTimeLineID, log, seg);
! elog(DEBUG2, "done creating and filling new WAL file %s", tmppath);
/* Set flag to tell caller there was no existent file */
*use_existent = false;
***************
*** 2408,2413 **** XLogFileRead(uint32 log, uint32 seg, int emode)
--- 2495,2522 ----
xlogfname);
set_ps_display(activitymsg, false);
+ /*
+ * Calculate and write out a new safeStartPoint. This defines
+ * the latest LSN that might appear on-disk while we apply
+ * the WAL records in this file. If we crash during recovery
+ * we must reach this point again before we can prove
+ * database consistency. Not a restartpoint! Restart points
+ * define where we should start recovery from, if we crash.
+ */
+ if (InArchiveRecovery)
+ {
+ uint32 nextLog = log;
+ uint32 nextSeg = seg;
+
+ NextLogSeg(nextLog, nextSeg);
+
+ LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+ ControlFile->minSafeStartPoint.xlogid = nextLog;
+ ControlFile->minSafeStartPoint.xrecoff = nextSeg * XLogSegSize;
+ UpdateControlFile();
+ LWLockRelease(ControlFileLock);
+ }
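+ /*
+ * Worked example (hypothetical numbers): when about to apply log 0,
+ * segment 4, minSafeStartPoint becomes {0, 5 * XLogSegSize}, the
+ * first byte of the next segment, since no page flushed while
+ * replaying segment 4 can carry an LSN beyond that point.
+ */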
+
return fd;
}
if (errno != ENOENT) /* unexpected failure? */
***************
*** 2920,2925 **** CleanupBackupHistory(void)
--- 3029,3132 ----
FreeDir(xldir);
}
+ static void
+ ResolveRedoVisibilityConflicts(XLogRecPtr lsn, XLogRecord *record)
+ {
+ Oid recDatabaseOid = 0;
+ TransactionId latestRemovedXid = 0;
+
+ RmgrId rmid = record->xl_rmid;
+ uint8 info = record->xl_info & ~XLR_INFO_MASK;
+
+ if (rmid == RM_HEAP2_ID &&
+ (info == XLOG_HEAP2_CLEAN || info == XLOG_HEAP2_CLEAN_MOVE ))
+ {
+ xl_heap_clean *xlrec = (xl_heap_clean *) XLogRecGetData(record);
+
+ latestRemovedXid = xlrec->latestRemovedXid;
+ recDatabaseOid = xlrec->node.dbNode;
+ }
+ else if (rmid == RM_HEAP2_ID && info == XLOG_HEAP2_CLEANUP_INFO)
+ {
+ xl_heap_cleanup_info *xlrec = (xl_heap_cleanup_info *) XLogRecGetData(record);
+
+ latestRemovedXid = xlrec->latestRemovedXid;
+ recDatabaseOid = xlrec->node.dbNode;
+ }
+ else if (rmid == RM_BTREE_ID && info == XLOG_BTREE_DELETE)
+ {
+ xl_btree_delete *xlrec = (xl_btree_delete *) XLogRecGetData(record);
+
+ latestRemovedXid = xlrec->latestRemovedXid;
+ recDatabaseOid = xlrec->node.dbNode;
+ }
+ else if (rmid == RM_BTREE_ID && info == XLOG_BTREE_VACUUM)
+ {
+ /*
+ * This action never conflicts with queries. Although we have to
+ * use cleanup locks to apply changes made by this record type, we
+ * are only removing tuples that have xids equal to or prior to the
+ * latestRemovedXid of a prior RM_HEAP2_ID record. That is the main
+ * purpose of a XLOG_HEAP2_CLEANUP_INFO record during lazy vacuum.
+ * VACUUM FULL will always have seen a higher latestRemovedXid via
+ * the other record types. So this record is always a no-op here.
+ */
+ return;
+ }
+ else
+ elog(FATAL, "unrecognised cleanup record");
+
+ XactResolveRecoveryConflicts(latestRemovedXid, recDatabaseOid);
+ }
+
+ /*
+ * RecordIsCleanupRecord() determines whether or not the record
+ * will remove rows from data blocks. This is important because
+ * applying these records could affect the validity of MVCC snapshots,
+ * so there are various controls over replaying such records.
+ */
+ static bool
+ RecordIsCleanupRecord(XLogRecord *record)
+ {
+ RmgrId rmid = record->xl_rmid;
+ uint8 info = record->xl_info & ~XLR_INFO_MASK;
+
+ /*
+ * XXX should we implement this as an additional RMgr API call?
+ * We shouldn't presume we know which Rmgrs have cleanup records,
+ * which we do by including access/nbtree.h and calling an Rmgr
+ * specific function directly by name here.
+ */
+ if ((rmid == RM_HEAP2_ID) ||
+ (rmid == RM_BTREE_ID && btree_is_cleanup_record(info)))
+ return true;
+
+ return false;
+ }
+
+ /*
+ * RecordNeedsCleanupLock() determines whether or not the record
+ * requires a cleanup lock when removing rows from data blocks.
+ */
+ static bool
+ RecordNeedsCleanupLock(XLogRecord *record)
+ {
+ RmgrId rmid = record->xl_rmid;
+ uint8 info = record->xl_info & ~XLR_INFO_MASK;
+
+ /*
+ * XXX should we implement this as an additional RMgr API call?
+ * We shouldn't presume we know which Rmgrs need cleanup locks,
+ * which we do by including access/nbtree.h and calling an Rmgr
+ * specific function directly by name here.
+ */
+ if ((rmid == RM_HEAP2_ID) ||
+ (rmid == RM_BTREE_ID && btree_needs_cleanup_lock(info)))
+ return true;
+
+ return false;
+ }
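+
+ /*
+ * Usage sketch (mirroring the call sites added elsewhere in this patch):
+ * redo first asks RecordIsCleanupRecord() to decide whether conflict
+ * resolution against standby queries is needed, then
+ * RecordNeedsCleanupLock() to choose the buffer lock strength:
+ *
+ * if (reachedSafeStartPoint && RecordIsCleanupRecord(record))
+ * ResolveRedoVisibilityConflicts(EndRecPtr, record);
+ *
+ * lockmode = RecordNeedsCleanupLock(record) ?
+ * BUFFER_LOCK_CLEANUP : BUFFER_LOCK_EXCLUSIVE;
+ */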
+
/*
* Restore the backup blocks present in an XLOG record, if any.
*
***************
*** 2942,2947 **** RestoreBkpBlocks(XLogRecord *record, XLogRecPtr lsn)
--- 3149,3163 ----
BkpBlock bkpb;
char *blk;
int i;
+ int lockmode;
+
+ /*
+ * What kind of lock do we need to apply the backup blocks?
+ */
+ if (RecordNeedsCleanupLock(record))
+ lockmode = BUFFER_LOCK_CLEANUP;
+ else
+ lockmode = BUFFER_LOCK_EXCLUSIVE;
blk = (char *) XLogRecGetData(record) + record->xl_len;
for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
***************
*** 2953,2959 **** RestoreBkpBlocks(XLogRecord *record, XLogRecPtr lsn)
blk += sizeof(BkpBlock);
buffer = XLogReadBufferExtended(bkpb.node, bkpb.fork, bkpb.block,
! RBM_ZERO);
Assert(BufferIsValid(buffer));
page = (Page) BufferGetPage(buffer);
--- 3169,3175 ----
blk += sizeof(BkpBlock);
buffer = XLogReadBufferExtended(bkpb.node, bkpb.fork, bkpb.block,
! RBM_ZERO, lockmode);
Assert(BufferIsValid(buffer));
page = (Page) BufferGetPage(buffer);
***************
*** 4283,4288 **** XLOGShmemInit(void)
--- 4499,4505 ----
XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
XLogCtl->Insert.currpage = (XLogPageHeader) (XLogCtl->pages);
SpinLockInit(&XLogCtl->info_lck);
+ SpinLockInit(&XLogCtl->mode_lck);
/*
* If we are not in bootstrap mode, pg_control should already exist. Read
***************
*** 4366,4371 **** BootStrapXLOG(void)
--- 4583,4589 ----
record->xl_prev.xlogid = 0;
record->xl_prev.xrecoff = 0;
record->xl_xid = InvalidTransactionId;
+ record->xl_parentxid = InvalidTransactionId;
record->xl_tot_len = SizeOfXLogRecord + sizeof(checkPoint);
record->xl_len = sizeof(checkPoint);
record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
***************
*** 4549,4555 **** readRecoveryCommandFile(void)
ereport(LOG,
(errmsg("recovery_target_xid = %u",
recoveryTargetXid)));
! recoveryTarget = true;
recoveryTargetExact = true;
}
else if (strcmp(tok1, "recovery_target_time") == 0)
--- 4767,4773 ----
ereport(LOG,
(errmsg("recovery_target_xid = %u",
recoveryTargetXid)));
! recoveryTargetMode = RECOVERY_TARGET_STOP_XID;
recoveryTargetExact = true;
}
else if (strcmp(tok1, "recovery_target_time") == 0)
***************
*** 4560,4566 **** readRecoveryCommandFile(void)
*/
if (recoveryTargetExact)
continue;
! recoveryTarget = true;
recoveryTargetExact = false;
/*
--- 4778,4784 ----
*/
if (recoveryTargetExact)
continue;
! recoveryTargetMode = RECOVERY_TARGET_STOP_TIME;
recoveryTargetExact = false;
/*
***************
*** 4599,4604 **** readRecoveryCommandFile(void)
--- 4817,4842 ----
ereport(LOG,
(errmsg("log_restartpoints = %s", tok2)));
}
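+ /*
+ * Illustrative recovery.conf entry for the option handled below
+ * (the value shown is an example only):
+ *
+ * max_standby_delay = '300' # seconds; 0 means wait forever
+ */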
+ else if (strcmp(tok1, "max_standby_delay") == 0)
+ {
+ errno = 0;
+ maxStandbyDelay = (int) strtoul(tok2, NULL, 0);
+ if (errno == EINVAL || errno == ERANGE)
+ ereport(FATAL,
+ (errmsg("max_standby_delay is not a valid number: \"%s\"",
+ tok2)));
+ /*
+ * 2E6 seconds is about 23 days, and keeps the delay expressible in
+ * milliseconds within an int.
+ */
+ if (maxStandbyDelay > 2000000 || maxStandbyDelay < 0)
+ ereport(FATAL,
+ (errmsg("max_standby_delay must be between 0 (wait forever) and 2000000 secs")));
+
+ ereport(LOG,
+ (errmsg("max_standby_delay = %u",
+ maxStandbyDelay)));
+ }
else
ereport(FATAL,
(errmsg("unrecognized recovery parameter \"%s\"",
***************
*** 4733,4755 **** exitArchiveRecovery(TimeLineID endTLI, uint32 endLogId, uint32 endLogSeg)
unlink(recoveryPath); /* ignore any error */
/*
! * Rename the config file out of the way, so that we don't accidentally
! * re-enter archive recovery mode in a subsequent crash.
*/
- unlink(RECOVERY_COMMAND_DONE);
- if (rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE) != 0)
- ereport(FATAL,
- (errcode_for_file_access(),
- errmsg("could not rename file \"%s\" to \"%s\": %m",
- RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE)));
ereport(LOG,
(errmsg("archive recovery complete")));
}
/*
! * For point-in-time recovery, this function decides whether we want to
! * stop applying the XLOG at or after the current record.
*
* Returns TRUE if we are stopping, FALSE otherwise. On TRUE return,
* *includeThis is set TRUE if we should apply this record before stopping.
--- 4971,5027 ----
unlink(recoveryPath); /* ignore any error */
/*
! * As of 8.4 we no longer rename the recovery.conf file out of the
! * way until after we have performed a full checkpoint. This ensures
! * that any crash between now and the end of the checkpoint does not
! * attempt to restart from a WAL file that is no longer available to us.
! * As soon as we remove recovery.conf we lose our restore_command and
! * can no longer access WAL files from the archive.
*/
ereport(LOG,
(errmsg("archive recovery complete")));
}
+ #ifdef DEBUG_RECOVERY_CONTROL
+ static void
+ LogRecoveryTargetModeInfo(void)
+ {
+ int lrecoveryTargetMode;
+ TransactionId lrecoveryTargetXid;
+ TimestampTz lrecoveryTargetTime;
+ int lrecoveryTargetAdvance;
+
+ TimestampTz lrecoveryLastXTime;
+ TransactionId lrecoveryLastXid;
+
+ {
+ /* use volatile pointer to prevent code rearrangement */
+ volatile XLogCtlData *xlogctl = XLogCtl;
+
+ SpinLockAcquire(&xlogctl->info_lck);
+
+ lrecoveryTargetMode = xlogctl->recoveryTargetMode;
+ lrecoveryTargetXid = xlogctl->recoveryTargetXid;
+ lrecoveryTargetTime = xlogctl->recoveryTargetTime;
+ lrecoveryTargetAdvance = xlogctl->recoveryTargetAdvance;
+ lrecoveryLastXTime = xlogctl->recoveryLastXTime;
+ lrecoveryLastXid = xlogctl->recoveryLastXid;
+
+ SpinLockRelease(&xlogctl->info_lck);
+ }
+
+ elog(LOG, "mode %d xid %u time %s adv %d",
+ lrecoveryTargetMode,
+ lrecoveryTargetXid,
+ timestamptz_to_str(lrecoveryTargetTime),
+ lrecoveryTargetAdvance);
+ }
+ #endif
+
/*
! * For archive recovery, this function decides whether we want to
! * pause or stop applying the XLOG at or after the current record.
*
* Returns TRUE if we are stopping, FALSE otherwise. On TRUE return,
* *includeThis is set TRUE if we should apply this record before stopping.
***************
*** 4762,4833 **** exitArchiveRecovery(TimeLineID endTLI, uint32 endLogId, uint32 endLogSeg)
static bool
recoveryStopsHere(XLogRecord *record, bool *includeThis)
{
! bool stopsHere;
! uint8 record_info;
! TimestampTz recordXtime;
!
/* We only consider stopping at COMMIT or ABORT records */
! if (record->xl_rmid != RM_XACT_ID)
! return false;
! record_info = record->xl_info & ~XLR_INFO_MASK;
! if (record_info == XLOG_XACT_COMMIT)
{
! xl_xact_commit *recordXactCommitData;
! recordXactCommitData = (xl_xact_commit *) XLogRecGetData(record);
! recordXtime = recordXactCommitData->xact_time;
! }
! else if (record_info == XLOG_XACT_ABORT)
! {
! xl_xact_abort *recordXactAbortData;
! recordXactAbortData = (xl_xact_abort *) XLogRecGetData(record);
! recordXtime = recordXactAbortData->xact_time;
! }
! else
! return false;
! /* Do we have a PITR target at all? */
! if (!recoveryTarget)
! {
! recoveryLastXTime = recordXtime;
! return false;
}
! if (recoveryTargetExact)
{
/*
! * there can be only one transaction end record with this exact
! * transactionid
! *
! * when testing for an xid, we MUST test for equality only, since
! * transactions are numbered in the order they start, not the order
! * they complete. A higher numbered xid will complete before you about
! * 50% of the time...
*/
! stopsHere = (record->xl_xid == recoveryTargetXid);
! if (stopsHere)
! *includeThis = recoveryTargetInclusive;
! }
! else
! {
/*
! * there can be many transactions that share the same commit time, so
! * we stop after the last one, if we are inclusive, or stop at the
! * first one if we are exclusive
*/
! if (recoveryTargetInclusive)
! stopsHere = (recordXtime > recoveryTargetTime);
! else
! stopsHere = (recordXtime >= recoveryTargetTime);
! if (stopsHere)
! *includeThis = false;
}
if (stopsHere)
{
recoveryStopXid = record->xl_xid;
! recoveryStopTime = recordXtime;
recoveryStopAfter = *includeThis;
if (record_info == XLOG_XACT_COMMIT)
--- 5034,5276 ----
static bool
recoveryStopsHere(XLogRecord *record, bool *includeThis)
{
! bool stopsHere = false;
! bool pauseHere = false;
! bool paused = false;
! uint8 record_info = 0; /* valid iff (is_xact_completion_record) */
! TimestampTz recordXtime = 0;
! bool is_xact_completion_record = false;
!
/* We only consider stopping at COMMIT or ABORT records */
! if (record->xl_rmid == RM_XACT_ID)
{
! record_info = record->xl_info & ~XLR_INFO_MASK;
! if (record_info == XLOG_XACT_COMMIT)
! {
! xl_xact_commit *recordXactCommitData;
! recordXactCommitData = (xl_xact_commit *) XLogRecGetData(record);
! recordXtime = recordXactCommitData->xact_time;
! is_xact_completion_record = true;
! }
! else if (record_info == XLOG_XACT_ABORT)
! {
! xl_xact_abort *recordXactAbortData;
! recordXactAbortData = (xl_xact_abort *) XLogRecGetData(record);
! recordXtime = recordXactAbortData->xact_time;
! is_xact_completion_record = true;
! }
! /* Remember the most recent COMMIT/ABORT time for logging purposes */
! if (is_xact_completion_record)
! {
! recoveryLastXTime = recordXtime;
! recoveryLastXid = record->xl_xid;
! }
}
! do
{
+ int prevRecoveryTargetMode = recoveryTargetMode;
+
/*
! * Let's see if user has updated our recoveryTargetMode.
*/
! {
! /* use volatile pointer to prevent code rearrangement */
! volatile XLogCtlData *xlogctl = XLogCtl;
!
! SpinLockAcquire(&xlogctl->info_lck);
! recoveryTargetMode = xlogctl->recoveryTargetMode;
! if (recoveryTargetMode != RECOVERY_TARGET_NONE)
! {
! recoveryTargetXid = xlogctl->recoveryTargetXid;
! recoveryTargetTime = xlogctl->recoveryTargetTime;
! recoveryTargetAdvance = xlogctl->recoveryTargetAdvance;
! }
! if (is_xact_completion_record)
! {
! xlogctl->recoveryLastXTime = recordXtime;
! xlogctl->recoveryLastXid = record->xl_xid;
! }
! SpinLockRelease(&xlogctl->info_lck);
! }
!
! /*
! * If we're paused and the mode has changed, reset so that the new
! * settings can take effect and perhaps allow us to continue.
! */
! if (paused && prevRecoveryTargetMode != recoveryTargetMode)
! paused = false;
!
! /* Decide how to act on any pause target */
! switch (recoveryTargetMode)
! {
! case RECOVERY_TARGET_NONE:
! /*
! * If we aren't paused and we're not looking to stop,
! * just exit out quickly and get on with recovery.
! */
! if (paused)
! ereport(LOG,
! (errmsg("recovery restarting")));
! return false;
!
! case RECOVERY_TARGET_PAUSE_ALL:
! pauseHere = true;
! break;
!
! case RECOVERY_TARGET_ADVANCE:
! if (paused)
! {
! if (recoveryTargetAdvance > 0)
! return false;
! }
! else
! {
! /* use volatile pointer to prevent code rearrangement */
! volatile XLogCtlData *xlogctl = XLogCtl;
!
! /*
! * Decrement the shared counter, one WAL record per call,
! * so the target survives the re-read at the top of the
! * loop; pause once it is exhausted.
! */
! SpinLockAcquire(&xlogctl->info_lck);
! if (xlogctl->recoveryTargetAdvance-- <= 0)
! pauseHere = true;
! SpinLockRelease(&xlogctl->info_lck);
! }
! break;
!
! case RECOVERY_TARGET_STOP_IMMEDIATE:
! case RECOVERY_TARGET_STOP_XID:
! case RECOVERY_TARGET_STOP_TIME:
! paused = false;
! break;
!
! case RECOVERY_TARGET_PAUSE_CLEANUP:
! /*
! * Advance until we see a cleanup record, then pause.
! */
! if (RecordIsCleanupRecord(record))
! pauseHere = true;
! break;
!
! case RECOVERY_TARGET_PAUSE_XID:
! /*
! * there can be only one transaction end record with this exact
! * transactionid
! *
! * when testing for an xid, we MUST test for equality only, since
! * transactions are numbered in the order they start, not the order
! * they complete. A higher numbered xid will complete before you about
! * 50% of the time...
! */
! if (is_xact_completion_record)
! pauseHere = (record->xl_xid == recoveryTargetXid);
! break;
!
! case RECOVERY_TARGET_PAUSE_TIME:
! /*
! * there can be many transactions that share the same commit time, so
! * we pause after the last one, if we are inclusive, or pause at the
! * first one if we are exclusive
! */
! if (is_xact_completion_record)
! {
! if (recoveryTargetInclusive)
! pauseHere = (recoveryLastXTime > recoveryTargetTime);
! else
! pauseHere = (recoveryLastXTime >= recoveryTargetTime);
! }
! break;
!
! default:
! ereport(WARNING,
! (errmsg("unknown recovery mode %d, continuing recovery",
! recoveryTargetMode)));
! return false;
! }
!
! if (pauseHere && !paused)
! {
! if (is_xact_completion_record)
! {
! if (record_info == XLOG_XACT_COMMIT)
! ereport(LOG,
! (errmsg("recovery pausing before commit of transaction %u, time %s",
! record->xl_xid,
! timestamptz_to_str(recoveryLastXTime))));
! else
! ereport(LOG,
! (errmsg("recovery pausing before abort of transaction %u, time %s",
! record->xl_xid,
! timestamptz_to_str(recoveryLastXTime))));
! }
! else
! ereport(LOG,
! (errmsg("recovery pausing; last completed transaction %u, time %s",
! recoveryLastXid,
! timestamptz_to_str(recoveryLastXTime))));
!
! set_ps_display("recovery paused", false);
!
! paused = true;
! }
!
/*
! * Pause for a while before rechecking mode at top of loop.
*/
! if (paused)
! pg_usleep(200000L);
!
! /*
! * We leave the loop at the bottom only if our recovery mode is
! * set (or has been recently reset) to one of the stop options.
! */
! } while (paused);
!
! /*
! * Decide how to act if stop target mode set. We run this separately from
! * pause to allow user to reset their stop target while paused.
! */
! switch (recoveryTargetMode)
! {
! case RECOVERY_TARGET_STOP_IMMEDIATE:
! ereport(LOG,
! (errmsg("recovery stopping immediately")));
! return true;
!
! case RECOVERY_TARGET_STOP_XID:
! /*
! * there can be only one transaction end record with this exact
! * transactionid
! *
! * when testing for an xid, we MUST test for equality only, since
! * transactions are numbered in the order they start, not the order
! * they complete. A higher numbered xid will complete before you about
! * 50% of the time...
! */
! if (is_xact_completion_record)
! {
! stopsHere = (record->xl_xid == recoveryTargetXid);
! if (stopsHere)
! *includeThis = recoveryTargetInclusive;
! }
! break;
!
! case RECOVERY_TARGET_STOP_TIME:
! /*
! * there can be many transactions that share the same commit time, so
! * we stop after the last one, if we are inclusive, or stop at the
! * first one if we are exclusive
! */
! if (is_xact_completion_record)
! {
! if (recoveryTargetInclusive)
! stopsHere = (recoveryLastXTime > recoveryTargetTime);
! else
! stopsHere = (recoveryLastXTime >= recoveryTargetTime);
! if (stopsHere)
! *includeThis = false;
! }
! break;
}
if (stopsHere)
{
+ Assert(is_xact_completion_record);
recoveryStopXid = record->xl_xid;
! recoveryStopTime = recoveryLastXTime;
recoveryStopAfter = *includeThis;
if (record_info == XLOG_XACT_COMMIT)
***************
*** 4856,4869 **** recoveryStopsHere(XLogRecord *record, bool *includeThis)
recoveryStopXid,
timestamptz_to_str(recoveryStopTime))));
}
! if (recoveryStopAfter)
! recoveryLastXTime = recordXtime;
}
else
! recoveryLastXTime = recordXtime;
! return stopsHere;
}
/*
--- 5299,5496 ----
recoveryStopXid,
timestamptz_to_str(recoveryStopTime))));
}
+ }
! return stopsHere;
! }
!
! /*
! * Utility function used by various user functions to set the recovery
! * target mode. This allows user control over the progress of recovery.
! */
! static void
! SetRecoveryTargetMode(int mode, TransactionId xid, TimestampTz ts, int advance)
! {
! if (!superuser())
! ereport(ERROR,
! (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
! errmsg("must be superuser to control recovery")));
!
! if (!IsRecoveryProcessingMode())
! ereport(ERROR,
! (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
! errmsg("recovery is not in progress"),
! errhint("WAL control functions can only be executed during recovery.")));
!
! {
! /* use volatile pointer to prevent code rearrangement */
! volatile XLogCtlData *xlogctl = XLogCtl;
!
! SpinLockAcquire(&xlogctl->info_lck);
! xlogctl->recoveryTargetMode = mode;
!
! if (mode == RECOVERY_TARGET_STOP_XID ||
! mode == RECOVERY_TARGET_PAUSE_XID)
! xlogctl->recoveryTargetXid = xid;
! else if (mode == RECOVERY_TARGET_STOP_TIME ||
! mode == RECOVERY_TARGET_PAUSE_TIME)
! xlogctl->recoveryTargetTime = ts;
! else if (mode == RECOVERY_TARGET_ADVANCE)
! xlogctl->recoveryTargetAdvance = advance;
!
! SpinLockRelease(&xlogctl->info_lck);
}
+
+ return;
+ }
+
+ /*
+ * Resets the recovery target mode, so paused recovery continues.
+ * Returns void.
+ */
+ Datum
+ pg_recovery_continue(PG_FUNCTION_ARGS)
+ {
+ SetRecoveryTargetMode(RECOVERY_TARGET_NONE, InvalidTransactionId, 0, 0);
+
+ PG_RETURN_VOID();
+ }
+
+ /*
+ * Pause recovery immediately. Stays paused until asked to play again.
+ * Returns void.
+ */
+ Datum
+ pg_recovery_pause(PG_FUNCTION_ARGS)
+ {
+ SetRecoveryTargetMode(RECOVERY_TARGET_PAUSE_ALL, InvalidTransactionId, 0, 0);
+
+ PG_RETURN_VOID();
+ }
+
+ /*
+ * Pause recovery at the next cleanup record. Stays paused until asked to
+ * play again.
+ */
+ Datum
+ pg_recovery_pause_cleanup(PG_FUNCTION_ARGS)
+ {
+ SetRecoveryTargetMode(RECOVERY_TARGET_PAUSE_CLEANUP, InvalidTransactionId, 0, 0);
+
+ PG_RETURN_VOID();
+ }
+
+ /*
+ * Pause recovery at stated xid, if ever seen. Once paused, stays paused
+ * until asked to play again.
+ */
+ Datum
+ pg_recovery_pause_xid(PG_FUNCTION_ARGS)
+ {
+ int xidi = PG_GETARG_INT32(0);
+ TransactionId xid = (TransactionId) xidi;
+
+ if (xid < FirstNormalTransactionId)
+ elog(ERROR, "cannot specify special values for transaction id");
+
+ SetRecoveryTargetMode(RECOVERY_TARGET_PAUSE_XID, xid, 0, 0);
+
+ PG_RETURN_VOID();
+ }
+
+ /*
+ * Pause recovery at stated timestamp, if ever reached. Once paused, stays paused
+ * until asked to play again.
+ */
+ Datum
+ pg_recovery_pause_time(PG_FUNCTION_ARGS)
+ {
+ TimestampTz ts = PG_GETARG_TIMESTAMPTZ(0);
+
+ SetRecoveryTargetMode(RECOVERY_TARGET_PAUSE_TIME, InvalidTransactionId, ts, 0);
+
+ PG_RETURN_VOID();
+ }
+
+ /*
+ * If paused, advance N records.
+ */
+ Datum
+ pg_recovery_advance(PG_FUNCTION_ARGS)
+ {
+ int adv = PG_GETARG_INT32(0);
+
+ if (adv < 1)
+ elog(ERROR, "recovery advance must be greater than or equal to 1");
+
+ SetRecoveryTargetMode(RECOVERY_TARGET_ADVANCE, InvalidTransactionId, 0, adv);
+
+ PG_RETURN_VOID();
+ }
+
+ /*
+ * Forces recovery to stop now if paused, or at end of next record if playing.
+ */
+ Datum
+ pg_recovery_stop(PG_FUNCTION_ARGS)
+ {
+ SetRecoveryTargetMode(RECOVERY_TARGET_STOP_IMMEDIATE, InvalidTransactionId, 0, 0);
+
+ PG_RETURN_VOID();
+ }
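+
+ /*
+ * Example session (hypothetical), run by a superuser while the server
+ * is still in recovery:
+ *
+ * SELECT pg_recovery_pause(); -- freeze replay
+ * SELECT pg_recovery_advance(10); -- replay 10 more records, then pause
+ * SELECT pg_recovery_continue(); -- resume normal replay
+ * SELECT pg_recovery_stop(); -- end recovery, begin normal running
+ */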
+
+ /*
+ * Returns bool with current recovery mode
+ */
+ Datum
+ pg_is_in_recovery(PG_FUNCTION_ARGS)
+ {
+ PG_RETURN_BOOL(IsRecoveryProcessingMode());
+ }
+
+ /*
+ * Returns timestamp of last completed transaction
+ */
+ Datum
+ pg_last_completed_xact_timestamp(PG_FUNCTION_ARGS)
+ {
+ PG_RETURN_TIMESTAMPTZ(recoveryLastXTime);
+ }
+
+ /*
+ * Returns delay in milliseconds, or -1 if delay too large
+ */
+ int
+ GetLatestReplicationDelay(void)
+ {
+ long delay_secs;
+ int delay_usecs;
+ int delay;
+ TimestampTz currTz = GetCurrentTimestamp();
+
+ TimestampDifference(recoveryLastXTime, currTz,
+ &delay_secs, &delay_usecs);
+
+ /*
+ * If delay is very large we probably aren't looking at
+ * a replication situation at all, just a recovery from backup.
+ * So return a special value instead.
+ */
+ if (delay_secs > (long)(INT_MAX / 1000))
+ delay = -1;
else
! delay = (int)(delay_secs * 1000) + (delay_usecs / 1000);
! return delay;
! }
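!
! /*
! * Example (assumed values): if the last replayed commit was 1.5 seconds
! * ago this returns 1500; restoring a month-old backup returns -1.
! */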
!
! /*
! * Returns xid of last completed transaction
! */
! Datum
! pg_last_completed_xid(PG_FUNCTION_ARGS)
! {
! PG_RETURN_INT32(recoveryLastXid);
}
/*
***************
*** 4876,4881 **** StartupXLOG(void)
--- 5503,5509 ----
CheckPoint checkPoint;
bool wasShutdown;
bool reachedStopPoint = false;
+ bool performedRecovery = false;
bool haveBackupLabel = false;
XLogRecPtr RecPtr,
LastRec,
***************
*** 4888,4893 **** StartupXLOG(void)
--- 5516,5523 ----
uint32 freespace;
TransactionId oldestActiveXID;
+ XLogCtl->SharedRecoveryProcessingMode = true;
+
/*
* Read control file and check XLOG status looks valid.
*
***************
*** 5108,5116 **** StartupXLOG(void)
--- 5738,5752 ----
if (minRecoveryLoc.xlogid != 0 || minRecoveryLoc.xrecoff != 0)
ControlFile->minRecoveryPoint = minRecoveryLoc;
ControlFile->time = (pg_time_t) time(NULL);
+ /* No need to hold ControlFileLock yet, we aren't up far enough */
UpdateControlFile();
/*
+ * Reset pgstat data, because it may be invalid after recovery.
+ */
+ pgstat_reset_all();
+
+ /*
* If there was a backup label file, it's done its job and the info
* has now been propagated into pg_control. We must get rid of the
* label file so that if we crash during recovery, we'll pick up at
***************
*** 5167,5173 **** StartupXLOG(void)
do
{
#ifdef WAL_DEBUG
! if (XLOG_DEBUG)
{
StringInfoData buf;
--- 5803,5814 ----
do
{
#ifdef WAL_DEBUG
! int loglevel = DEBUG3;
!
! if (record->xl_rmid == RM_XACT_ID)
! loglevel = DEBUG2;
!
! if (loglevel >= trace_recovery_messages)
{
StringInfoData buf;
***************
*** 5210,5215 **** StartupXLOG(void)
--- 5851,5875 ----
TransactionIdAdvance(ShmemVariableCache->nextXid);
}
+ if (InArchiveRecovery)
+ {
+ /*
+ * Make sure the incoming transaction is emulated as running
+ * prior to allowing any changes made by it to touch data.
+ */
+ RecordKnownAssignedTransactionIds(EndRecPtr, record);
+
+ /*
+ * Wait, kill or otherwise resolve any conflicts between
+ * incoming cleanup records and user queries. This is the
+ * main barrier that allows MVCC to work correctly when
+ * running standby servers. Only need to do this if there
+ * is a possibility that users may be active.
+ */
+ if (reachedSafeStartPoint && RecordIsCleanupRecord(record))
+ ResolveRedoVisibilityConflicts(EndRecPtr, record);
+ }
+
if (record->xl_info & XLR_BKP_BLOCK_MASK)
RestoreBkpBlocks(record, EndRecPtr);
***************
*** 5220,5225 **** StartupXLOG(void)
--- 5880,5920 ----
LastRec = ReadRecPtr;
+ /*
+ * Can we signal Postmaster to enter consistent recovery mode?
+ *
+ * There are two points in the log that we must pass. The first
+ * is minRecoveryPoint, which is the LSN at the time the
+ * base backup was taken that we are about to rollforward from.
+ * If recovery has ever crashed or was stopped there is also
+ * another point also: minSafeStartPoint, which we know the
+ * latest LSN that recovery could have reached prior to crash.
+ *
+ * We must also have assembled sufficient information about
+ * transaction state to allow valid snapshots to be taken.
+ *
+ * XXX: Shouldn't we call StartupSUBTRANS() and the other
+ * startup functions like we do below, before letting
+ * anyone in?
+ */
+ if (!reachedSafeStartPoint &&
+ IsRunningXactDataValid() &&
+ XLByteLE(ControlFile->minSafeStartPoint, EndRecPtr) &&
+ XLByteLE(ControlFile->minRecoveryPoint, EndRecPtr))
+ {
+ reachedSafeStartPoint = true;
+ if (InArchiveRecovery)
+ {
+ ereport(LOG,
+ (errmsg("database has now reached consistent state at %X/%X",
+ EndRecPtr.xlogid, EndRecPtr.xrecoff)));
+ InitRecoveryTransactionEnvironment();
+ StartCleanupDelayStats();
+ if (IsUnderPostmaster)
+ SendPostmasterSignal(PMSIGNAL_RECOVERY_START);
+ }
+ }
+
record = ReadRecord(NULL, LOG);
} while (record != NULL && recoveryContinue);
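
The consistency test above boils down to two ordered LSN comparisons (plus the
running-xacts validity check, which is omitted here). As a standalone sketch,
with XLogRecPtr and XLByteLE re-declared in toy form purely for illustration:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Toy re-declaration of the two-part WAL position of this era. */
    typedef struct XLogRecPtr { uint32_t xlogid; uint32_t xrecoff; } XLogRecPtr;

    /* "a <= b" over (xlogid, xrecoff), as the XLByteLE macro does. */
    static bool XLByteLE(XLogRecPtr a, XLogRecPtr b)
    {
        return a.xlogid < b.xlogid ||
               (a.xlogid == b.xlogid && a.xrecoff <= b.xrecoff);
    }

    int main(void)
    {
        XLogRecPtr minRecoveryPoint  = {0, 0x4000000};  /* from backup label  */
        XLogRecPtr minSafeStartPoint = {0, 0x5000000};  /* from prior attempt */
        XLogRecPtr EndRecPtr         = {0, 0x4800000};  /* current replay LSN */

        bool consistent = XLByteLE(minSafeStartPoint, EndRecPtr) &&
                          XLByteLE(minRecoveryPoint, EndRecPtr);

        /* false: we passed minRecoveryPoint but not minSafeStartPoint yet */
        printf("consistent = %s\n", consistent ? "true" : "false");
        return 0;
    }
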
***************
*** 5241,5246 **** StartupXLOG(void)
--- 5936,5942 ----
/* there are no WAL records following the checkpoint */
ereport(LOG,
(errmsg("redo is not required")));
+ reachedSafeStartPoint = true;
}
}
***************
*** 5254,5269 **** StartupXLOG(void)
/*
* Complain if we did not roll forward far enough to render the backup
! * dump consistent.
*/
! if (XLByteLT(EndOfLog, ControlFile->minRecoveryPoint))
{
if (reachedStopPoint) /* stopped because of stop request */
ereport(FATAL,
(errmsg("requested recovery stop point is before end time of backup dump")));
else /* ran off end of WAL */
ereport(FATAL,
! (errmsg("WAL ends before end time of backup dump")));
}
/*
--- 5950,5965 ----
/*
* Complain if we did not roll forward far enough to render the backup
! * dump consistent and start safely.
*/
! if (InArchiveRecovery && !reachedSafeStartPoint)
{
if (reachedStopPoint) /* stopped because of stop request */
ereport(FATAL,
(errmsg("requested recovery stop point is before end time of backup dump")));
else /* ran off end of WAL */
ereport(FATAL,
! (errmsg("end of WAL reached before end time of backup dump")));
}
/*
***************
*** 5378,5416 **** StartupXLOG(void)
XLogCheckInvalidPages();
/*
! * Reset pgstat data, because it may be invalid after recovery.
*/
! pgstat_reset_all();
! /*
! * Perform a checkpoint to update all our recovery activity to disk.
! *
! * Note that we write a shutdown checkpoint rather than an on-line
! * one. This is not particularly critical, but since we may be
! * assigning a new TLI, using a shutdown checkpoint allows us to have
! * the rule that TLI only changes in shutdown checkpoints, which
! * allows some extra error checking in xlog_redo.
! */
! CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
}
- /*
- * Preallocate additional log files, if wanted.
- */
- PreallocXlogFiles(EndOfLog);
-
- /*
- * Okay, we're officially UP.
- */
- InRecovery = false;
-
- ControlFile->state = DB_IN_PRODUCTION;
- ControlFile->time = (pg_time_t) time(NULL);
- UpdateControlFile();
-
- /* start the archive_timeout timer running */
- XLogCtl->Write.lastSegSwitchTime = ControlFile->time;
-
/* initialize shared-memory copy of latest checkpoint XID/epoch */
XLogCtl->ckptXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
XLogCtl->ckptXid = ControlFile->checkPointCopy.nextXid;
--- 6074,6087 ----
XLogCheckInvalidPages();
/*
! * Finally exit recovery and mark that in WAL. Pre-8.4 we wrote
! * a shutdown checkpoint here, but now we ask bgwriter to do it.
*/
! exitRecovery();
! performedRecovery = true;
}
/* initialize shared-memory copy of latest checkpoint XID/epoch */
XLogCtl->ckptXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
XLogCtl->ckptXid = ControlFile->checkPointCopy.nextXid;
***************
*** 5419,5424 **** StartupXLOG(void)
--- 6090,6099 ----
ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
TransactionIdRetreat(ShmemVariableCache->latestCompletedXid);
+ /* Shutdown the recovery environment. Must be in this order */
+ ProcArrayClearRecoveryTransactions();
+ RelationClearRecoveryLocks();
+
/* Start up the commit log and related stuff, too */
StartupCLOG();
StartupSUBTRANS(oldestActiveXID);
***************
*** 5444,5449 **** StartupXLOG(void)
--- 6119,6219 ----
readRecordBuf = NULL;
readRecordBufSize = 0;
}
+
+ /*
+ * Prior to 8.4 we wrote a Shutdown Checkpoint at the end of recovery.
+ * This could add minutes to the startup time, so we now ask bgwriter
+ * to perform it. This frees the Startup process to finish, so we can
+ * begin allowing transactions and WAL inserts. We still write a checkpoint, but
+ * it will be an online checkpoint. Online checkpoints have a redo
+ * location that can be prior to the actual checkpoint record. So we want
+ * to derive that redo location *before* we let anybody else write WAL,
+ * otherwise we might miss some WAL records if we crash.
+ */
+ if (performedRecovery)
+ {
+ XLogRecPtr redo;
+
+ /*
+ * We must grab the pointer before anybody writes WAL
+ */
+ redo = GetRedoLocationForCheckpoint();
+
+ /*
+ * Set up information for the bgwriter, but if it is not active
+ * for whatever reason, perform the checkpoint ourselves.
+ */
+ if (SetRedoLocationForArchiveCheckpoint(redo))
+ {
+ /*
+ * Okay, we can come up now. Allow others to write WAL.
+ */
+ XLogCtl->SharedRecoveryProcessingMode = false;
+
+ /*
+ * Now request checkpoint from bgwriter.
+ */
+ RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_IMMEDIATE);
+ }
+ else
+ {
+ /*
+ * Startup process performs the checkpoint, but defers
+ * the change in processing mode until afterwards.
+ */
+ CreateCheckPoint(CHECKPOINT_FORCE | CHECKPOINT_IMMEDIATE);
+ }
+ }
+ else
+ {
+ /*
+ * No recovery, so let's just get on with it.
+ */
+ LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+ ControlFile->state = DB_IN_PRODUCTION;
+ ControlFile->time = (pg_time_t) time(NULL);
+ UpdateControlFile();
+ LWLockRelease(ControlFileLock);
+ }
+
+ /*
+ * Okay, we can come up now. Allow others to write WAL.
+ */
+ XLogCtl->SharedRecoveryProcessingMode = false;
+
+ /* start the archive_timeout timer running */
+ XLogCtl->Write.lastSegSwitchTime = (pg_time_t) time(NULL);
+ }
+
+ /*
+ * IsRecoveryProcessingMode()
+ *
+ * Fast test for whether we're still in recovery or not. We test the shared
+ * state each time only until we leave recovery mode. After that we never
+ * look again, relying upon the settings of our local state variables. This
+ * is designed to avoid the need for a separate initialisation step.
+ */
+ bool
+ IsRecoveryProcessingMode(void)
+ {
+ if (knownProcessingMode && !LocalRecoveryProcessingMode)
+ return false;
+
+ {
+ /* use volatile pointer to prevent code rearrangement */
+ volatile XLogCtlData *xlogctl = XLogCtl;
+
+ if (xlogctl == NULL)
+ return false;
+
+ SpinLockAcquire(&xlogctl->mode_lck);
+ LocalRecoveryProcessingMode = xlogctl->SharedRecoveryProcessingMode;
+ SpinLockRelease(&xlogctl->mode_lck);
+ }
+
+ knownProcessingMode = true;
+
+ return LocalRecoveryProcessingMode;
}
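
IsRecoveryProcessingMode() is a one-way latch: the shared flag is re-read under
the spinlock only until it is first seen false; after that the local cached copy
answers without locking. A minimal standalone model of the pattern, substituting
a pthread mutex for the patch's mode_lck spinlock (illustrative only, not the
patch's code):

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    static pthread_mutex_t mode_lck = PTHREAD_MUTEX_INITIALIZER;
    static bool SharedRecoveryProcessingMode = true;  /* shared state */

    static bool LocalRecoveryProcessingMode = true;   /* per-process cache */
    static bool knownProcessingMode = false;

    static bool IsRecoveryProcessingMode(void)
    {
        /* Once we have seen "not in recovery", never read shared state again. */
        if (knownProcessingMode && !LocalRecoveryProcessingMode)
            return false;

        pthread_mutex_lock(&mode_lck);
        LocalRecoveryProcessingMode = SharedRecoveryProcessingMode;
        pthread_mutex_unlock(&mode_lck);

        knownProcessingMode = true;
        return LocalRecoveryProcessingMode;
    }

    int main(void)
    {
        printf("%d\n", IsRecoveryProcessingMode()); /* 1: still in recovery */
        SharedRecoveryProcessingMode = false;       /* startup ends recovery */
        printf("%d\n", IsRecoveryProcessingMode()); /* 0: latch flips ...    */
        printf("%d\n", IsRecoveryProcessingMode()); /* 0: ... no lock taken  */
        return 0;
    }
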
/*
***************
*** 5701,5720 **** ShutdownXLOG(int code, Datum arg)
static void
LogCheckpointStart(int flags)
{
! elog(LOG, "checkpoint starting:%s%s%s%s%s%s",
! (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
! (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
! (flags & CHECKPOINT_FORCE) ? " force" : "",
! (flags & CHECKPOINT_WAIT) ? " wait" : "",
! (flags & CHECKPOINT_CAUSE_XLOG) ? " xlog" : "",
! (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "");
}
/*
* Log end of a checkpoint.
*/
static void
! LogCheckpointEnd(void)
{
long write_secs,
sync_secs,
--- 6471,6494 ----
static void
LogCheckpointStart(int flags)
{
! if (flags & CHECKPOINT_RESTARTPOINT)
! elog(LOG, "restartpoint starting:%s",
! (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "");
! else
! elog(LOG, "checkpoint starting:%s%s%s%s%s%s",
! (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
! (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
! (flags & CHECKPOINT_FORCE) ? " force" : "",
! (flags & CHECKPOINT_WAIT) ? " wait" : "",
! (flags & CHECKPOINT_CAUSE_XLOG) ? " xlog" : "",
! (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "");
}
/*
* Log end of a checkpoint.
*/
static void
! LogCheckpointEnd(int flags)
{
long write_secs,
sync_secs,
***************
*** 5737,5753 **** LogCheckpointEnd(void)
CheckpointStats.ckpt_sync_end_t,
&sync_secs, &sync_usecs);
! elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
! "%d transaction log file(s) added, %d removed, %d recycled; "
! "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
! CheckpointStats.ckpt_bufs_written,
! (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
! CheckpointStats.ckpt_segs_added,
! CheckpointStats.ckpt_segs_removed,
! CheckpointStats.ckpt_segs_recycled,
! write_secs, write_usecs / 1000,
! sync_secs, sync_usecs / 1000,
! total_secs, total_usecs / 1000);
}
/*
--- 6511,6536 ----
CheckpointStats.ckpt_sync_end_t,
&sync_secs, &sync_usecs);
! if (flags & CHECKPOINT_RESTARTPOINT)
! elog(LOG, "restartpoint complete: wrote %d buffers (%.1f%%); "
! "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
! CheckpointStats.ckpt_bufs_written,
! (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
! write_secs, write_usecs / 1000,
! sync_secs, sync_usecs / 1000,
! total_secs, total_usecs / 1000);
! else
! elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
! "%d transaction log file(s) added, %d removed, %d recycled; "
! "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
! CheckpointStats.ckpt_bufs_written,
! (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
! CheckpointStats.ckpt_segs_added,
! CheckpointStats.ckpt_segs_removed,
! CheckpointStats.ckpt_segs_recycled,
! write_secs, write_usecs / 1000,
! sync_secs, sync_usecs / 1000,
! total_secs, total_usecs / 1000);
}
/*
***************
*** 5772,5788 **** CreateCheckPoint(int flags)
XLogRecPtr recptr;
XLogCtlInsert *Insert = &XLogCtl->Insert;
XLogRecData rdata;
- uint32 freespace;
uint32 _logId;
uint32 _logSeg;
TransactionId *inCommitXids;
int nInCommit;
/*
* Acquire CheckpointLock to ensure only one checkpoint happens at a time.
! * (This is just pro forma, since in the present system structure there is
! * only one process that is allowed to issue checkpoints at any given
! * time.)
*/
LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
--- 6555,6570 ----
XLogRecPtr recptr;
XLogCtlInsert *Insert = &XLogCtl->Insert;
XLogRecData rdata;
uint32 _logId;
uint32 _logSeg;
TransactionId *inCommitXids;
int nInCommit;
+ bool leavingArchiveRecovery = false;
/*
* Acquire CheckpointLock to ensure only one checkpoint happens at a time.
! * A concurrent checkpoint should not be possible, but checkpoints are
! * an important aspect of our resilience, so we take no chances.
*/
LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
***************
*** 5797,5811 **** CreateCheckPoint(int flags)
--- 6579,6602 ----
CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
/*
+ * Find out if this is the first checkpoint after archive recovery.
+ */
+ LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+ leavingArchiveRecovery = (ControlFile->state == DB_IN_ARCHIVE_RECOVERY);
+ LWLockRelease(ControlFileLock);
+
+ /*
* Use a critical section to force system panic if we have trouble.
*/
START_CRIT_SECTION();
if (shutdown)
{
+ LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
ControlFile->state = DB_SHUTDOWNING;
ControlFile->time = (pg_time_t) time(NULL);
UpdateControlFile();
+ LWLockRelease(ControlFileLock);
}
/*
***************
*** 5861,5901 **** CreateCheckPoint(int flags)
}
}
! /*
! * Compute new REDO record ptr = location of next XLOG record.
! *
! * NB: this is NOT necessarily where the checkpoint record itself will be,
! * since other backends may insert more XLOG records while we're off doing
! * the buffer flush work. Those XLOG records are logically after the
! * checkpoint, even though physically before it. Got that?
! */
! freespace = INSERT_FREESPACE(Insert);
! if (freespace < SizeOfXLogRecord)
! {
! (void) AdvanceXLInsertBuffer(false);
! /* OK to ignore update return flag, since we will do flush anyway */
! freespace = INSERT_FREESPACE(Insert);
! }
! INSERT_RECPTR(checkPoint.redo, Insert, Insert->curridx);
!
! /*
! * Here we update the shared RedoRecPtr for future XLogInsert calls; this
! * must be done while holding the insert lock AND the info_lck.
! *
! * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
! * pointing past where it really needs to point. This is okay; the only
! * consequence is that XLogInsert might back up whole buffers that it
! * didn't really need to. We can't postpone advancing RedoRecPtr because
! * XLogInserts that happen while we are dumping buffers must assume that
! * their buffer changes are not included in the checkpoint.
! */
{
! /* use volatile pointer to prevent code rearrangement */
! volatile XLogCtlData *xlogctl = XLogCtl;
!
! SpinLockAcquire(&xlogctl->info_lck);
! RedoRecPtr = xlogctl->Insert.RedoRecPtr = checkPoint.redo;
! SpinLockRelease(&xlogctl->info_lck);
}
/*
--- 6652,6670 ----
}
}
! if (leavingArchiveRecovery)
! checkPoint.redo = GetRedoLocationForArchiveCheckpoint();
! else
{
! /*
! * Compute new REDO record ptr = location of next XLOG record.
! *
! * NB: this is NOT necessarily where the checkpoint record itself will be,
! * since other backends may insert more XLOG records while we're off doing
! * the buffer flush work. Those XLOG records are logically after the
! * checkpoint, even though physically before it. Got that?
! */
! checkPoint.redo = GetRedoLocationForCheckpoint();
}
/*
***************
*** 6013,6023 **** CreateCheckPoint(int flags)
XLByteToSeg(ControlFile->checkPointCopy.redo, _logId, _logSeg);
/*
! * Update the control file.
*/
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
if (shutdown)
ControlFile->state = DB_SHUTDOWNED;
ControlFile->prevCheckPoint = ControlFile->checkPoint;
ControlFile->checkPoint = ProcLastRecPtr;
ControlFile->checkPointCopy = checkPoint;
--- 6782,6799 ----
XLByteToSeg(ControlFile->checkPointCopy.redo, _logId, _logSeg);
/*
! * Update the control file. In 8.4, this routine becomes the primary
! * point for recording changes of state in the control file at the
! * end of recovery. Postmaster state already shows us being in
! * normal running mode, but it is only after this point that we
! * no longer need to re-run recovery if we crash. Note that at the
! * end of recovery this is executed by bgwriter, after the Startup
! * process has exited.
*/
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
if (shutdown)
ControlFile->state = DB_SHUTDOWNED;
+ else
+ ControlFile->state = DB_IN_PRODUCTION;
ControlFile->prevCheckPoint = ControlFile->checkPoint;
ControlFile->checkPoint = ProcLastRecPtr;
ControlFile->checkPointCopy = checkPoint;
***************
*** 6025,6030 **** CreateCheckPoint(int flags)
--- 6801,6821 ----
UpdateControlFile();
LWLockRelease(ControlFileLock);
+ if (leavingArchiveRecovery)
+ {
+ /*
+ * Rename the recovery command file out of the way, so that we don't
+ * accidentally re-enter archive recovery mode after a subsequent crash.
+ * Prior to 8.4 this step was performed at the end of exitArchiveRecovery().
+ */
+ unlink(RECOVERY_COMMAND_DONE);
+ if (rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE) != 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not rename file \"%s\" to \"%s\": %m",
+ RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE)));
+ }
+
/* Update shared-memory copy of checkpoint XID/epoch */
{
/* use volatile pointer to prevent code rearrangement */
***************
*** 6068,6082 **** CreateCheckPoint(int flags)
* Truncate pg_subtrans if possible. We can throw away all data before
* the oldest XMIN of any running transaction. No future transaction will
* attempt to reference any pg_subtrans entry older than that (see Asserts
! * in subtrans.c). During recovery, though, we mustn't do this because
! * StartupSUBTRANS hasn't been called yet.
*/
! if (!InRecovery)
TruncateSUBTRANS(GetOldestXmin(true, false));
/* All real work is done, but log before releasing lock. */
if (log_checkpoints)
! LogCheckpointEnd();
TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
NBuffers, CheckpointStats.ckpt_segs_added,
--- 6859,6872 ----
* Truncate pg_subtrans if possible. We can throw away all data before
* the oldest XMIN of any running transaction. No future transaction will
* attempt to reference any pg_subtrans entry older than that (see Asserts
! * in subtrans.c).
*/
! if (!shutdown)
TruncateSUBTRANS(GetOldestXmin(true, false));
/* All real work is done, but log before releasing lock. */
if (log_checkpoints)
! LogCheckpointEnd(flags);
TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
NBuffers, CheckpointStats.ckpt_segs_added,
***************
*** 6084,6089 **** CreateCheckPoint(int flags)
--- 6874,6935 ----
CheckpointStats.ckpt_segs_recycled);
LWLockRelease(CheckpointLock);
+
+ /*
+ * Take a snapshot of running transactions and write this to WAL.
+ * This allows us to reconstruct the state of running transactions
+ * during archive recovery, if required.
+ *
+ * If we are shutting down, or the Startup process is completing crash
+ * recovery, we don't need to write running-xact data.
+ */
+ if (!shutdown && !IsRecoveryProcessingMode())
+ LogCurrentRunningXacts();
+ }
+
+ /*
+ * GetRedoLocationForCheckpoint()
+ *
+ * When !IsRecoveryProcessingMode(), this must be called while
+ * holding WALInsertLock.
+ */
+ static XLogRecPtr
+ GetRedoLocationForCheckpoint(void)
+ {
+ XLogCtlInsert *Insert = &XLogCtl->Insert;
+ uint32 freespace;
+ XLogRecPtr redo;
+
+ freespace = INSERT_FREESPACE(Insert);
+ if (freespace < SizeOfXLogRecord)
+ {
+ (void) AdvanceXLInsertBuffer(false);
+ /* OK to ignore update return flag, since we will do flush anyway */
+ freespace = INSERT_FREESPACE(Insert);
+ }
+ INSERT_RECPTR(redo, Insert, Insert->curridx);
+
+ /*
+ * Here we update the shared RedoRecPtr for future XLogInsert calls; this
+ * must be done while holding the insert lock AND the info_lck.
+ *
+ * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
+ * pointing past where it really needs to point. This is okay; the only
+ * consequence is that XLogInsert might back up whole buffers that it
+ * didn't really need to. We can't postpone advancing RedoRecPtr because
+ * XLogInserts that happen while we are dumping buffers must assume that
+ * their buffer changes are not included in the checkpoint.
+ */
+ {
+ /* use volatile pointer to prevent code rearrangement */
+ volatile XLogCtlData *xlogctl = XLogCtl;
+
+ SpinLockAcquire(&xlogctl->info_lck);
+ RedoRecPtr = xlogctl->Insert.RedoRecPtr = redo;
+ SpinLockRelease(&xlogctl->info_lck);
+ }
+
+ return redo;
}
/*
***************
*** 6142,6148 **** RecoveryRestartPoint(const CheckPoint *checkPoint)
if (RmgrTable[rmid].rm_safe_restartpoint != NULL)
if (!(RmgrTable[rmid].rm_safe_restartpoint()))
{
! elog(DEBUG2, "RM %d not safe to record restart point at %X/%X",
rmid,
checkPoint->redo.xlogid,
checkPoint->redo.xrecoff);
--- 6988,6994 ----
if (RmgrTable[rmid].rm_safe_restartpoint != NULL)
if (!(RmgrTable[rmid].rm_safe_restartpoint()))
{
! elog(trace_recovery(DEBUG2), "RM %d not safe to record restart point at %X/%X",
rmid,
checkPoint->redo.xlogid,
checkPoint->redo.xrecoff);
***************
*** 6150,6180 **** RecoveryRestartPoint(const CheckPoint *checkPoint)
}
}
/*
! * OK, force data out to disk
*/
! CheckPointGuts(checkPoint->redo, CHECKPOINT_IMMEDIATE);
/*
! * Update pg_control so that any subsequent crash will restart from this
! * checkpoint. Note: ReadRecPtr gives the XLOG address of the checkpoint
! * record itself.
*/
- ControlFile->prevCheckPoint = ControlFile->checkPoint;
- ControlFile->checkPoint = ReadRecPtr;
- ControlFile->checkPointCopy = *checkPoint;
- ControlFile->time = (pg_time_t) time(NULL);
- UpdateControlFile();
ereport((recoveryLogRestartpoints ? LOG : DEBUG2),
! (errmsg("recovery restart point at %X/%X",
! checkPoint->redo.xlogid, checkPoint->redo.xrecoff)));
! if (recoveryLastXTime)
! ereport((recoveryLogRestartpoints ? LOG : DEBUG2),
! (errmsg("last completed transaction was at log time %s",
! timestamptz_to_str(recoveryLastXTime))));
! }
/*
* Write a NEXTOID log record
*/
--- 6996,7068 ----
}
}
+ RequestRestartPoint(ReadRecPtr, checkPoint, reachedSafeStartPoint);
+ }
+
+ /*
+ * As of 8.4, restartpoints are always created by the bgwriter once
+ * reachedSafeStartPoint is set. We use bgwriter's shared memory area
+ * whichever process we are called from, to keep the code structure simple.
+ */
+ void
+ CreateRestartPoint(const XLogRecPtr ReadPtr, const CheckPoint *restartPoint, int flags)
+ {
+ if (recoveryLogRestartpoints || log_checkpoints)
+ {
+ /*
+ * Prepare to accumulate statistics.
+ */
+
+ MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
+ CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
+
+ LogCheckpointStart(CHECKPOINT_RESTARTPOINT | flags);
+ }
+
+ /*
+ * Acquire CheckpointLock to ensure only one restartpoint happens at a time.
+ * We rely on this lock to ensure that the startup process doesn't exit
+ * recovery while we are halfway through a restartpoint.
+ */
+ LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
+
+ CheckPointGuts(restartPoint->redo, CHECKPOINT_RESTARTPOINT | flags);
+
/*
! * Update pg_control, using current time
*/
! LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
! ControlFile->prevCheckPoint = ControlFile->checkPoint;
! ControlFile->checkPoint = ReadPtr;
! ControlFile->checkPointCopy = *restartPoint;
! ControlFile->time = (pg_time_t) time(NULL);
! UpdateControlFile();
! LWLockRelease(ControlFileLock);
/*
! * Currently, there is no need to truncate pg_subtrans during recovery.
! * If we did, we would need to have called StartupSUBTRANS() already,
! * and TruncateSUBTRANS() would go here.
*/
+ /* All real work is done, but log before releasing lock. */
+ if (recoveryLogRestartpoints || log_checkpoints)
+ LogCheckpointEnd(CHECKPOINT_RESTARTPOINT);
+
ereport((recoveryLogRestartpoints ? LOG : DEBUG2),
! (errmsg("recovery restart point at %X/%X",
! restartPoint->redo.xlogid, restartPoint->redo.xrecoff)));
!
! ReportCleanupDelayStats();
!
! if (recoveryLastXTime)
! ereport((recoveryLogRestartpoints ? LOG : DEBUG2),
! (errmsg("last completed transaction was at log time %s",
! timestamptz_to_str(recoveryLastXTime))));
+ LWLockRelease(CheckpointLock);
+ }
+
/*
* Write a NEXTOID log record
*/
***************
*** 6237,6243 **** RequestXLogSwitch(void)
}
/*
! * XLOG resource manager's routines
*/
void
xlog_redo(XLogRecPtr lsn, XLogRecord *record)
--- 7125,7187 ----
}
/*
! * exitRecovery()
! *
! * Exit recovery state and write a XLOG_RECOVERY_END record. This is the
! * only record type that can record a change of timelineID. We assume
! * caller has already set ThisTimeLineID, if appropriate.
! */
! static void
! exitRecovery(void)
! {
! XLogRecData rdata;
!
! rdata.buffer = InvalidBuffer;
! rdata.data = (char *) (&ThisTimeLineID);
! rdata.len = sizeof(TimeLineID);
! rdata.next = NULL;
!
! /*
! * If a restartpoint is in progress, we will not be able to successfully
! * acquire CheckpointLock. If so, send a second signal to nudge bgwriter
! * to finish faster and avoid delay, then wait for the lock so we know
! * the restartpoint has completed. We do this because we don't want to
! * interrupt the restartpoint halfway through, which might leave us in
! * a mess and we want to be robust. We're going to checkpoint soon
! * anyway, so it's not wasted effort.
! */
! if (LWLockConditionalAcquire(CheckpointLock, LW_EXCLUSIVE))
! LWLockRelease(CheckpointLock);
! else
! {
! RequestRestartPointCompletion();
! ereport(trace_recovery(DEBUG1),
! (errmsg("startup process waiting for restartpoint to complete")));
! LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
! LWLockRelease(CheckpointLock);
! }
!
! /*
! * This is the only type of WAL message that can be inserted during
! * recovery. This ensures that we don't allow others to get access
! * until after we have changed state.
! */
! (void) XLogInsert(RM_XLOG_ID, XLOG_RECOVERY_END, &rdata);
!
! /*
! * We don't XLogFlush() here otherwise we'll end up zeroing the WAL
! * file ourselves. So just let bgwriter's forthcoming checkpoint do
! * that for us.
! */
!
! InRecovery = false;
! }
!
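
The lock dance in exitRecovery() is a conditional-acquire probe: if the trylock
succeeds, no restartpoint is running; otherwise nudge the worker and then block
until it finishes. A standalone sketch of the same pattern using pthreads (the
names and stubs here are illustrative):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t CheckpointLock = PTHREAD_MUTEX_INITIALIZER;

    static void RequestRestartPointCompletion(void)
    {
        /* In the patch this sends a second SIGINT so bgwriter hurries up. */
        printf("nudging bgwriter to finish the restartpoint quickly\n");
    }

    static void WaitForRestartPointIfAny(void)
    {
        if (pthread_mutex_trylock(&CheckpointLock) == 0)
        {
            /* Nothing in progress; we never wanted the lock itself. */
            pthread_mutex_unlock(&CheckpointLock);
            return;
        }

        /* A restartpoint holds the lock: ask it to hurry, then wait it out. */
        RequestRestartPointCompletion();
        pthread_mutex_lock(&CheckpointLock);
        pthread_mutex_unlock(&CheckpointLock);
    }

    int main(void)
    {
        WaitForRestartPointIfAny();  /* uncontended case: returns at once */
        return 0;
    }
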
! /*
! * XLOG resource manager's routines.
! *
! * Definitions of message info are in include/catalog/pg_control.h,
! * though not all messages relate to control file processing.
*/
void
xlog_redo(XLogRecPtr lsn, XLogRecord *record)
***************
*** 6267,6293 **** xlog_redo(XLogRecPtr lsn, XLogRecord *record)
MultiXactSetNextMXact(checkPoint.nextMulti,
checkPoint.nextMultiOffset);
/* ControlFile->checkPointCopy always tracks the latest ckpt XID */
ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
! /*
! * TLI may change in a shutdown checkpoint, but it shouldn't decrease
*/
- if (checkPoint.ThisTimeLineID != ThisTimeLineID)
- {
- if (checkPoint.ThisTimeLineID < ThisTimeLineID ||
- !list_member_int(expectedTLIs,
- (int) checkPoint.ThisTimeLineID))
- ereport(PANIC,
- (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
- checkPoint.ThisTimeLineID, ThisTimeLineID)));
- /* Following WAL records should be run with new TLI */
- ThisTimeLineID = checkPoint.ThisTimeLineID;
- }
RecoveryRestartPoint(&checkPoint);
}
else if (info == XLOG_CHECKPOINT_ONLINE)
{
CheckPoint checkPoint;
--- 7211,7262 ----
MultiXactSetNextMXact(checkPoint.nextMulti,
checkPoint.nextMultiOffset);
+ /* We know nothing was running on the master at this point */
+ ProcArrayClearRecoveryTransactions();
+ RelationClearRecoveryLocks();
+
/* ControlFile->checkPointCopy always tracks the latest ckpt XID */
ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
! /*
! * TLI no longer changes at shutdown checkpoint, since as of 8.4,
! * shutdown checkpoints only occur at shutdown. Much less confusing.
*/
RecoveryRestartPoint(&checkPoint);
}
+ else if (info == XLOG_RECOVERY_END)
+ {
+ TimeLineID tli;
+
+ memcpy(&tli, XLogRecGetData(record), sizeof(TimeLineID));
+
+ /* We know nothing was running on the master at this point */
+ ProcArrayClearRecoveryTransactions();
+ RelationClearRecoveryLocks();
+
+ /*
+ * TLI may change when recovery ends, but it shouldn't decrease.
+ *
+ * This is the only WAL record that can tell us to change timelineID
+ * while we process WAL records.
+ *
+ * We can *choose* to stop recovery at any point, generating a
+ * new timelineID which is recorded using this record type.
+ */
+ if (tli != ThisTimeLineID)
+ {
+ if (tli < ThisTimeLineID ||
+ !list_member_int(expectedTLIs,
+ (int) tli))
+ ereport(PANIC,
+ (errmsg("unexpected timeline ID %u (after %u) at recovery end record",
+ tli, ThisTimeLineID)));
+ /* Following WAL records should be run with new TLI */
+ ThisTimeLineID = tli;
+ }
+ }
else if (info == XLOG_CHECKPOINT_ONLINE)
{
CheckPoint checkPoint;
***************
*** 6309,6315 **** xlog_redo(XLogRecPtr lsn, XLogRecord *record)
ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
! /* TLI should not change in an on-line checkpoint */
if (checkPoint.ThisTimeLineID != ThisTimeLineID)
ereport(PANIC,
(errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
--- 7278,7284 ----
ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
! /* TLI must not change at a checkpoint */
if (checkPoint.ThisTimeLineID != ThisTimeLineID)
ereport(PANIC,
(errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
***************
*** 6377,6382 **** xlog_outrec(StringInfo buf, XLogRecord *record)
--- 7346,7355 ----
record->xl_prev.xlogid, record->xl_prev.xrecoff,
record->xl_xid);
+ appendStringInfo(buf, "; pxid %u len %u",
+ record->xl_parentxid,
+ record->xl_len);
+
for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
{
if (record->xl_info & XLR_SET_BKP_BLOCK(i))
***************
*** 6545,6550 **** pg_start_backup(PG_FUNCTION_ARGS)
--- 7518,7529 ----
errhint("archive_command must be defined before "
"online backups can be made safely.")));
+ if (IsRecoveryProcessingMode())
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("recovery is in progress"),
+ errhint("WAL control functions cannot be executed during recovery.")));
+
backupidstr = text_to_cstring(backupid);
/*
***************
*** 6710,6715 **** pg_stop_backup(PG_FUNCTION_ARGS)
--- 7689,7700 ----
errmsg("WAL archiving is not active"),
errhint("archive_mode must be enabled at server start.")));
+ if (IsRecoveryProcessingMode())
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("recovery is in progress"),
+ errhint("WAL control functions cannot be executed during recovery.")));
+
/*
* OK to clear forcePageWrites
*/
***************
*** 6865,6870 **** pg_switch_xlog(PG_FUNCTION_ARGS)
--- 7850,7861 ----
(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
(errmsg("must be superuser to switch transaction log files"))));
+ if (IsRecoveryProcessingMode())
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("recovery is in progress"),
+ errhint("WAL control functions cannot be executed during recovery.")));
+
switchpoint = RequestXLogSwitch();
/*
***************
*** 6887,6892 **** pg_current_xlog_location(PG_FUNCTION_ARGS)
--- 7878,7889 ----
{
char location[MAXFNAMELEN];
+ if (IsRecoveryProcessingMode())
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("recovery is in progress"),
+ errhint("WAL control functions cannot be executed during recovery.")));
+
/* Make sure we have an up-to-date local LogwrtResult */
{
/* use volatile pointer to prevent code rearrangement */
***************
*** 6914,6919 **** pg_current_xlog_insert_location(PG_FUNCTION_ARGS)
--- 7911,7922 ----
XLogRecPtr current_recptr;
char location[MAXFNAMELEN];
+ if (IsRecoveryProcessingMode())
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("recovery is in progress"),
+ errhint("WAL control functions cannot be executed during recovery.")));
+
/*
* Get the current end-of-WAL position ... shared lock is sufficient
*/
*** src/backend/access/transam/xlogutils.c
--- src/backend/access/transam/xlogutils.c
***************
*** 227,233 **** Buffer
XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init)
{
return XLogReadBufferExtended(rnode, MAIN_FORKNUM, blkno,
! init ? RBM_ZERO : RBM_NORMAL);
}
/*
--- 227,240 ----
XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init)
{
return XLogReadBufferExtended(rnode, MAIN_FORKNUM, blkno,
! init ? RBM_ZERO : RBM_NORMAL, BUFFER_LOCK_EXCLUSIVE);
! }
!
! Buffer
! XLogReadBufferForCleanup(RelFileNode rnode, BlockNumber blkno, bool init)
! {
! return XLogReadBufferExtended(rnode, MAIN_FORKNUM, blkno,
! init ? RBM_ZERO : RBM_NORMAL, BUFFER_LOCK_CLEANUP);
}
/*
***************
*** 254,260 **** XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init)
*/
Buffer
XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
! BlockNumber blkno, ReadBufferMode mode)
{
BlockNumber lastblock;
Buffer buffer;
--- 261,267 ----
*/
Buffer
XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
! BlockNumber blkno, ReadBufferMode mode, int lockmode)
{
BlockNumber lastblock;
Buffer buffer;
***************
*** 306,312 **** XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
Assert(BufferGetBlockNumber(buffer) == blkno);
}
! LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
if (mode == RBM_NORMAL)
{
--- 313,324 ----
Assert(BufferGetBlockNumber(buffer) == blkno);
}
! if (lockmode == BUFFER_LOCK_EXCLUSIVE)
! LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
! else if (lockmode == BUFFER_LOCK_CLEANUP)
! LockBufferForCleanup(buffer);
! else
! elog(FATAL, "invalid buffer lock mode %d", lockmode);
if (mode == RBM_NORMAL)
{
*** src/backend/bootstrap/bootstrap.c
--- src/backend/bootstrap/bootstrap.c
***************
*** 35,40 ****
--- 35,41 ----
#include "storage/bufmgr.h"
#include "storage/ipc.h"
#include "storage/proc.h"
+ #include "storage/sinvaladt.h"
#include "tcop/tcopprot.h"
#include "utils/builtins.h"
#include "utils/flatfiles.h"
*** src/backend/commands/dbcommands.c
--- src/backend/commands/dbcommands.c
***************
*** 1976,1981 **** dbase_redo(XLogRecPtr lsn, XLogRecord *record)
--- 1976,1986 ----
* We don't need to copy subdirectories
*/
copydir(src_path, dst_path, false);
+
+ /*
+ * Flat files are updated immediately following transaction commit.
+ * Nothing to do here.
+ */
}
else if (info == XLOG_DBASE_DROP)
{
***************
*** 1998,2003 **** dbase_redo(XLogRecPtr lsn, XLogRecord *record)
--- 2003,2012 ----
ereport(WARNING,
(errmsg("some useless files may be left behind in old database directory \"%s\"",
dst_path)));
+ /*
+ * Flat files are updated immediately following transaction commit.
+ * Nothing to do here.
+ */
}
else
elog(PANIC, "dbase_redo: unknown op code %u", info);
*** src/backend/commands/discard.c
--- src/backend/commands/discard.c
***************
*** 65,71 **** DiscardAll(bool isTopLevel)
ResetAllOptions();
DropAllPreparedStatements();
PortalHashTableDeleteAll();
! Async_UnlistenAll();
LockReleaseAll(USER_LOCKMETHOD, true);
ResetPlanCache();
ResetTempTableNamespace();
--- 65,72 ----
ResetAllOptions();
DropAllPreparedStatements();
PortalHashTableDeleteAll();
! if (!IsRecoveryProcessingMode())
! Async_UnlistenAll();
LockReleaseAll(USER_LOCKMETHOD, true);
ResetPlanCache();
ResetTempTableNamespace();
*** src/backend/commands/indexcmds.c
--- src/backend/commands/indexcmds.c
***************
*** 648,654 **** DefineIndex(RangeVar *heapRelation,
* Also, GetCurrentVirtualXIDs never reports our own vxid, so we need not
* check for that.
*/
! old_snapshots = GetCurrentVirtualXIDs(snapshot->xmax, false,
PROC_IS_AUTOVACUUM | PROC_IN_VACUUM);
while (VirtualTransactionIdIsValid(*old_snapshots))
--- 648,654 ----
* Also, GetCurrentVirtualXIDs never reports our own vxid, so we need not
* check for that.
*/
! old_snapshots = GetCurrentVirtualXIDs(snapshot->xmax, MyDatabaseId,
PROC_IS_AUTOVACUUM | PROC_IN_VACUUM);
while (VirtualTransactionIdIsValid(*old_snapshots))
*** src/backend/commands/lockcmds.c
--- src/backend/commands/lockcmds.c
***************
*** 49,54 **** LockTableCommand(LockStmt *lockstmt)
--- 49,66 ----
*/
reloid = RangeVarGetRelid(relation, false);
+ /*
+ * During recovery we only accept these variations:
+ *
+ * LOCK TABLE foo -- parser translates as AccessExclusiveLock request
+ * LOCK TABLE foo IN AccessShareLock MODE
+ * LOCK TABLE foo IN AccessExclusiveLock MODE
+ */
+ if (IsRecoveryProcessingMode() &&
+ !(lockstmt->mode == AccessShareLock ||
+ lockstmt->mode == AccessExclusiveLock))
+ PreventCommandDuringRecovery();
+
if (lockstmt->mode == AccessShareLock)
aclresult = pg_class_aclcheck(reloid, GetUserId(),
ACL_SELECT);
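
The recovery gate above reduces to a two-mode whitelist. A standalone sketch of
the predicate, with the lock-mode constants re-declared here only to keep the
snippet self-contained (values as in storage/lock.h of this era):

    #include <stdbool.h>
    #include <stdio.h>

    /* Lock-mode values as defined at the time of this patch. */
    #define AccessShareLock     1
    #define RowExclusiveLock    3
    #define AccessExclusiveLock 8

    /* True if LOCK TABLE may proceed while in recovery processing mode. */
    static bool LockModeAllowedDuringRecovery(int mode)
    {
        return mode == AccessShareLock || mode == AccessExclusiveLock;
    }

    int main(void)
    {
        printf("AccessShareLock:  %d\n",
               LockModeAllowedDuringRecovery(AccessShareLock));      /* 1 */
        printf("RowExclusiveLock: %d\n",
               LockModeAllowedDuringRecovery(RowExclusiveLock));     /* 0 */
        return 0;
    }
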
*** src/backend/commands/sequence.c
--- src/backend/commands/sequence.c
***************
*** 457,462 **** nextval_internal(Oid relid)
--- 457,464 ----
rescnt = 0;
bool logit = false;
+ PreventCommandDuringRecovery();
+
/* open and AccessShareLock sequence */
init_sequence(relid, &elm, &seqrel);
*** src/backend/commands/vacuum.c
--- src/backend/commands/vacuum.c
***************
*** 140,145 **** typedef struct VRelStats
--- 140,146 ----
/* vtlinks array for tuple chain following - sorted by new_tid */
int num_vtlinks;
VTupleLink vtlinks;
+ TransactionId latestRemovedXid;
} VRelStats;
/*----------------------------------------------------------------------
***************
*** 223,229 **** static void scan_heap(VRelStats *vacrelstats, Relation onerel,
static void repair_frag(VRelStats *vacrelstats, Relation onerel,
VacPageList vacuum_pages, VacPageList fraged_pages,
int nindexes, Relation *Irel);
! static void move_chain_tuple(Relation rel,
Buffer old_buf, Page old_page, HeapTuple old_tup,
Buffer dst_buf, Page dst_page, VacPage dst_vacpage,
ExecContext ec, ItemPointer ctid, bool cleanVpd);
--- 224,230 ----
static void repair_frag(VRelStats *vacrelstats, Relation onerel,
VacPageList vacuum_pages, VacPageList fraged_pages,
int nindexes, Relation *Irel);
! static void move_chain_tuple(VRelStats *vacrelstats, Relation rel,
Buffer old_buf, Page old_page, HeapTuple old_tup,
Buffer dst_buf, Page dst_page, VacPage dst_vacpage,
ExecContext ec, ItemPointer ctid, bool cleanVpd);
***************
*** 236,242 **** static void update_hint_bits(Relation rel, VacPageList fraged_pages,
int num_moved);
static void vacuum_heap(VRelStats *vacrelstats, Relation onerel,
VacPageList vacpagelist);
! static void vacuum_page(Relation onerel, Buffer buffer, VacPage vacpage);
static void vacuum_index(VacPageList vacpagelist, Relation indrel,
double num_tuples, int keep_tuples);
static void scan_index(Relation indrel, double num_tuples);
--- 237,243 ----
int num_moved);
static void vacuum_heap(VRelStats *vacrelstats, Relation onerel,
VacPageList vacpagelist);
! static void vacuum_page(VRelStats *vacrelstats, Relation onerel, Buffer buffer, VacPage vacpage);
static void vacuum_index(VacPageList vacpagelist, Relation indrel,
double num_tuples, int keep_tuples);
static void scan_index(Relation indrel, double num_tuples);
***************
*** 1238,1243 **** full_vacuum_rel(Relation onerel, VacuumStmt *vacstmt)
--- 1239,1245 ----
vacrelstats->rel_tuples = 0;
vacrelstats->rel_indexed_tuples = 0;
vacrelstats->hasindex = false;
+ vacrelstats->latestRemovedXid = InvalidTransactionId;
/* scan the heap */
vacuum_pages.num_pages = fraged_pages.num_pages = 0;
***************
*** 1641,1646 **** scan_heap(VRelStats *vacrelstats, Relation onerel,
--- 1643,1651 ----
{
ItemId lpp;
+ HeapTupleHeaderAdvanceLatestRemovedXid(tuple.t_data,
+ &vacrelstats->latestRemovedXid);
+
/*
* Here we are building a temporary copy of the page with dead
* tuples removed. Below we will apply
***************
*** 1954,1960 **** repair_frag(VRelStats *vacrelstats, Relation onerel,
/* there are dead tuples on this page - clean them */
Assert(!isempty);
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
! vacuum_page(onerel, buf, last_vacuum_page);
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
}
else
--- 1959,1965 ----
/* there are dead tuples on this page - clean them */
Assert(!isempty);
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
! vacuum_page(vacrelstats, onerel, buf, last_vacuum_page);
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
}
else
***************
*** 2443,2449 **** repair_frag(VRelStats *vacrelstats, Relation onerel,
tuple.t_data = (HeapTupleHeader) PageGetItem(Cpage, Citemid);
tuple_len = tuple.t_len = ItemIdGetLength(Citemid);
! move_chain_tuple(onerel, Cbuf, Cpage, &tuple,
dst_buffer, dst_page, destvacpage,
&ec, &Ctid, vtmove[ti].cleanVpd);
--- 2448,2454 ----
tuple.t_data = (HeapTupleHeader) PageGetItem(Cpage, Citemid);
tuple_len = tuple.t_len = ItemIdGetLength(Citemid);
! move_chain_tuple(vacrelstats, onerel, Cbuf, Cpage, &tuple,
dst_buffer, dst_page, destvacpage,
&ec, &Ctid, vtmove[ti].cleanVpd);
***************
*** 2529,2535 **** repair_frag(VRelStats *vacrelstats, Relation onerel,
dst_page = BufferGetPage(dst_buffer);
/* if this page was not used before - clean it */
if (!PageIsEmpty(dst_page) && dst_vacpage->offsets_used == 0)
! vacuum_page(onerel, dst_buffer, dst_vacpage);
}
else
LockBuffer(dst_buffer, BUFFER_LOCK_EXCLUSIVE);
--- 2534,2540 ----
dst_page = BufferGetPage(dst_buffer);
/* if this page was not used before - clean it */
if (!PageIsEmpty(dst_page) && dst_vacpage->offsets_used == 0)
! vacuum_page(vacrelstats, onerel, dst_buffer, dst_vacpage);
}
else
LockBuffer(dst_buffer, BUFFER_LOCK_EXCLUSIVE);
***************
*** 2706,2712 **** repair_frag(VRelStats *vacrelstats, Relation onerel,
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
page = BufferGetPage(buf);
if (!PageIsEmpty(page))
! vacuum_page(onerel, buf, *curpage);
UnlockReleaseBuffer(buf);
}
}
--- 2711,2717 ----
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
page = BufferGetPage(buf);
if (!PageIsEmpty(page))
! vacuum_page(vacrelstats, onerel, buf, *curpage);
UnlockReleaseBuffer(buf);
}
}
***************
*** 2842,2848 **** repair_frag(VRelStats *vacrelstats, Relation onerel,
recptr = log_heap_clean(onerel, buf,
NULL, 0, NULL, 0,
unused, uncnt,
! false);
PageSetLSN(page, recptr);
PageSetTLI(page, ThisTimeLineID);
}
--- 2847,2853 ----
recptr = log_heap_clean(onerel, buf,
NULL, 0, NULL, 0,
unused, uncnt,
! vacrelstats->latestRemovedXid, false);
PageSetLSN(page, recptr);
PageSetTLI(page, ThisTimeLineID);
}
***************
*** 2892,2898 **** repair_frag(VRelStats *vacrelstats, Relation onerel,
* already too long and almost unreadable.
*/
static void
! move_chain_tuple(Relation rel,
Buffer old_buf, Page old_page, HeapTuple old_tup,
Buffer dst_buf, Page dst_page, VacPage dst_vacpage,
ExecContext ec, ItemPointer ctid, bool cleanVpd)
--- 2897,2903 ----
* already too long and almost unreadable.
*/
static void
! move_chain_tuple(VRelStats *vacrelstats, Relation rel,
Buffer old_buf, Page old_page, HeapTuple old_tup,
Buffer dst_buf, Page dst_page, VacPage dst_vacpage,
ExecContext ec, ItemPointer ctid, bool cleanVpd)
***************
*** 2948,2954 **** move_chain_tuple(Relation rel,
int sv_offsets_used = dst_vacpage->offsets_used;
dst_vacpage->offsets_used = 0;
! vacuum_page(rel, dst_buf, dst_vacpage);
dst_vacpage->offsets_used = sv_offsets_used;
}
--- 2953,2959 ----
int sv_offsets_used = dst_vacpage->offsets_used;
dst_vacpage->offsets_used = 0;
! vacuum_page(vacrelstats, rel, dst_buf, dst_vacpage);
dst_vacpage->offsets_used = sv_offsets_used;
}
***************
*** 3272,3278 **** vacuum_heap(VRelStats *vacrelstats, Relation onerel, VacPageList vacuum_pages)
buf = ReadBufferExtended(onerel, MAIN_FORKNUM, (*vacpage)->blkno,
RBM_NORMAL, vac_strategy);
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
! vacuum_page(onerel, buf, *vacpage);
UnlockReleaseBuffer(buf);
}
}
--- 3277,3283 ----
buf = ReadBufferExtended(onerel, MAIN_FORKNUM, (*vacpage)->blkno,
RBM_NORMAL, vac_strategy);
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
! vacuum_page(vacrelstats, onerel, buf, *vacpage);
UnlockReleaseBuffer(buf);
}
}
***************
*** 3302,3308 **** vacuum_heap(VRelStats *vacrelstats, Relation onerel, VacPageList vacuum_pages)
* Caller must hold pin and lock on buffer.
*/
static void
! vacuum_page(Relation onerel, Buffer buffer, VacPage vacpage)
{
Page page = BufferGetPage(buffer);
int i;
--- 3307,3313 ----
* Caller must hold pin and lock on buffer.
*/
static void
! vacuum_page(VRelStats *vacrelstats, Relation onerel, Buffer buffer, VacPage vacpage)
{
Page page = BufferGetPage(buffer);
int i;
***************
*** 3331,3337 **** vacuum_page(Relation onerel, Buffer buffer, VacPage vacpage)
recptr = log_heap_clean(onerel, buffer,
NULL, 0, NULL, 0,
vacpage->offsets, vacpage->offsets_free,
! false);
PageSetLSN(page, recptr);
PageSetTLI(page, ThisTimeLineID);
}
--- 3336,3342 ----
recptr = log_heap_clean(onerel, buffer,
NULL, 0, NULL, 0,
vacpage->offsets, vacpage->offsets_free,
! vacrelstats->latestRemovedXid, false);
PageSetLSN(page, recptr);
PageSetTLI(page, ThisTimeLineID);
}
*** src/backend/commands/vacuumlazy.c
--- src/backend/commands/vacuumlazy.c
***************
*** 91,96 **** typedef struct LVRelStats
--- 91,97 ----
ItemPointer dead_tuples; /* array of ItemPointerData */
int num_index_scans;
bool scanned_all; /* have we scanned all pages (this far)? */
+ TransactionId latestRemovedXid;
} LVRelStats;
***************
*** 235,240 **** lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt,
--- 236,271 ----
*scanned_all = vacrelstats->scanned_all;
}
+ /*
+ * For Hot Standby we need to know the highest transaction id that will
+ * be removed by any change. VACUUM proceeds in a number of passes so
+ * we need to consider how each pass operates. The first pass runs
+ * heap_page_prune(), which can issue XLOG_HEAP2_CLEAN records as it
+ * progresses - these will have a latestRemovedXid on each record.
+ * In many cases this removes all of the tuples to be removed.
+ * Then we look at tuples to be removed, but do not actually remove them
+ * until phase three. However, index records for those rows are removed
+ * in phase two and index blocks do not have MVCC information attached.
+ * So before we can allow removal of *any* index tuples we need to issue
+ * a WAL record indicating what the latestRemovedXid will be at the end
+ * of phase three. This then allows Hot Standby queries to block at the
+ * correct place, i.e. before phase two, rather than during phase three
+ * as we issue more XLOG_HEAP2_CLEAN records. If memory constraints
+ * force us to run phases two and three more than once, we must issue
+ * one such log record before each pass.
+ */
+ static void
+ vacuum_log_cleanup_info(Relation rel, LVRelStats *vacrelstats)
+ {
+ /*
+ * No need to log changes for temp tables; they do not contain
+ * data visible on the standby server.
+ */
+ if (rel->rd_istemp)
+ return;
+
+ (void) log_heap_cleanup_info(rel->rd_node, vacrelstats->latestRemovedXid);
+ }
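
HeapTupleHeaderAdvanceLatestRemovedXid(), used by both VACUUM paths in this
patch, is at heart a running maximum over the xids of to-be-removed tuples. A
simplified standalone model follows; note the real comparison is
TransactionIdFollows, which compares modulo 2^31 to handle xid wraparound,
whereas this sketch uses a plain > for brevity:

    #include <stdint.h>
    #include <stdio.h>

    typedef uint32_t TransactionId;
    #define InvalidTransactionId ((TransactionId) 0)

    /* Simplified: real code compares xids modulo 2^31 for wraparound. */
    static void
    AdvanceLatestRemovedXid(TransactionId removed, TransactionId *latestRemovedXid)
    {
        if (removed != InvalidTransactionId &&
            (*latestRemovedXid == InvalidTransactionId ||
             removed > *latestRemovedXid))
            *latestRemovedXid = removed;
    }

    int main(void)
    {
        TransactionId latestRemovedXid = InvalidTransactionId;
        TransactionId dead_xids[] = {1004, 997, 1010, 1002};
        int n = (int) (sizeof dead_xids / sizeof dead_xids[0]);

        /* Heap pass over dead tuples: track the highest xid being removed. */
        for (int i = 0; i < n; i++)
            AdvanceLatestRemovedXid(dead_xids[i], &latestRemovedXid);

        /* 1010: standby queries older than this conflict with the cleanup. */
        printf("latestRemovedXid = %u\n", latestRemovedXid);
        return 0;
    }
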
/*
* lazy_scan_heap() -- scan an open heap relation
***************
*** 284,289 **** lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
--- 315,321 ----
nblocks = RelationGetNumberOfBlocks(onerel);
vacrelstats->rel_pages = nblocks;
vacrelstats->nonempty_pages = 0;
+ vacrelstats->latestRemovedXid = InvalidTransactionId;
lazy_space_alloc(vacrelstats, nblocks);
***************
*** 328,333 **** lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
--- 360,368 ----
if ((vacrelstats->max_dead_tuples - vacrelstats->num_dead_tuples) < MaxHeapTuplesPerPage &&
vacrelstats->num_dead_tuples > 0)
{
+ /* Log cleanup info before we touch indexes */
+ vacuum_log_cleanup_info(onerel, vacrelstats);
+
/* Remove index entries */
for (i = 0; i < nindexes; i++)
lazy_vacuum_index(Irel[i],
***************
*** 567,572 **** lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
--- 602,609 ----
if (tupgone)
{
lazy_record_dead_tuple(vacrelstats, &(tuple.t_self));
+ HeapTupleHeaderAdvanceLatestRemovedXid(tuple.t_data,
+ &vacrelstats->latestRemovedXid);
tups_vacuumed += 1;
}
else
***************
*** 677,682 **** lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
--- 714,722 ----
/* XXX put a threshold on min number of tuples here? */
if (vacrelstats->num_dead_tuples > 0)
{
+ /* Log cleanup info before we touch indexes */
+ vacuum_log_cleanup_info(onerel, vacrelstats);
+
/* Remove index entries */
for (i = 0; i < nindexes; i++)
lazy_vacuum_index(Irel[i],
***************
*** 821,827 **** lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
recptr = log_heap_clean(onerel, buffer,
NULL, 0, NULL, 0,
unused, uncnt,
! false);
PageSetLSN(page, recptr);
PageSetTLI(page, ThisTimeLineID);
}
--- 861,867 ----
recptr = log_heap_clean(onerel, buffer,
NULL, 0, NULL, 0,
unused, uncnt,
! vacrelstats->latestRemovedXid, false);
PageSetLSN(page, recptr);
PageSetTLI(page, ThisTimeLineID);
}
*** src/backend/postmaster/bgwriter.c
--- src/backend/postmaster/bgwriter.c
***************
*** 49,54 ****
--- 49,55 ----
#include
#include "access/xlog_internal.h"
+ #include "catalog/pg_control.h"
#include "libpq/pqsignal.h"
#include "miscadmin.h"
#include "pgstat.h"
***************
*** 129,134 **** typedef struct
--- 130,142 ----
int ckpt_flags; /* checkpoint flags, as defined in xlog.h */
+ /*
+ * When the Startup process wants bgwriter to perform a restartpoint, it
+ * sets these fields so that we can update the control file afterwards.
+ */
+ XLogRecPtr ReadPtr; /* Requested log pointer */
+ CheckPoint restartPoint; /* restartPoint data for ControlFile */
+
uint32 num_backend_writes; /* counts non-bgwriter buffer writes */
int num_requests; /* current # of requests */
***************
*** 165,171 **** static bool ckpt_active = false;
/* these values are valid when ckpt_active is true: */
static pg_time_t ckpt_start_time;
! static XLogRecPtr ckpt_start_recptr;
static double ckpt_cached_elapsed;
static pg_time_t last_checkpoint_time;
--- 173,179 ----
/* these values are valid when ckpt_active is true: */
static pg_time_t ckpt_start_time;
! static XLogRecPtr ckpt_start_recptr; /* not used if IsRecoveryProcessingMode */
static double ckpt_cached_elapsed;
static pg_time_t last_checkpoint_time;
***************
*** 197,202 **** BackgroundWriterMain(void)
--- 205,211 ----
{
sigjmp_buf local_sigjmp_buf;
MemoryContext bgwriter_context;
+ bool BgWriterRecoveryMode;
BgWriterShmem->bgwriter_pid = MyProcPid;
am_bg_writer = true;
***************
*** 355,370 **** BackgroundWriterMain(void)
*/
PG_SETMASK(&UnBlockSig);
/*
* Loop forever
*/
for (;;)
{
- bool do_checkpoint = false;
- int flags = 0;
- pg_time_t now;
- int elapsed_secs;
-
/*
* Emergency bailout if postmaster has died. This is to avoid the
* necessity for manual cleanup of all postmaster children.
--- 364,380 ----
*/
PG_SETMASK(&UnBlockSig);
+ BgWriterRecoveryMode = IsRecoveryProcessingMode();
+
+ if (BgWriterRecoveryMode)
+ elog(DEBUG1, "bgwriter starting during recovery, pid = %u",
+ BgWriterShmem->bgwriter_pid);
+
/*
* Loop forever
*/
for (;;)
{
/*
* Emergency bailout if postmaster has died. This is to avoid the
* necessity for manual cleanup of all postmaster children.
***************
*** 372,499 **** BackgroundWriterMain(void)
if (!PostmasterIsAlive(true))
exit(1);
- /*
- * Process any requests or signals received recently.
- */
- AbsorbFsyncRequests();
-
if (got_SIGHUP)
{
got_SIGHUP = false;
ProcessConfigFile(PGC_SIGHUP);
}
- if (checkpoint_requested)
- {
- checkpoint_requested = false;
- do_checkpoint = true;
- BgWriterStats.m_requested_checkpoints++;
- }
- if (shutdown_requested)
- {
- /*
- * From here on, elog(ERROR) should end with exit(1), not send
- * control back to the sigsetjmp block above
- */
- ExitOnAnyError = true;
- /* Close down the database */
- ShutdownXLOG(0, 0);
- /* Normal exit from the bgwriter is here */
- proc_exit(0); /* done */
- }
! /*
! * Force a checkpoint if too much time has elapsed since the last one.
! * Note that we count a timed checkpoint in stats only when this
! * occurs without an external request, but we set the CAUSE_TIME flag
! * bit even if there is also an external request.
! */
! now = (pg_time_t) time(NULL);
! elapsed_secs = now - last_checkpoint_time;
! if (elapsed_secs >= CheckPointTimeout)
{
! if (!do_checkpoint)
! BgWriterStats.m_timed_checkpoints++;
! do_checkpoint = true;
! flags |= CHECKPOINT_CAUSE_TIME;
! }
! /*
! * Do a checkpoint if requested, otherwise do one cycle of
! * dirty-buffer writing.
! */
! if (do_checkpoint)
! {
! /* use volatile pointer to prevent code rearrangement */
! volatile BgWriterShmemStruct *bgs = BgWriterShmem;
! /*
! * Atomically fetch the request flags to figure out what kind of a
! * checkpoint we should perform, and increase the started-counter
! * to acknowledge that we've started a new checkpoint.
! */
! SpinLockAcquire(&bgs->ckpt_lck);
! flags |= bgs->ckpt_flags;
! bgs->ckpt_flags = 0;
! bgs->ckpt_started++;
! SpinLockRelease(&bgs->ckpt_lck);
! /*
! * We will warn if (a) too soon since last checkpoint (whatever
! * caused it) and (b) somebody set the CHECKPOINT_CAUSE_XLOG flag
! * since the last checkpoint start. Note in particular that this
! * implementation will not generate warnings caused by
! * CheckPointTimeout < CheckPointWarning.
! */
! if ((flags & CHECKPOINT_CAUSE_XLOG) &&
! elapsed_secs < CheckPointWarning)
! ereport(LOG,
! (errmsg("checkpoints are occurring too frequently (%d seconds apart)",
! elapsed_secs),
! errhint("Consider increasing the configuration parameter \"checkpoint_segments\".")));
/*
! * Initialize bgwriter-private variables used during checkpoint.
*/
! ckpt_active = true;
! ckpt_start_recptr = GetInsertRecPtr();
! ckpt_start_time = now;
! ckpt_cached_elapsed = 0;
! /*
! * Do the checkpoint.
! */
! CreateCheckPoint(flags);
/*
! * After any checkpoint, close all smgr files. This is so we
! * won't hang onto smgr references to deleted files indefinitely.
*/
! smgrcloseall();
/*
! * Indicate checkpoint completion to any waiting backends.
*/
! SpinLockAcquire(&bgs->ckpt_lck);
! bgs->ckpt_done = bgs->ckpt_started;
! SpinLockRelease(&bgs->ckpt_lck);
! ckpt_active = false;
! /*
! * Note we record the checkpoint start time not end time as
! * last_checkpoint_time. This is so that time-driven checkpoints
! * happen at a predictable spacing.
! */
! last_checkpoint_time = now;
}
- else
- BgBufferSync();
-
- /* Check for archive_timeout and switch xlog files if necessary. */
- CheckArchiveTimeout();
-
- /* Nap for the configured time. */
- BgWriterNap();
}
}
--- 382,595 ----
if (!PostmasterIsAlive(true))
exit(1);
if (got_SIGHUP)
{
got_SIGHUP = false;
ProcessConfigFile(PGC_SIGHUP);
}
! if (BgWriterRecoveryMode)
{
! if (shutdown_requested)
! {
! /*
! * From here on, elog(ERROR) should end with exit(1), not send
! * control back to the sigsetjmp block above
! */
! ExitOnAnyError = true;
! /* Normal exit from the bgwriter is here */
! proc_exit(0); /* done */
! }
! if (!IsRecoveryProcessingMode())
! {
! elog(DEBUG2, "bgwriter changing from recovery to normal mode");
!
! InitXLOGAccess();
! BgWriterRecoveryMode = false;
!
! /*
! * Start time-driven events from now
! */
! last_checkpoint_time = last_xlog_switch_time = (pg_time_t) time(NULL);
!
! /*
! * Notice that we do *not* act on a checkpoint_requested
! * state at this point. We have changed mode, so we wish to
! * perform a checkpoint not a restartpoint.
! */
! continue;
! }
! if (checkpoint_requested)
! {
! XLogRecPtr ReadPtr;
! CheckPoint restartPoint;
!
! checkpoint_requested = false;
!
! /*
! * Initialize bgwriter-private variables used during checkpoint.
! */
! ckpt_active = true;
! ckpt_start_time = (pg_time_t) time(NULL);
! ckpt_cached_elapsed = 0;
!
! /*
! * Get the requested values from shared memory that the
! * Startup process has put there for us.
! */
! SpinLockAcquire(&BgWriterShmem->ckpt_lck);
! ReadPtr = BgWriterShmem->ReadPtr;
! memcpy(&restartPoint, &BgWriterShmem->restartPoint, sizeof(CheckPoint));
! SpinLockRelease(&BgWriterShmem->ckpt_lck);
!
! /* Use smoothed writes, until interrupted if ever */
! CreateRestartPoint(ReadPtr, &restartPoint, 0);
!
! /*
! * After any checkpoint, close all smgr files. This is so we
! * won't hang onto smgr references to deleted files indefinitely.
! */
! smgrcloseall();
!
! ckpt_active = false;
! checkpoint_requested = false;
! }
! else
! {
! /* Clean buffers dirtied by recovery */
! BgBufferSync();
! /* Nap for the configured time. */
! BgWriterNap();
! }
! }
! else /* Normal processing */
! {
! bool do_checkpoint = false;
! int flags = 0;
! pg_time_t now;
! int elapsed_secs;
/*
! * Process any requests or signals received recently.
*/
! AbsorbFsyncRequests();
! if (checkpoint_requested)
! {
! checkpoint_requested = false;
! do_checkpoint = true;
! BgWriterStats.m_requested_checkpoints++;
! }
! if (shutdown_requested)
! {
! /*
! * From here on, elog(ERROR) should end with exit(1), not send
! * control back to the sigsetjmp block above
! */
! ExitOnAnyError = true;
! /* Close down the database */
! ShutdownXLOG(0, 0);
! /* Normal exit from the bgwriter is here */
! proc_exit(0); /* done */
! }
/*
! * Force a checkpoint if too much time has elapsed since the last one.
! * Note that we count a timed checkpoint in stats only when this
! * occurs without an external request, but we set the CAUSE_TIME flag
! * bit even if there is also an external request.
*/
! now = (pg_time_t) time(NULL);
! elapsed_secs = now - last_checkpoint_time;
! if (elapsed_secs >= CheckPointTimeout)
! {
! if (!do_checkpoint)
! BgWriterStats.m_timed_checkpoints++;
! do_checkpoint = true;
! flags |= CHECKPOINT_CAUSE_TIME;
! }
/*
! * Do a checkpoint if requested, otherwise do one cycle of
! * dirty-buffer writing.
*/
! if (do_checkpoint)
! {
! /* use volatile pointer to prevent code rearrangement */
! volatile BgWriterShmemStruct *bgs = BgWriterShmem;
!
! /*
! * Atomically fetch the request flags to figure out what kind of a
! * checkpoint we should perform, and increase the started-counter
! * to acknowledge that we've started a new checkpoint.
! */
! SpinLockAcquire(&bgs->ckpt_lck);
! flags |= bgs->ckpt_flags;
! bgs->ckpt_flags = 0;
! bgs->ckpt_started++;
! SpinLockRelease(&bgs->ckpt_lck);
!
! /*
! * We will warn if (a) too soon since last checkpoint (whatever
! * caused it) and (b) somebody set the CHECKPOINT_CAUSE_XLOG flag
! * since the last checkpoint start. Note in particular that this
! * implementation will not generate warnings caused by
! * CheckPointTimeout < CheckPointWarning.
! */
! if ((flags & CHECKPOINT_CAUSE_XLOG) &&
! elapsed_secs < CheckPointWarning)
! ereport(LOG,
! (errmsg("checkpoints are occurring too frequently (%d seconds apart)",
! elapsed_secs),
! errhint("Consider increasing the configuration parameter \"checkpoint_segments\".")));
!
! /*
! * Initialize bgwriter-private variables used during checkpoint.
! */
! ckpt_active = true;
! ckpt_start_recptr = GetInsertRecPtr();
! ckpt_start_time = now;
! ckpt_cached_elapsed = 0;
!
! /*
! * Do the checkpoint.
! */
! CreateCheckPoint(flags);
!
! /*
! * After any checkpoint, close all smgr files. This is so we
! * won't hang onto smgr references to deleted files indefinitely.
! */
! smgrcloseall();
!
! /*
! * Indicate checkpoint completion to any waiting backends.
! */
! SpinLockAcquire(&bgs->ckpt_lck);
! bgs->ckpt_done = bgs->ckpt_started;
! SpinLockRelease(&bgs->ckpt_lck);
!
! ckpt_active = false;
!
! /*
! * Note we record the checkpoint start time not end time as
! * last_checkpoint_time. This is so that time-driven checkpoints
! * happen at a predictable spacing.
! */
! last_checkpoint_time = now;
! }
! else
! BgBufferSync();
! /* Check for archive_timeout and switch xlog files if necessary. */
! CheckArchiveTimeout();
! /* Nap for the configured time. */
! BgWriterNap();
}
}
}
***************
*** 586,592 **** BgWriterNap(void)
(ckpt_active ? ImmediateCheckpointRequested() : checkpoint_requested))
break;
pg_usleep(1000000L);
! AbsorbFsyncRequests();
udelay -= 1000000L;
}
--- 682,689 ----
(ckpt_active ? ImmediateCheckpointRequested() : checkpoint_requested))
break;
pg_usleep(1000000L);
! if (!IsRecoveryProcessingMode())
! AbsorbFsyncRequests();
udelay -= 1000000L;
}
***************
*** 640,645 **** CheckpointWriteDelay(int flags, double progress)
--- 737,755 ----
if (!am_bg_writer)
return;
+ /* Perform minimal duties during recovery and skip wait if requested */
+ if (IsRecoveryProcessingMode())
+ {
+ BgBufferSync();
+
+ if (!shutdown_requested &&
+ !checkpoint_requested &&
+ IsCheckpointOnSchedule(progress))
+ BgWriterNap();
+
+ return;
+ }
+
/*
* Perform the usual bgwriter duties and take a nap, unless we're behind
* schedule, in which case we just try to catch up as quickly as possible.
***************
*** 714,729 **** IsCheckpointOnSchedule(double progress)
* However, it's good enough for our purposes, we're only calculating an
* estimate anyway.
*/
! recptr = GetInsertRecPtr();
! elapsed_xlogs =
! (((double) (int32) (recptr.xlogid - ckpt_start_recptr.xlogid)) * XLogSegsPerFile +
! ((double) recptr.xrecoff - (double) ckpt_start_recptr.xrecoff) / XLogSegSize) /
! CheckPointSegments;
!
! if (progress < elapsed_xlogs)
{
! ckpt_cached_elapsed = elapsed_xlogs;
! return false;
}
/*
--- 824,842 ----
* However, it's good enough for our purposes, we're only calculating an
* estimate anyway.
*/
! if (!IsRecoveryProcessingMode())
{
! recptr = GetInsertRecPtr();
! elapsed_xlogs =
! (((double) (int32) (recptr.xlogid - ckpt_start_recptr.xlogid)) * XLogSegsPerFile +
! ((double) recptr.xrecoff - (double) ckpt_start_recptr.xrecoff) / XLogSegSize) /
! CheckPointSegments;
!
! if (progress < elapsed_xlogs)
! {
! ckpt_cached_elapsed = elapsed_xlogs;
! return false;
! }
}
/*
***************
*** 989,994 **** RequestCheckpoint(int flags)
--- 1102,1180 ----
}
/*
+ * Always runs in Startup process (see xlog.c)
+ */
+ void
+ RequestRestartPoint(const XLogRecPtr ReadPtr, const CheckPoint *restartPoint, bool sendToBGWriter)
+ {
+ /*
+ * Should we just do it ourselves?
+ */
+ if (!IsPostmasterEnvironment || !sendToBGWriter)
+ {
+ CreateRestartPoint(ReadPtr, restartPoint, CHECKPOINT_IMMEDIATE);
+ return;
+ }
+
+ /*
+ * Push requested values into shared memory, then signal to request restartpoint.
+ */
+ 	if (BgWriterShmem->bgwriter_pid == 0)
+ 	{
+ 		elog(LOG, "could not request restartpoint because bgwriter not running");
+ 		return;
+ 	}
+
+ SpinLockAcquire(&BgWriterShmem->ckpt_lck);
+ BgWriterShmem->ReadPtr = ReadPtr;
+ memcpy(&BgWriterShmem->restartPoint, restartPoint, sizeof(CheckPoint));
+ SpinLockRelease(&BgWriterShmem->ckpt_lck);
+
+ if (kill(BgWriterShmem->bgwriter_pid, SIGINT) != 0)
+ elog(LOG, "could not signal for restartpoint: %m");
+ }
+
+ /*
+ * Sends another checkpoint request signal to bgwriter, which causes it
+ * to avoid smoothed writes and continue processing as if it had been
+ * called with CHECKPOINT_IMMEDIATE. This is used at the end of recovery.
+ */
+ void
+ RequestRestartPointCompletion(void)
+ {
+ if (BgWriterShmem->bgwriter_pid != 0 &&
+ kill(BgWriterShmem->bgwriter_pid, SIGINT) != 0)
+ elog(LOG, "could not signal for restartpoint immediate: %m");
+ }
+
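+ /*
+  * GetRedoLocationForArchiveCheckpoint
+  *
+  * Returns the redo location previously stored in shared memory, for use
+  * by the checkpoint written at the end of archive recovery (see also
+  * SetRedoLocationForArchiveCheckpoint).
+  */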
+ XLogRecPtr
+ GetRedoLocationForArchiveCheckpoint(void)
+ {
+ XLogRecPtr redo;
+
+ SpinLockAcquire(&BgWriterShmem->ckpt_lck);
+ redo = BgWriterShmem->ReadPtr;
+ SpinLockRelease(&BgWriterShmem->ckpt_lck);
+
+ return redo;
+ }
+
+ /*
+ * Store the information needed for a checkpoint at the end of recovery.
+ * Returns true if the bgwriter can perform the checkpoint, or false if the
+ * bgwriter is not active or otherwise unable to comply.
+ */
+ bool
+ SetRedoLocationForArchiveCheckpoint(XLogRecPtr redo)
+ {
+ SpinLockAcquire(&BgWriterShmem->ckpt_lck);
+ BgWriterShmem->ReadPtr = redo;
+ SpinLockRelease(&BgWriterShmem->ckpt_lck);
+
+ if (BgWriterShmem->bgwriter_pid == 0 || !IsPostmasterEnvironment)
+ return false;
+
+ return true;
+ }
+
+ /*
* ForwardFsyncRequest
* Forward a file-fsync request from a backend to the bgwriter
*
*** src/backend/postmaster/postmaster.c
--- src/backend/postmaster/postmaster.c
***************
*** 230,237 **** static bool FatalError = false; /* T if recovering from backend crash */
* We use a simple state machine to control startup, shutdown, and
* crash recovery (which is rather like shutdown followed by startup).
*
! * Normal child backends can only be launched when we are in PM_RUN state.
! * (We also allow it in PM_WAIT_BACKUP state, but only for superusers.)
* In other states we handle connection requests by launching "dead_end"
* child processes, which will simply send the client an error message and
* quit. (We track these in the BackendList so that we can know when they
--- 230,239 ----
* We use a simple state machine to control startup, shutdown, and
* crash recovery (which is rather like shutdown followed by startup).
*
! * Normal child backends can only be launched when we are in PM_RUN or
! * PM_RECOVERY state. Any transaction started in PM_RECOVERY state will
! * be read-only for the whole of its life. (We also allow launch of normal
! * child backends in PM_WAIT_BACKUP state, but only for superusers.)
* In other states we handle connection requests by launching "dead_end"
* child processes, which will simply send the client an error message and
* quit. (We track these in the BackendList so that we can know when they
***************
*** 254,259 **** typedef enum
--- 256,266 ----
{
PM_INIT, /* postmaster starting */
PM_STARTUP, /* waiting for startup subprocess */
+ PM_RECOVERY, /* consistent recovery mode; state only
+ * entered for archive and streaming recovery,
+ 							 * and only after the point where all data
+ 							 * is in a consistent state.
+ */
PM_RUN, /* normal "database is alive" state */
PM_WAIT_BACKUP, /* waiting for online backup mode to end */
PM_WAIT_BACKENDS, /* waiting for live backends to exit */
***************
*** 1302,1308 **** ServerLoop(void)
* state that prevents it, start one. It doesn't matter if this
* fails, we'll just try again later.
*/
! if (BgWriterPID == 0 && pmState == PM_RUN)
BgWriterPID = StartBackgroundWriter();
/*
--- 1309,1315 ----
* state that prevents it, start one. It doesn't matter if this
* fails, we'll just try again later.
*/
! if (BgWriterPID == 0 && (pmState == PM_RUN || pmState == PM_RECOVERY))
BgWriterPID = StartBackgroundWriter();
/*
***************
*** 1651,1661 **** retry1:
(errcode(ERRCODE_CANNOT_CONNECT_NOW),
errmsg("the database system is shutting down")));
break;
- case CAC_RECOVERY:
- ereport(FATAL,
- (errcode(ERRCODE_CANNOT_CONNECT_NOW),
- errmsg("the database system is in recovery mode")));
- break;
case CAC_TOOMANY:
ereport(FATAL,
(errcode(ERRCODE_TOO_MANY_CONNECTIONS),
--- 1658,1663 ----
***************
*** 1664,1669 **** retry1:
--- 1666,1672 ----
case CAC_WAITBACKUP:
/* OK for now, will check in InitPostgres */
break;
+ case CAC_RECOVERY:
case CAC_OK:
break;
}
***************
*** 1982,1991 **** pmdie(SIGNAL_ARGS)
ereport(LOG,
(errmsg("received smart shutdown request")));
! if (pmState == PM_RUN)
{
/* autovacuum workers are told to shut down immediately */
! SignalAutovacWorkers(SIGTERM);
/* and the autovac launcher too */
if (AutoVacPID != 0)
signal_child(AutoVacPID, SIGTERM);
--- 1985,1995 ----
ereport(LOG,
(errmsg("received smart shutdown request")));
! if (pmState == PM_RUN || pmState == PM_RECOVERY)
{
/* autovacuum workers are told to shut down immediately */
! if (pmState == PM_RUN)
! SignalAutovacWorkers(SIGTERM);
/* and the autovac launcher too */
if (AutoVacPID != 0)
signal_child(AutoVacPID, SIGTERM);
***************
*** 2019,2025 **** pmdie(SIGNAL_ARGS)
if (StartupPID != 0)
signal_child(StartupPID, SIGTERM);
! if (pmState == PM_RUN || pmState == PM_WAIT_BACKUP)
{
ereport(LOG,
(errmsg("aborting any active transactions")));
--- 2023,2029 ----
if (StartupPID != 0)
signal_child(StartupPID, SIGTERM);
! if (pmState == PM_RUN || pmState == PM_RECOVERY || pmState == PM_WAIT_BACKUP)
{
ereport(LOG,
(errmsg("aborting any active transactions")));
***************
*** 2115,2122 **** reaper(SIGNAL_ARGS)
*/
if (pid == StartupPID)
{
StartupPID = 0;
! Assert(pmState == PM_STARTUP);
/* FATAL exit of startup is treated as catastrophic */
if (!EXIT_STATUS_0(exitstatus))
--- 2119,2129 ----
*/
if (pid == StartupPID)
{
+ bool leavingRecovery = (pmState == PM_RECOVERY);
+
StartupPID = 0;
! Assert(pmState == PM_STARTUP || pmState == PM_RECOVERY ||
! pmState == PM_WAIT_BACKUP || pmState == PM_WAIT_BACKENDS);
/* FATAL exit of startup is treated as catastrophic */
if (!EXIT_STATUS_0(exitstatus))
***************
*** 2124,2130 **** reaper(SIGNAL_ARGS)
LogChildExit(LOG, _("startup process"),
pid, exitstatus);
ereport(LOG,
! (errmsg("aborting startup due to startup process failure")));
ExitPostmaster(1);
}
--- 2131,2137 ----
LogChildExit(LOG, _("startup process"),
pid, exitstatus);
ereport(LOG,
! (errmsg("aborting startup due to startup process failure")));
ExitPostmaster(1);
}
***************
*** 2157,2166 **** reaper(SIGNAL_ARGS)
load_role();
/*
! * Crank up the background writer. It doesn't matter if this
! * fails, we'll just try again later.
*/
! Assert(BgWriterPID == 0);
BgWriterPID = StartBackgroundWriter();
/*
--- 2164,2173 ----
load_role();
/*
! 		 * Start the background writer, if it is not already running
! 		 * (it may have been started while in PM_RECOVERY state).
*/
! if (BgWriterPID == 0)
BgWriterPID = StartBackgroundWriter();
/*
***************
*** 2177,2184 **** reaper(SIGNAL_ARGS)
PgStatPID = pgstat_start();
/* at this point we are really open for business */
! ereport(LOG,
! (errmsg("database system is ready to accept connections")));
continue;
}
--- 2184,2195 ----
PgStatPID = pgstat_start();
/* at this point we are really open for business */
! if (leavingRecovery)
! ereport(LOG,
! (errmsg("database can now be accessed with read and write transactions")));
! else
! ereport(LOG,
! (errmsg("database system is ready to accept connections")));
continue;
}
***************
*** 2898,2904 **** BackendStartup(Port *port)
bn->pid = pid;
bn->cancel_key = MyCancelKey;
bn->is_autovacuum = false;
! bn->dead_end = (port->canAcceptConnections != CAC_OK &&
port->canAcceptConnections != CAC_WAITBACKUP);
DLAddHead(BackendList, DLNewElem(bn));
#ifdef EXEC_BACKEND
--- 2909,2916 ----
bn->pid = pid;
bn->cancel_key = MyCancelKey;
bn->is_autovacuum = false;
! bn->dead_end = (!(port->canAcceptConnections == CAC_RECOVERY ||
! port->canAcceptConnections == CAC_OK) &&
port->canAcceptConnections != CAC_WAITBACKUP);
DLAddHead(BackendList, DLNewElem(bn));
#ifdef EXEC_BACKEND
***************
*** 3847,3852 **** sigusr1_handler(SIGNAL_ARGS)
--- 3859,3911 ----
PG_SETMASK(&BlockSig);
+ if (CheckPostmasterSignal(PMSIGNAL_RECOVERY_START))
+ {
+ Assert(pmState == PM_STARTUP);
+
+ /*
+ * Go to shutdown mode if a shutdown request was pending.
+ */
+ if (Shutdown > NoShutdown)
+ {
+ pmState = PM_WAIT_BACKENDS;
+ /* PostmasterStateMachine logic does the rest */
+ }
+ else
+ {
+ /*
+ * Startup process has entered recovery
+ */
+ pmState = PM_RECOVERY;
+
+ /*
+ * Load the flat authorization file into postmaster's cache. The
+ * startup process won't have recomputed this from the database yet,
+ 			 * so it may change following recovery.
+ */
+ load_role();
+
+ /*
+ * Crank up the background writer. It doesn't matter if this
+ * fails, we'll just try again later.
+ */
+ Assert(BgWriterPID == 0);
+ BgWriterPID = StartBackgroundWriter();
+
+ /*
+ * Likewise, start other special children as needed.
+ */
+ Assert(PgStatPID == 0);
+ PgStatPID = pgstat_start();
+
+ /* We can now accept read-only connections */
+ ereport(LOG,
+ (errmsg("database system is ready to accept connections")));
+ ereport(LOG,
+ 			(errmsg("database can now be accessed with read-only transactions")));
+ }
+ }
+
if (CheckPostmasterSignal(PMSIGNAL_PASSWORD_CHANGE))
{
/*
*** src/backend/storage/buffer/README
--- src/backend/storage/buffer/README
***************
*** 268,270 **** out (and anyone else who flushes buffer contents to disk must do so too).
--- 268,279 ----
This ensures that the page image transferred to disk is reasonably consistent.
We might miss a hint-bit update or two but that isn't a problem, for the same
reasons mentioned under buffer access rules.
+
+ As of 8.4, the background writer also starts during recovery, whenever
+ there is some form of potentially extended recovery to perform. It performs
+ the same service as in normal processing, except that the checkpoints it
+ writes are technically restartpoints. Flushing outstanding WAL for dirty
+ buffers is also skipped, though there shouldn't ever be new WAL entries at
+ that time in any case. We could start the background writer immediately,
+ but we hold off until we can prove the database is in a consistent state,
+ so that the postmaster has a single, clean state change.
*** src/backend/storage/buffer/bufmgr.c
--- src/backend/storage/buffer/bufmgr.c
***************
*** 71,77 **** static bool IsForInput;
/* local state for LockBufferForCleanup */
static volatile BufferDesc *PinCountWaitBuf = NULL;
!
static Buffer ReadBuffer_common(SMgrRelation reln, bool isLocalBuf,
ForkNumber forkNum, BlockNumber blockNum,
--- 71,79 ----
/* local state for LockBufferForCleanup */
static volatile BufferDesc *PinCountWaitBuf = NULL;
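! /* Accumulated time spent waiting for buffer cleanup locks during recovery */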
! static long CleanupWaitSecs = 0;
! static int CleanupWaitUSecs = 0;
! static bool CleanupWaitStats = false;
static Buffer ReadBuffer_common(SMgrRelation reln, bool isLocalBuf,
ForkNumber forkNum, BlockNumber blockNum,
***************
*** 2308,2313 **** ConditionalLockBuffer(Buffer buffer)
--- 2310,2362 ----
}
/*
+ * On standby servers, only the Startup process applies cleanup operations.
+ * As a result, a single buffer pin can be enough to effectively halt recovery
+ * for short periods. We need special instrumentation to monitor this so we
+ * can judge whether additional measures are required to control the negative
+ * effects.
+ */
+ void
+ StartCleanupDelayStats(void)
+ {
+ CleanupWaitSecs = 0;
+ CleanupWaitUSecs = 0;
+ CleanupWaitStats = true;
+ }
+
+ void
+ EndCleanupDelayStats(void)
+ {
+ CleanupWaitStats = false;
+ }
+
+ /*
+ * Called by Startup process whenever we request restartpoint
+ */
+ void
+ ReportCleanupDelayStats(void)
+ {
+ elog(trace_recovery(DEBUG2), "cleanup wait total=%ld.%03d s",
+ CleanupWaitSecs, CleanupWaitUSecs / 1000);
+ }
+
+ static void
+ CleanupDelayStats(TimestampTz start_ts, TimestampTz end_ts)
+ {
+ long wait_secs;
+ int wait_usecs;
+
+ TimestampDifference(start_ts, end_ts, &wait_secs, &wait_usecs);
+
+ 	CleanupWaitSecs += wait_secs;
+ 	CleanupWaitUSecs += wait_usecs;
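+ 	/* carry any microsecond overflow into the seconds counter */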
+ if (CleanupWaitUSecs > 999999)
+ {
+ CleanupWaitSecs += 1;
+ CleanupWaitUSecs -= 1000000;
+ }
+ }
+
+ /*
* LockBufferForCleanup - lock a buffer in preparation for deleting items
*
* Items may be deleted from a disk page only when the caller (a) holds an
***************
*** 2350,2355 **** LockBufferForCleanup(Buffer buffer)
--- 2399,2406 ----
for (;;)
{
+ TimestampTz start_ts = 0;
+
/* Try to acquire lock */
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
LockBufHdr(bufHdr);
***************
*** 2372,2380 **** LockBufferForCleanup(Buffer buffer)
--- 2423,2436 ----
PinCountWaitBuf = bufHdr;
UnlockBufHdr(bufHdr);
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+ if (CleanupWaitStats)
+ start_ts = GetCurrentTimestamp();
/* Wait to be signaled by UnpinBuffer() */
ProcWaitForSignal();
PinCountWaitBuf = NULL;
+ if (CleanupWaitStats)
+ CleanupDelayStats(start_ts, GetCurrentTimestamp());
+
/* Loop back and try again */
}
}
*** src/backend/storage/freespace/freespace.c
--- src/backend/storage/freespace/freespace.c
***************
*** 211,217 **** XLogRecordPageWithFreeSpace(RelFileNode rnode, BlockNumber heapBlk,
blkno = fsm_logical_to_physical(addr);
/* If the page doesn't exist already, extend */
! buf = XLogReadBufferExtended(rnode, FSM_FORKNUM, blkno, RBM_ZERO_ON_ERROR);
page = BufferGetPage(buf);
if (PageIsNew(page))
PageInit(page, BLCKSZ, 0);
--- 211,218 ----
blkno = fsm_logical_to_physical(addr);
/* If the page doesn't exist already, extend */
! buf = XLogReadBufferExtended(rnode, FSM_FORKNUM, blkno,
! RBM_ZERO_ON_ERROR, BUFFER_LOCK_CLEANUP);
page = BufferGetPage(buf);
if (PageIsNew(page))
PageInit(page, BLCKSZ, 0);
*** src/backend/storage/ipc/procarray.c
--- src/backend/storage/ipc/procarray.c
***************
*** 17,22 ****
--- 17,37 ----
* as are the myProcLocks lists. They can be distinguished from regular
* backend PGPROCs at need by checking for pid == 0.
*
+  * The process array now also includes PGPROC structures representing
+  * transactions being recovered. The xid and subxids fields of these are
+  * valid, though few other fields are. They can be distinguished from regular
+  * backend PGPROCs by checking for pid == 0. The proc array also has a
+  * secondary array of UnobservedXids representing transactions that are
+  * known to be running on the master but for which we do not yet have a
+  * recovery proc. We infer the existence of UnobservedXids by watching the
+  * sequence of arriving xids: a gap in the sequence implies an xid that has
+  * been assigned but for which no WAL record has yet been seen. This is very
+  * important because if we leave those xids out of the snapshot then they
+  * will appear to be already complete. Later, when they actually complete,
+  * this could lead to confusion about whether those xids are visible or not,
+  * blowing a huge hole in MVCC, so they must be included. We go to some
+  * lengths to ensure that the number of UnobservedXids is both bounded and
+  * realistically manageable; there are simpler designs, but they lead to
+  * unbounded worst-case behaviour.
*
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
***************
*** 33,56 ****
#include "access/subtrans.h"
#include "access/transam.h"
! #include "access/xact.h"
#include "access/twophase.h"
#include "miscadmin.h"
#include "storage/procarray.h"
#include "utils/snapmgr.h"
/* Our shared memory area */
typedef struct ProcArrayStruct
{
int numProcs; /* number of valid procs entries */
! int maxProcs; /* allocated size of procs array */
/*
* We declare procs[] as 1 entry because C wants a fixed-size array, but
* actually it is maxProcs entries long.
*/
PGPROC *procs[1]; /* VARIABLE LENGTH ARRAY */
} ProcArrayStruct;
static ProcArrayStruct *procArray;
--- 48,86 ----
#include "access/subtrans.h"
#include "access/transam.h"
! #include "access/xlog.h"
#include "access/twophase.h"
#include "miscadmin.h"
+ #include "storage/proc.h"
#include "storage/procarray.h"
#include "utils/snapmgr.h"
+ static RunningXactsData CurrentRunningXactsData;
+
+ /* Handy constant for an invalid xlog recptr */
+ static const XLogRecPtr InvalidXLogRecPtr = {0, 0};
+
+ void ProcArrayDisplay(int trace_level);
+
/* Our shared memory area */
typedef struct ProcArrayStruct
{
int numProcs; /* number of valid procs entries */
! int maxProcs; /* allocated size of total procs array */
!
! int maxRecoveryProcs; /* number of allocated recovery procs */
!
! int numUnobservedXids; /* number of valid unobserved xids */
! int maxUnobservedXids; /* allocated size of unobserved array */
/*
* We declare procs[] as 1 entry because C wants a fixed-size array, but
* actually it is maxProcs entries long.
*/
PGPROC *procs[1]; /* VARIABLE LENGTH ARRAY */
+
+ /* ARRAY OF UNOBSERVED TRANSACTION XIDs FOLLOWS */
} ProcArrayStruct;
static ProcArrayStruct *procArray;
***************
*** 100,107 **** ProcArrayShmemSize(void)
Size size;
size = offsetof(ProcArrayStruct, procs);
! size = add_size(size, mul_size(sizeof(PGPROC *),
! add_size(MaxBackends, max_prepared_xacts)));
return size;
}
--- 130,148 ----
Size size;
size = offsetof(ProcArrayStruct, procs);
!
! /* Normal processing */
! /* MyProc slots */
! size = add_size(size, mul_size(sizeof(PGPROC *), MaxBackends));
! size = add_size(size, mul_size(sizeof(PGPROC *), max_prepared_xacts));
!
! /* Recovery processing */
!
! /* Recovery Procs */
! size = add_size(size, mul_size(sizeof(PGPROC *), MaxBackends));
! /* UnobservedXids */
! size = add_size(size, mul_size(sizeof(TransactionId), MaxBackends));
! size = add_size(size, mul_size(sizeof(TransactionId), MaxBackends));
return size;
}
***************
*** 123,130 **** CreateSharedProcArray(void)
--- 164,209 ----
/*
* We're the first - initialize.
*/
+ /* Normal processing */
procArray->numProcs = 0;
procArray->maxProcs = MaxBackends + max_prepared_xacts;
+
+ /* Recovery processing */
+ procArray->maxRecoveryProcs = MaxBackends;
+ procArray->maxProcs += procArray->maxRecoveryProcs;
+
+ procArray->maxUnobservedXids = 2 * MaxBackends;
+ procArray->numUnobservedXids = 0;
+
+ if (!IsUnderPostmaster)
+ {
+ int i;
+
+ 		/* XXX: We should probably have a separate pool for recovery
+ 		 * procs, similar to how we handle prepared transactions. The
+ 		 * fields used only for recovery procs (lsn) could then be
+ 		 * included only for those procs, like the extra fields in
+ 		 * GlobalTransactionData.
+ 		 */
+ /*
+ * Create and add the Procs for recovery emulation.
+ *
+ * We do this now, so that we can identify which Recovery Proc
+ * goes with each normal backend. Normal procs were allocated
+ * first so we can use the slotId of the *proc* to look up
+ * the Recovery Proc in the *procarray*. Recovery Procs never
+ * move around in the procarray, whereas normal procs do.
+ * e.g. Proc with slotId=7 is always associated with procarray[7]
+ * for recovery processing. see also
+ */
+ for (i = 0; i < procArray->maxRecoveryProcs; i++)
+ {
+ PGPROC *RecoveryProc = InitRecoveryProcess();
+
+ ProcArrayAdd(RecoveryProc);
+ }
+ elog(DEBUG3, "Added %d Recovery Procs", i);
+ }
}
}
***************
*** 213,218 **** ProcArrayRemove(PGPROC *proc, TransactionId latestXid)
--- 292,306 ----
elog(LOG, "failed to find proc %p in ProcArray", proc);
}
+ /*
+ * Initialisation when we switch into PM_RECOVERY mode.
+ * Expected caller is InitRecoveryTransactionEnvironment()
+ */
+ void
+ ProcArrayInitRecoveryEnvironment(void)
+ {
+ PublishStartupProcessInformation();
+ }
/*
* ProcArrayEndTransaction -- mark a transaction as no longer running
***************
*** 220,226 **** ProcArrayRemove(PGPROC *proc, TransactionId latestXid)
* This is used interchangeably for commit and abort cases. The transaction
* commit/abort must already be reported to WAL and pg_clog.
*
! * proc is currently always MyProc, but we pass it explicitly for flexibility.
* latestXid is the latest Xid among the transaction's main XID and
* subtransactions, or InvalidTransactionId if it has no XID. (We must ask
* the caller to pass latestXid, instead of computing it from the PGPROC's
--- 308,316 ----
* This is used interchangeably for commit and abort cases. The transaction
* commit/abort must already be reported to WAL and pg_clog.
*
! * In normal running, proc is always MyProc, but in recovery we pass
! * one of the recovery procs.
! *
* latestXid is the latest Xid among the transaction's main XID and
* subtransactions, or InvalidTransactionId if it has no XID. (We must ask
* the caller to pass latestXid, instead of computing it from the PGPROC's
***************
*** 228,234 **** ProcArrayRemove(PGPROC *proc, TransactionId latestXid)
* incomplete.)
*/
void
! ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid)
{
if (TransactionIdIsValid(latestXid))
{
--- 318,325 ----
* incomplete.)
*/
void
! ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid,
! int nsubxids, TransactionId *subxids)
{
if (TransactionIdIsValid(latestXid))
{
***************
*** 253,258 **** ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid)
--- 344,370 ----
proc->subxids.nxids = 0;
proc->subxids.overflowed = false;
+ /*
+ 	 * Ensure that any subtransactions are removed from UnobservedXids.
+ * We include the subxids array so that they can be removed atomically
+ * from UnobservedXids at the same time as we zero the main xid on
+ * the Recovery proc.
+ */
+ if (nsubxids > 0)
+ {
+ int i;
+
+ Assert(subxids != NULL);
+
+ /*
+ * Ignore any failure to find the xids - this avoids complex
+ * bookkeeping solely to account for rare strangeness that
+ * would add too much overhead to be worth the cost.
+ */
+ for (i = 0; i < nsubxids; i++)
+ UnobservedTransactionsRemoveXid(subxids[i], false);
+ }
+
/* Also advance global latestCompletedXid while holding the lock */
if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid,
latestXid))
***************
*** 301,306 **** ProcArrayClearTransaction(PGPROC *proc)
--- 413,419 ----
proc->xid = InvalidTransactionId;
proc->lxid = InvalidLocalTransactionId;
proc->xmin = InvalidTransactionId;
+ proc->lsn = InvalidXLogRecPtr;
/* redundant, but just in case */
proc->vacuumFlags &= ~PROC_VACUUM_STATE_MASK;
***************
*** 311,316 **** ProcArrayClearTransaction(PGPROC *proc)
--- 424,606 ----
proc->subxids.overflowed = false;
}
+ /*
+ * ProcArrayClearRecoveryTransactions
+ *
+ * Called during recovery when we see a Shutdown checkpoint or EndRecovery
+ * record, or at the end of recovery processing.
+ */
+ void
+ ProcArrayClearRecoveryTransactions(void)
+ {
+ ProcArrayStruct *arrayP = procArray;
+ int index;
+
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
+ /*
+ * Reset Recovery Procs
+ */
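+ 	/* Recovery procs occupy the first maxRecoveryProcs slots; see CreateSharedProcArray */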
+ for (index = 0; index < arrayP->maxRecoveryProcs; index++)
+ {
+ PGPROC *RecoveryProc = arrayP->procs[index];
+
+ ProcArrayClearTransaction(RecoveryProc);
+ }
+
+ /*
+ * Clear the UnobservedXids also
+ */
+ UnobservedTransactionsClearXids();
+
+ LWLockRelease(ProcArrayLock);
+ }
+
+ /* debug support functions for recovery processing */
+ bool
+ XidInRecoveryProcs(TransactionId xid)
+ {
+ ProcArrayStruct *arrayP = procArray;
+ int index;
+
+ for (index = 0; index < arrayP->maxRecoveryProcs; index++)
+ {
+ PGPROC *RecoveryProc = arrayP->procs[index];
+
+ if (RecoveryProc->xid == xid)
+ return true;
+ }
+ return false;
+ }
+
+ void
+ ProcArrayDisplay(int trace_level)
+ {
+ ProcArrayStruct *arrayP = procArray;
+ int index;
+
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
+ for (index = 0; index < arrayP->maxRecoveryProcs; index++)
+ {
+ PGPROC *RecoveryProc = arrayP->procs[index];
+
+ if (TransactionIdIsValid(RecoveryProc->xid))
+ elog(trace_level,
+ "proc %d proc->xid %d proc->lsn %X/%X", index, RecoveryProc->xid,
+ RecoveryProc->lsn.xlogid, RecoveryProc->lsn.xrecoff);
+ }
+
+ UnobservedTransactionsDisplay(trace_level);
+
+ LWLockRelease(ProcArrayLock);
+ }
+
+ /*
+ * Use the data about running transactions on master to either create the
+ * initial state of the Recovery Procs, or maintain correctness of their
+ * state. This is almost the opposite of GetSnapshotData().
+ *
+ * Only used during recovery. Notice the signature is very similar to a
+ * _redo function.
+ */
+ void
+ ProcArrayUpdateRecoveryTransactions(XLogRecPtr lsn, xl_xact_running_xacts *xlrec)
+ {
+ ProcArrayStruct *arrayP = procArray;
+ int xid_index;
+ TransactionId *subxip = (TransactionId *) &(xlrec->xrun[xlrec->xcnt]);
+ int index;
+
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
+ for (xid_index = 0; xid_index < xlrec->xcnt; xid_index++)
+ {
+ RunningXact *rxact = (RunningXact *) xlrec->xrun;
+ PGPROC *proc = NULL;
+ 		TransactionId xid = rxact[xid_index].xid;
+
+ for (index = 0; index < arrayP->numProcs; index++)
+ {
+ PGPROC *p = arrayP->procs[index];
+
+ if (p->xid == xid)
+ {
+ proc = p;
+ break;
+ }
+ }
+
+ if (proc == NULL)
+ {
+ /* TODO should add it to array here */
+ continue;
+ }
+
+ elog(trace_recovery(DEBUG5),
+ "running xact proc->lsn %X/%X lsn %X/%X proc->xid %d xid %d",
+ proc->lsn.xlogid, proc->lsn.xrecoff,
+ lsn.xlogid, lsn.xrecoff, proc->xid, rxact[xid_index].xid);
+ /*
+ 		 * If our state information is later for this proc, then
+ 		 * overwrite it. It's possible for a commit, and possibly a new
+ 		 * transaction record, to have arrived in WAL in between us
+ 		 * doing GetRunningTransactionData() and grabbing the
+ 		 * WALInsertLock, so we mustn't assume we always know best.
+ */
+ if (XLByteLT(proc->lsn, lsn))
+ {
+ proc->lsn = lsn;
+ proc->xid = rxact[xid_index].xid;
+ 			/* proc->pid stays 0 for Recovery Procs */
+ proc->databaseId = rxact[xid_index].databaseId;
+ proc->roleId = rxact[xid_index].roleId;
+ proc->vacuumFlags = rxact[xid_index].vacuumFlags;
+
+ proc->subxids.nxids = rxact[xid_index].nsubxids;
+ proc->subxids.overflowed = rxact[xid_index].overflowed;
+
+ memcpy(proc->subxids.xids, subxip,
+ rxact[xid_index].nsubxids * sizeof(TransactionId));
+ }
+ }
+
+ /*
+ * Scan the proc array for stale recovery PGPROC entries, and
+ * remove them.
+ */
+ for (index = 0; index < arrayP->numProcs; index++)
+ {
+ PGPROC *p = arrayP->procs[index];
+
+ 		if (p->pid == 0 && !XLogRecPtrIsInvalid(p->lsn) && XLByteLT(p->lsn, lsn))
+ 		{
+ 			arrayP->procs[index] = arrayP->procs[arrayP->numProcs - 1];
+ 			arrayP->numProcs--;
+ 			FreeRecoveryProcess(p);
+ 			index--;			/* revisit the entry we just moved into this slot */
+ 		}
+ }
+
+ /* Advance global latestCompletedXid while holding the lock */
+ if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid,
+ xlrec->latestCompletedXid))
+ ShmemVariableCache->latestCompletedXid = xlrec->latestCompletedXid;
+
+ /*
+ 	 * Left-prune the UnobservedXids array up to latestRunningXid.
+ 	 * This is correct because at the time we take this snapshot, all
+ 	 * completed transactions prior to latestRunningXid will have been
+ 	 * marked in WAL, so we won't ever see a WAL record for them again.
+ *
+ * We can't clear the array completely because race conditions allow
+ * things to slip through sometimes.
+ */
+ UnobservedTransactionsPruneXids(xlrec->latestRunningXid);
+
+ LWLockRelease(ProcArrayLock);
+
+ ProcArrayDisplay(trace_recovery(DEBUG5));
+ }
/*
* TransactionIdIsInProgress -- is given transaction running in some backend
***************
*** 655,661 **** GetOldestXmin(bool allDbs, bool ignoreVacuum)
* but since PGPROC has only a limited cache area for subxact XIDs, full
* information may not be available. If we find any overflowed subxid arrays,
* we have to mark the snapshot's subxid data as overflowed, and extra work
! * will need to be done to determine what's running (see XidInMVCCSnapshot()
* in tqual.c).
*
* We also update the following backend-global variables:
--- 945,951 ----
* but since PGPROC has only a limited cache area for subxact XIDs, full
* information may not be available. If we find any overflowed subxid arrays,
* we have to mark the snapshot's subxid data as overflowed, and extra work
! * *may* need to be done to determine what's running (see XidInMVCCSnapshot()
* in tqual.c).
*
* We also update the following backend-global variables:
***************
*** 680,685 **** GetSnapshotData(Snapshot snapshot)
--- 970,976 ----
int index;
int count = 0;
int subcount = 0;
+ bool suboverflowed = false;
Assert(snapshot != NULL);
***************
*** 706,713 **** GetSnapshotData(Snapshot snapshot)
(errcode(ERRCODE_OUT_OF_MEMORY),
errmsg("out of memory")));
Assert(snapshot->subxip == NULL);
snapshot->subxip = (TransactionId *)
! malloc(arrayP->maxProcs * PGPROC_MAX_CACHED_SUBXIDS * sizeof(TransactionId));
if (snapshot->subxip == NULL)
ereport(ERROR,
(errcode(ERRCODE_OUT_OF_MEMORY),
--- 997,1005 ----
(errcode(ERRCODE_OUT_OF_MEMORY),
errmsg("out of memory")));
Assert(snapshot->subxip == NULL);
+ #define maxNumSubXids (arrayP->maxProcs * PGPROC_MAX_CACHED_SUBXIDS)
snapshot->subxip = (TransactionId *)
! malloc(maxNumSubXids * sizeof(TransactionId));
if (snapshot->subxip == NULL)
ereport(ERROR,
(errcode(ERRCODE_OUT_OF_MEMORY),
***************
*** 771,781 **** GetSnapshotData(Snapshot snapshot)
}
/*
! * Save subtransaction XIDs if possible (if we've already overflowed,
! * there's no point). Note that the subxact XIDs must be later than
! * their parent, so no need to check them against xmin. We could
! * filter against xmax, but it seems better not to do that much work
! * while holding the ProcArrayLock.
*
* The other backend can add more subxids concurrently, but cannot
* remove any. Hence it's important to fetch nxids just once. Should
--- 1063,1073 ----
}
/*
! * Save subtransaction XIDs, whether or not we have overflowed.
! * Note that the subxact XIDs must be later than their parent, so no
! * need to check them against xmin. We could filter against xmax,
! * but it seems better not to do that much work while holding the
! * ProcArrayLock.
*
* The other backend can add more subxids concurrently, but cannot
* remove any. Hence it's important to fetch nxids just once. Should
***************
*** 784,806 **** GetSnapshotData(Snapshot snapshot)
*
* Again, our own XIDs are not included in the snapshot.
*/
! if (subcount >= 0 && proc != MyProc)
! {
! if (proc->subxids.overflowed)
! subcount = -1; /* overflowed */
! else
{
int nxids = proc->subxids.nxids;
if (nxids > 0)
{
memcpy(snapshot->subxip + subcount,
(void *) proc->subxids.xids,
nxids * sizeof(TransactionId));
subcount += nxids;
}
}
}
}
if (!TransactionIdIsValid(MyProc->xmin))
--- 1076,1144 ----
*
* Again, our own XIDs are not included in the snapshot.
*/
! if (proc != MyProc)
{
int nxids = proc->subxids.nxids;
if (nxids > 0)
{
+ if (proc->subxids.overflowed)
+ suboverflowed = true;
+
memcpy(snapshot->subxip + subcount,
(void *) proc->subxids.xids,
nxids * sizeof(TransactionId));
subcount += nxids;
}
+
}
}
+
+ /*
+ 	 * Also check for unobserved xids. There is no need to test
+ 	 * IsRecoveryProcessingMode() here, since the list is always empty
+ 	 * once normal processing begins, after which the loop costs almost
+ 	 * nothing.
+ */
+ for (index = 0; index < arrayP->numUnobservedXids; index++)
+ {
+ volatile TransactionId *UnobservedXids;
+ TransactionId xid;
+
+ UnobservedXids = (TransactionId *) &(arrayP->procs[arrayP->maxProcs]);
+
+ /* Fetch xid just once - see GetNewTransactionId */
+ xid = UnobservedXids[index];
+
+ 		/*
+ 		 * If there are no more visible xids, we're done. This works
+ 		 * because UnobservedXids is maintained in strict ascending order,
+ 		 * and xids at or beyond xmax are treated as running by
+ 		 * XidInMVCCSnapshot() without needing to appear in the snapshot.
+ 		 */
+ 		if (!TransactionIdIsNormal(xid) || !TransactionIdPrecedes(xid, xmax))
+ 			break;
+
+ /*
+ * Typically, there will be space in the snapshot. We know that the
+ * unobserved xids are being run by one of the procs marked with
+ * an xid of InvalidTransactionId, so we will have ignored that above,
+ * and the xidcache for that proc will have been empty also.
+ *
+ * We put the unobserved xids into the subxid cache. The xid might
+ * be a top-level or it might be a subtransaction, but it won't
+ * change the answer to XidInMVCCSnapshot() whichever it is. That's
+ * just as well, since we don't know which it is, by definition.
+ * The subxid cache gets searched first, so put it there.
+ */
+ snapshot->subxip[subcount++] = xid;
+
+ /*
+ 		 * We don't really need xmin during recovery, but let's derive
+ * it anyway for consistency. It is possible that an unobserved
+ * xid could be xmin if there is contention between long-lived
+ * transactions.
+ */
+ if (TransactionIdPrecedes(xid, xmin))
+ xmin = xid;
}
if (!TransactionIdIsValid(MyProc->xmin))
***************
*** 824,829 **** GetSnapshotData(Snapshot snapshot)
--- 1162,1168 ----
snapshot->xmax = xmax;
snapshot->xcnt = count;
snapshot->subxcnt = subcount;
+ snapshot->suboverflowed = suboverflowed;
snapshot->curcid = GetCurrentCommandId(false);
***************
*** 839,844 **** GetSnapshotData(Snapshot snapshot)
--- 1178,1415 ----
}
/*
+ * GetRunningTransactionData -- returns information about running transactions.
+ *
+ * Similar to GetSnapshotData but returning more information. We include
+ * all PGPROCs with an assigned TransactionId, even VACUUM processes. We
+ * include slotId and databaseId for each PGPROC. We also keep track
+ * of which subtransactions go with each PGPROC, information which is lost
+ * when we GetSnapshotData.
+ *
+  * This is never executed when IsRecoveryProcessingMode() is true, so there
+  * is no need to look at UnobservedXids.
+ *
+ * We don't worry about updating other counters, we want to keep this as
+ * simple as possible and leave GetSnapshotData() as the primary code for
+ * that bookkeeping.
+ */
+ RunningTransactions
+ GetRunningTransactionData(void)
+ {
+ ProcArrayStruct *arrayP = procArray;
+ RunningTransactions CurrentRunningXacts = (RunningTransactions) &CurrentRunningXactsData;
+ RunningXact *rxact;
+ TransactionId *subxip;
+ TransactionId latestRunningXid = InvalidTransactionId;
+ TransactionId prev_latestRunningXid = InvalidTransactionId;
+ TransactionId latestCompletedXid;
+ int numAttempts = 0;
+ int index;
+ int count = 0;
+ int subcount = 0;
+ bool suboverflowed = false;
+
+ /*
+ * Allocating space for maxProcs xids is usually overkill; numProcs would
+ * be sufficient. But it seems better to do the malloc while not holding
+ * the lock, so we can't look at numProcs. Likewise, we allocate much
+ * more subxip storage than is probably needed.
+ *
+ 	 * This is only ever allocated in the bgwriter, since this function
+ 	 * is only executed during checkpoints.
+ */
+ if (CurrentRunningXacts->xrun == NULL)
+ {
+ /*
+ * First call
+ */
+ CurrentRunningXacts->xrun = (RunningXact *)
+ malloc(arrayP->maxProcs * sizeof(RunningXact));
+ if (CurrentRunningXacts->xrun == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory")));
+ Assert(CurrentRunningXacts->subxip == NULL);
+ CurrentRunningXacts->subxip = (TransactionId *)
+ malloc(maxNumSubXids * sizeof(TransactionId));
+ if (CurrentRunningXacts->subxip == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory")));
+ }
+
+ rxact = CurrentRunningXacts->xrun;
+ subxip = CurrentRunningXacts->subxip;
+
+ /*
+ * Loop until we get a valid snapshot. See exit conditions below.
+ */
+ for (;;)
+ {
+ count = 0;
+ subcount = 0;
+ suboverflowed = false;
+
+ LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+ latestCompletedXid = ShmemVariableCache->latestCompletedXid;
+
+ /*
+ 		 * Spin over procArray checking xid and subxids. A shared lock is
+ 		 * enough: new transactions are entered without taking ProcArrayLock
+ 		 * at all, so even LW_EXCLUSIVE would not keep them out.
+ */
+ for (index = 0; index < arrayP->numProcs; index++)
+ {
+ volatile PGPROC *proc = arrayP->procs[index];
+ TransactionId xid;
+ int nxids;
+
+ /* Fetch xid just once - see GetNewTransactionId */
+ xid = proc->xid;
+
+ /*
+ * We store all xids, even XIDs >= xmax and our own XID, if any.
+ * But we don't store transactions that don't have a TransactionId
+ * yet because they will not show as running on a standby server.
+ */
+ if (!TransactionIdIsValid(xid))
+ continue;
+
+ rxact[count].xid = xid;
+ rxact[count].databaseId = proc->databaseId;
+ rxact[count].roleId = proc->roleId;
+ rxact[count].vacuumFlags = proc->vacuumFlags;
+
+ if (TransactionIdPrecedes(latestRunningXid, xid))
+ latestRunningXid = xid;
+
+ /*
+ * Save subtransaction XIDs.
+ *
+ * The other backend can add more subxids concurrently, but cannot
+ * remove any. Hence it's important to fetch nxids just once. Should
+ * be safe to use memcpy, though. (We needn't worry about missing any
+ * xids added concurrently, because they must postdate xmax.)
+ *
+ * Again, our own XIDs *are* included in the snapshot.
+ */
+ nxids = proc->subxids.nxids;
+
+ if (nxids > 0)
+ {
+ TransactionId *subxids = (TransactionId *) proc->subxids.xids;
+
+ rxact[count].subx_offset = subcount;
+
+ memcpy(subxip + subcount,
+ (void *) proc->subxids.xids,
+ nxids * sizeof(TransactionId));
+ subcount += nxids;
+
+ if (proc->subxids.overflowed)
+ {
+ rxact[count].overflowed = true;
+ suboverflowed = true;
+ }
+
+ if (TransactionIdPrecedes(latestRunningXid, subxids[nxids - 1]))
+ latestRunningXid = subxids[nxids - 1];
+ }
+ else
+ {
+ rxact[count].subx_offset = 0;
+ rxact[count].overflowed = false;
+ }
+
+ rxact[count].nsubxids = nxids;
+
+ count++;
+ }
+
+ LWLockRelease(ProcArrayLock);
+
+ /*
+ 		 * If there are no procs with TransactionIds allocated we need to
+ 		 * find what the last assigned xid was. This takes and releases
+ 		 * XidGenLock, but that shouldn't cause contention in this case.
+ 		 * We could also do this if the snapshot overflowed, but in that
+ 		 * case we expect contention on XidGenLock might be high, so we punt.
+ 		 *
+ 		 * By the time we do this, another proc may have incremented the
+ 		 * nextxid, so we must rescan the procarray to check whether
+ 		 * there are either new running transactions or the counter is
+ 		 * the same as before. If transactions appear and disappear
+ 		 * faster than we can do this, we're in trouble, so spin for at
+ 		 * most MAX_SNAPSHOT_ATTEMPTS (currently 3) attempts before giving up.
+ *
+ * We do it this way to avoid needing to grab XidGenLock in all
+ * cases, which is hardly ever actually required.
+ */
+ if (count > 0)
+ break;
+ else
+ {
+ #define MAX_SNAPSHOT_ATTEMPTS 3
+ if (numAttempts >= MAX_SNAPSHOT_ATTEMPTS)
+ {
+ latestRunningXid = InvalidTransactionId;
+ break;
+ }
+
+ latestRunningXid = ReadNewTransactionId();
+ TransactionIdRetreat(latestRunningXid);
+
+ if (prev_latestRunningXid == latestRunningXid)
+ break;
+
+ prev_latestRunningXid = latestRunningXid;
+ numAttempts++;
+ }
+ }
+
+ CurrentRunningXacts->xcnt = count;
+ CurrentRunningXacts->subxcnt = subcount;
+ CurrentRunningXacts->latestCompletedXid = latestCompletedXid;
+ if (!suboverflowed)
+ CurrentRunningXacts->latestRunningXid = latestRunningXid;
+ else
+ CurrentRunningXacts->latestRunningXid = InvalidTransactionId;
+
+ #ifdef RUNNING_XACT_DEBUG
+ elog(trace_recovery(DEBUG3),
+ "logging running xacts xcnt %d subxcnt %d latestCompletedXid %d latestRunningXid %d",
+ CurrentRunningXacts->xcnt,
+ CurrentRunningXacts->subxcnt,
+ CurrentRunningXacts->latestCompletedXid,
+ CurrentRunningXacts->latestRunningXid);
+
+ for (index = 0; index < CurrentRunningXacts->xcnt; index++)
+ {
+ int j;
+ elog(trace_recovery(DEBUG3),
+ "xid %d pid %d backend %d db %d role %d nsubxids %d offset %d vf %u, overflow %s",
+ CurrentRunningXacts->xrun[index].xid,
+ CurrentRunningXacts->xrun[index].pid,
+ CurrentRunningXacts->xrun[index].slotId,
+ CurrentRunningXacts->xrun[index].databaseId,
+ CurrentRunningXacts->xrun[index].roleId,
+ CurrentRunningXacts->xrun[index].nsubxids,
+ CurrentRunningXacts->xrun[index].subx_offset,
+ CurrentRunningXacts->xrun[index].vacuumFlags,
+ CurrentRunningXacts->xrun[index].overflowed ? "t" : "f");
+ for (j = 0; j < CurrentRunningXacts->xrun[index].nsubxids; j++)
+ elog(trace_recovery(DEBUG3),
+ "subxid offset %d j %d xid %d",
+ CurrentRunningXacts->xrun[index].subx_offset, j,
+ CurrentRunningXacts->subxip[j + CurrentRunningXacts->xrun[index].subx_offset]);
+ }
+ #endif
+
+ return CurrentRunningXacts;
+ }
+
+ /*
* GetTransactionsInCommit -- Get the XIDs of transactions that are committing
*
* Constructs an array of XIDs of transactions that are currently in commit
***************
*** 968,973 **** BackendPidGetProc(int pid)
--- 1539,1579 ----
}
/*
+ * BackendXidGetProc -- get a backend's PGPROC given its XID
+ *
+ * Returns NULL if not found. Note that it is up to the caller to be
+ * sure that the question remains meaningful for long enough for the
+ * answer to be used ...
+ */
+ PGPROC *
+ BackendXidGetProc(TransactionId xid)
+ {
+ PGPROC *result = NULL;
+ ProcArrayStruct *arrayP = procArray;
+ int index;
+
+ if (xid == InvalidTransactionId) /* never match invalid xid */
+ 		return NULL;
+
+ LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+ for (index = 0; index < arrayP->numProcs; index++)
+ {
+ PGPROC *proc = arrayP->procs[index];
+
+ if (proc->xid == xid)
+ {
+ result = proc;
+ break;
+ }
+ }
+
+ LWLockRelease(ProcArrayLock);
+
+ return result;
+ }
+
+ /*
* BackendXidGetPid -- get a backend's pid given its XID
*
* Returns 0 if not found or it's a prepared transaction. Note that
***************
*** 1024,1036 **** IsBackendPid(int pid)
* The array is palloc'd and is terminated with an invalid VXID.
*
* If limitXmin is not InvalidTransactionId, we skip any backends
! * with xmin >= limitXmin. If allDbs is false, we skip backends attached
* to other databases. If excludeVacuum isn't zero, we skip processes for
* which (excludeVacuum & vacuumFlags) is not zero. Also, our own process
* is always skipped.
*/
VirtualTransactionId *
! GetCurrentVirtualXIDs(TransactionId limitXmin, bool allDbs, int excludeVacuum)
{
VirtualTransactionId *vxids;
ProcArrayStruct *arrayP = procArray;
--- 1630,1642 ----
* The array is palloc'd and is terminated with an invalid VXID.
*
* If limitXmin is not InvalidTransactionId, we skip any backends
! * with xmin >= limitXmin. If dbOid is valid we skip backends attached
* to other databases. If excludeVacuum isn't zero, we skip processes for
* which (excludeVacuum & vacuumFlags) is not zero. Also, our own process
* is always skipped.
*/
VirtualTransactionId *
! GetCurrentVirtualXIDs(TransactionId limitXmin, Oid dbOid, int excludeVacuum)
{
VirtualTransactionId *vxids;
ProcArrayStruct *arrayP = procArray;
***************
*** 1053,1059 **** GetCurrentVirtualXIDs(TransactionId limitXmin, bool allDbs, int excludeVacuum)
if (excludeVacuum & proc->vacuumFlags)
continue;
! if (allDbs || proc->databaseId == MyDatabaseId)
{
/* Fetch xmin just once - might change on us? */
TransactionId pxmin = proc->xmin;
--- 1659,1665 ----
if (excludeVacuum & proc->vacuumFlags)
continue;
! if (!OidIsValid(dbOid) || proc->databaseId == dbOid)
{
/* Fetch xmin just once - might change on us? */
TransactionId pxmin = proc->xmin;
***************
*** 1083,1088 **** GetCurrentVirtualXIDs(TransactionId limitXmin, bool allDbs, int excludeVacuum)
--- 1689,1725 ----
return vxids;
}
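+ /*
+  * VirtualTransactionIdGetPid -- return the pid of the backend currently
+  * running the given virtual transaction, or 0 if it is not found.
+  */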
+ int
+ VirtualTransactionIdGetPid(VirtualTransactionId vxid)
+ {
+ ProcArrayStruct *arrayP = procArray;
+ int result = 0;
+ int index;
+
+ if (!VirtualTransactionIdIsValid(vxid))
+ return 0;
+
+ LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+ for (index = 0; index < arrayP->numProcs; index++)
+ {
+ VirtualTransactionId procvxid;
+ PGPROC *proc = arrayP->procs[index];
+
+ GET_VXID_FROM_PGPROC(procvxid, *proc);
+
+ if (procvxid.backendId == vxid.backendId &&
+ procvxid.localTransactionId == vxid.localTransactionId)
+ {
+ result = proc->pid;
+ break;
+ }
+ }
+
+ LWLockRelease(ProcArrayLock);
+
+ return result;
+ }
/*
* CountActiveBackends --- count backends (other than myself) that are in
***************
*** 1367,1369 **** DisplayXidCache(void)
--- 2004,2210 ----
}
#endif /* XIDCACHE_DEBUG */
+
+ /* ----------------------------------------------
+ * UnobservedTransactions sub-module
+ * ----------------------------------------------
+ *
+ * All functions must be called holding ProcArrayLock.
+ */
+
+ /*
+ * Add unobserved xids to end of UnobservedXids array
+ */
+ void
+ UnobservedTransactionsAddXids(TransactionId firstXid, TransactionId lastXid)
+ {
+ TransactionId ixid = firstXid;
+ int index = procArray->numUnobservedXids;
+ TransactionId *UnobservedXids;
+
+ UnobservedXids = (TransactionId *) &(procArray->procs[procArray->maxProcs]);
+
+ Assert(TransactionIdIsNormal(firstXid));
+ Assert(TransactionIdIsNormal(lastXid));
+ Assert(TransactionIdPrecedes(firstXid, lastXid));
+
+ /*
+ 	 * UnobservedXids is maintained as an ascending list of xids, with no gaps.
+ * Incoming xids are always higher than previous entries, so we just add
+ * them directly to the end of the array.
+ */
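+ 	/*
+ 	 * Note that lastXid itself is not added: e.g. given firstXid = 8 and
+ 	 * lastXid = 11 we append 8, 9 and 10; lastXid is the xid that has
+ 	 * actually been observed.
+ 	 */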
+ while (ixid != lastXid)
+ {
+ /*
+ * check to see if we have space to store more UnobservedXids
+ */
+ if (index >= procArray->maxUnobservedXids)
+ {
+ UnobservedTransactionsDisplay(WARNING);
+ 			elog(FATAL, "no more room in UnobservedXids array");
+ }
+
+ /*
+ * append ixid to UnobservedXids
+ */
+ #ifdef USE_ASSERT_CHECKING
+ 		if (TransactionIdIsValid(UnobservedXids[index]) ||
+ 			(index > 0 && !TransactionIdPrecedes(UnobservedXids[index - 1], ixid)))
+ UnobservedTransactionsDisplay(LOG);
+ #endif
+
+ elog(trace_recovery(DEBUG4), "Adding UnobservedXid %d", ixid);
+ UnobservedXids[index] = ixid;
+ index++;
+
+ TransactionIdAdvance(ixid);
+ }
+
+ procArray->numUnobservedXids = index;
+ }
+
+ /*
+ * Remove one unobserved xid from anywhere on UnobservedXids array.
+ * If xid has already been pruned away, no need to report as missing.
+ */
+ void
+ UnobservedTransactionsRemoveXid(TransactionId xid, bool missing_is_error)
+ {
+ int index;
+ bool found = false;
+ TransactionId *UnobservedXids;
+
+ UnobservedXids = (TransactionId *) &(procArray->procs[procArray->maxProcs]);
+
+ elog(trace_recovery(DEBUG4), "Remove UnobservedXid = %d", xid);
+
+ /*
+ 	 * If we haven't initialised the array yet, or if the xid precedes
+ 	 * everything still in it (i.e. it has already been pruned away),
+ 	 * just return. Beyond this point a missing xid is an ERROR when
+ 	 * missing_is_error is set.
+ */
+ if (procArray->numUnobservedXids == 0 ||
+ (procArray->numUnobservedXids > 0 &&
+ TransactionIdPrecedes(xid, UnobservedXids[0])))
+ return;
+
+ /*
+ * XXX we could use bsearch, if this has significant overhead.
+ */
+ for (index = 0; index < procArray->numUnobservedXids; index++)
+ {
+ if (!found)
+ {
+ if (UnobservedXids[index] == xid)
+ found = true;
+ }
+ else
+ {
+ UnobservedXids[index - 1] = UnobservedXids[index];
+ }
+ }
+
+ if (found)
+ UnobservedXids[--procArray->numUnobservedXids] = InvalidTransactionId;
+
+ if (!found && missing_is_error)
+ {
+ UnobservedTransactionsDisplay(LOG);
+ elog(ERROR, "could not remove unobserved xid = %d", xid);
+ }
+ }
+
+ /*
+ * Prune array up to a particular limit. This frequently means clearing the
+ * whole array, but we don't attempt to optimise for that at present.
+ */
+ void
+ UnobservedTransactionsPruneXids(TransactionId limitXid)
+ {
+ int index;
+ int pruneUpToThisIndex = 0;
+ TransactionId *UnobservedXids;
+
+ UnobservedXids = (TransactionId *) &(procArray->procs[procArray->maxProcs]);
+
+ elog(trace_recovery(DEBUG4), "prune UnobservedXids up to %d", limitXid);
+
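+ 	/*
+ 	 * The array is ascending, so we count how many leading entries are
+ 	 * <= limitXid and shift the survivors down: e.g. with entries
+ 	 * {5, 7, 9, 12} and limitXid = 9 we remove 5, 7 and 9, moving 12
+ 	 * to slot zero.
+ 	 */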
+ for (index = 0; index < procArray->numUnobservedXids; index++)
+ {
+ if (TransactionIdFollowsOrEquals(limitXid, UnobservedXids[index]))
+ pruneUpToThisIndex = index + 1;
+ else
+ {
+ /*
+ * Anything to delete?
+ */
+ if (pruneUpToThisIndex == 0)
+ return;
+
+ /*
+ * Move unpruned values to start of array
+ */
+ UnobservedXids[index - pruneUpToThisIndex] = UnobservedXids[index];
+ UnobservedXids[index] = 0;
+ }
+ }
+
+ procArray->numUnobservedXids -= pruneUpToThisIndex;
+ }
+
+ /*
+ * Clear the whole array.
+ */
+ void
+ UnobservedTransactionsClearXids(void)
+ {
+ int index;
+ TransactionId *UnobservedXids;
+
+ elog(trace_recovery(DEBUG4), "Clear UnobservedXids");
+
+ UnobservedXids = (TransactionId *) &(procArray->procs[procArray->maxProcs]);
+
+ /*
+ 	 * UnobservedTransactionsAddXids() asserts that the array is empty
+ 	 * when we add new values, so it must be zeroed here each time.
+ */
+ for (index = 0; index < procArray->numUnobservedXids; index++)
+ {
+ UnobservedXids[index] = 0;
+ }
+
+ procArray->numUnobservedXids = 0;
+ }
+
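+ /*
+  * Print the current contents of the UnobservedXids array at the given
+  * trace level, for debugging.
+  */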
+ void
+ UnobservedTransactionsDisplay(int trace_level)
+ {
+ int index;
+ TransactionId *UnobservedXids;
+
+ UnobservedXids = (TransactionId *) &(procArray->procs[procArray->maxProcs]);
+
+ for (index = 0; index < procArray->maxUnobservedXids; index++)
+ {
+ if (TransactionIdIsValid(UnobservedXids[index]))
+ elog(trace_level, "%d unobserved[%d] = %d ",
+ procArray->numUnobservedXids, index, UnobservedXids[index]);
+ }
+ }
+
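+ /*
+  * Is the given xid present in the UnobservedXids array? Like the other
+  * functions in this sub-module, caller must hold ProcArrayLock.
+  */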
+ bool
+ XidInUnobservedTransactions(TransactionId xid)
+ {
+ int index;
+ TransactionId *UnobservedXids;
+
+ UnobservedXids = (TransactionId *) &(procArray->procs[procArray->maxProcs]);
+
+ for (index = 0; index < procArray->numUnobservedXids; index++)
+ {
+ if (UnobservedXids[index] == xid)
+ return true;
+ }
+ return false;
+ }
*** src/backend/storage/ipc/sinvaladt.c
--- src/backend/storage/ipc/sinvaladt.c
***************
*** 142,147 **** typedef struct ProcState
--- 142,148 ----
int nextMsgNum; /* next message number to read */
bool resetState; /* backend needs to reset its state */
bool signaled; /* backend has been sent catchup signal */
+ bool sendOnly; /* backend only sends, never receives */
/*
* Next LocalTransactionId to use for each idle backend slot. We keep
***************
*** 248,254 **** CreateSharedInvalidationState(void)
* Initialize a new backend to operate on the sinval buffer
*/
void
! SharedInvalBackendInit(void)
{
int index;
ProcState *stateP = NULL;
--- 249,255 ----
* Initialize a new backend to operate on the sinval buffer
*/
void
! SharedInvalBackendInit(bool sendOnly)
{
int index;
ProcState *stateP = NULL;
***************
*** 307,312 **** SharedInvalBackendInit(void)
--- 308,314 ----
stateP->nextMsgNum = segP->maxMsgNum;
stateP->resetState = false;
stateP->signaled = false;
+ stateP->sendOnly = sendOnly;
LWLockRelease(SInvalWriteLock);
***************
*** 578,584 **** SICleanupQueue(bool callerHasWriteLock, int minFree)
/*
* Recompute minMsgNum = minimum of all backends' nextMsgNum, identify
* the furthest-back backend that needs signaling (if any), and reset
! * any backends that are too far back.
*/
min = segP->maxMsgNum;
minsig = min - SIG_THRESHOLD;
--- 580,588 ----
/*
* Recompute minMsgNum = minimum of all backends' nextMsgNum, identify
* the furthest-back backend that needs signaling (if any), and reset
! * any backends that are too far back. Note that because we ignore
! * sendOnly backends here it is possible for them to keep sending
! * messages without a problem even when they are the only active backend.
*/
min = segP->maxMsgNum;
minsig = min - SIG_THRESHOLD;
***************
*** 590,596 **** SICleanupQueue(bool callerHasWriteLock, int minFree)
int n = stateP->nextMsgNum;
/* Ignore if inactive or already in reset state */
! if (stateP->procPid == 0 || stateP->resetState)
continue;
/*
--- 594,600 ----
int n = stateP->nextMsgNum;
/* Ignore if inactive or already in reset state */
! if (stateP->procPid == 0 || stateP->resetState || stateP->sendOnly)
continue;
/*
*** src/backend/storage/lmgr/lock.c
--- src/backend/storage/lmgr/lock.c
***************
*** 35,43 ****
--- 35,45 ----
#include "access/transam.h"
#include "access/twophase.h"
#include "access/twophase_rmgr.h"
+ #include "access/xact.h"
#include "miscadmin.h"
#include "pg_trace.h"
#include "pgstat.h"
+ #include "storage/sinval.h"
#include "utils/memutils.h"
#include "utils/ps_status.h"
#include "utils/resowner.h"
***************
*** 490,495 **** LockAcquire(const LOCKTAG *locktag,
--- 492,506 ----
if (lockmode <= 0 || lockmode > lockMethodTable->numLockModes)
elog(ERROR, "unrecognized lock mode: %d", lockmode);
+ if (IsRecoveryProcessingMode() &&
+ locktag->locktag_type == LOCKTAG_OBJECT &&
+ lockmode > AccessShareLock)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("cannot acquire lockmode %s on database objects while recovery is in progress",
+ lockMethodTable->lockModeNames[lockmode]),
+ errhint("Only AccessShareLock can be acquired on database objects during recovery.")));
+
#ifdef LOCK_DEBUG
if (LOCK_DEBUG_ENABLED(locktag))
elog(LOG, "LockAcquire: lock [%u,%u] %s",
***************
*** 817,822 **** LockAcquire(const LOCKTAG *locktag,
--- 828,881 ----
LWLockRelease(partitionLock);
+ /*
+ * We made it all the way here. We've got the lock and we've got
+ * it for the first time in this transaction. So now it's time
+ * to send a WAL message so that standby servers can see this event,
+ 	 * if it's an AccessExclusiveLock on a relation.
+ */
+ if (!InRecovery && lockmode >= AccessExclusiveLock &&
+ locktag->locktag_type == LOCKTAG_RELATION)
+ {
+ XLogRecData rdata;
+ xl_rel_lock xlrec;
+ TransactionId xid;
+
+ /*
+ * First thing we do is ensure that a TransactionId has been
+ * assigned to this transaction. We don't actually need the xid
+ * but if we don't do this then RecordTransactionCommit() and
+ * RecordTransactionAbort() will optimise away the transaction
+ * completion record which recovery relies upon to release locks.
+ 		 * It's a hack, but the corner case is not worth adding code to
+ 		 * the main commit path for.
+ */
+ xid = GetTopTransactionId();
+ Assert(TransactionIdIsValid(xid));
+
+ Assert(OidIsValid(locktag->locktag_field2));
+
+ START_CRIT_SECTION();
+
+ /*
+ * Decode the locktag back to the original values, to avoid
+ * sending lots of empty bytes with every message. See
+ 		 * lock.h to check how a locktag is defined for LOCKTAG_RELATION.
+ */
+ xlrec.xid = xid;
+ xlrec.dbOid = locktag->locktag_field1;
+ xlrec.relOid = locktag->locktag_field2;
+
+ rdata.data = (char *) (&xlrec);
+ rdata.len = sizeof(xl_rel_lock);
+ rdata.buffer = InvalidBuffer;
+ rdata.next = NULL;
+
+ (void) XLogInsert(RM_RELATION_ID, XLOG_RELATION_LOCK, &rdata);
+
+ END_CRIT_SECTION();
+ }
+
return LOCKACQUIRE_OK;
}
*** src/backend/storage/lmgr/proc.c
--- src/backend/storage/lmgr/proc.c
***************
*** 103,108 **** ProcGlobalShmemSize(void)
--- 103,110 ----
size = add_size(size, mul_size(NUM_AUXILIARY_PROCS, sizeof(PGPROC)));
/* MyProcs, including autovacuum */
size = add_size(size, mul_size(MaxBackends, sizeof(PGPROC)));
+ /* RecoveryProcs, including recovery actions by autovacuum */
+ size = add_size(size, mul_size(MaxBackends, sizeof(PGPROC)));
/* ProcStructLock */
size = add_size(size, sizeof(slock_t));
***************
*** 204,209 **** InitProcGlobal(void)
--- 206,230 ----
ProcGlobal->autovacFreeProcs = &procs[i];
}
+ /*
+ * Create enough Recovery Procs so there is a shadow proc for every
+ * normal proc. Recovery procs don't need semaphores because they
+ * aren't actually performing any work; they are just ghosts with
+ * enough substance to look real to anyone requesting a snapshot
+ * from the procarray.
+ */
+ procs = (PGPROC *) ShmemAlloc((MaxBackends) * sizeof(PGPROC));
+ if (!procs)
+ ereport(FATAL,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of shared memory")));
+ MemSet(procs, 0, MaxBackends * sizeof(PGPROC));
+ for (i = 0; i < MaxBackends; i++)
+ {
+ procs[i].links.next = (SHM_QUEUE *) ProcGlobal->freeProcs;
+ ProcGlobal->freeProcs = &procs[i];
+ }
+
MemSet(AuxiliaryProcs, 0, NUM_AUXILIARY_PROCS * sizeof(PGPROC));
for (i = 0; i < NUM_AUXILIARY_PROCS; i++)
{
***************
*** 277,282 **** InitProcess(void)
--- 298,304 ----
/*
* Initialize all fields of MyProc, except for the semaphore which was
* prepared for us by InitProcGlobal.
+ * Recovery snapshot processing relies completely on this never changing.
*/
SHMQueueElemInit(&(MyProc->links));
MyProc->waitStatus = STATUS_OK;
***************
*** 319,324 **** InitProcess(void)
--- 341,440 ----
InitDeadLockChecking();
}
+ void
+ FreeRecoveryProcess(PGPROC *proc)
+ {
+ volatile PROC_HDR *procglobal = ProcGlobal;
+
+ SpinLockAcquire(ProcStructLock);
+
+ /* Return struct to freelist */
+ proc->links.next = (SHM_QUEUE *) procglobal->freeProcs;
+ procglobal->freeProcs = proc;
+
+ SpinLockRelease(ProcStructLock);
+ }
+
+ /*
+ * InitRecoveryProcess -- initialize a per-master process data structure
+ * for use when emulating transactions in recovery
+ */
+ PGPROC *
+ InitRecoveryProcess(void)
+ {
+ /* use volatile pointer to prevent code rearrangement */
+ volatile PROC_HDR *procglobal = ProcGlobal;
+ PGPROC *ThisProc = NULL;
+
+ /*
+ * ProcGlobal should be set up already (if we are a backend, we inherit
+ * this by fork() or EXEC_BACKEND mechanism from the postmaster).
+ */
+ if (procglobal == NULL)
+ elog(PANIC, "proc header uninitialized");
+
+ /*
+ * Try to get a proc struct from the free list. If this fails, we must be
+ * out of PGPROC structures (not to mention semaphores).
+ */
+ SpinLockAcquire(ProcStructLock);
+
+ ThisProc = procglobal->freeProcs;
+
+ if (ThisProc != NULL)
+ {
+ procglobal->freeProcs = (PGPROC *) ThisProc->links.next;
+ SpinLockRelease(ProcStructLock);
+ }
+ else
+ {
+ /*
+ * Should never reach here if shared memory is allocated correctly.
+ */
+ SpinLockRelease(ProcStructLock);
+ elog(FATAL, "too many procs - could not create recovery proc");
+ }
+
+ /*
+ * xid will be set later as WAL records arrive for this recovery proc
+ */
+ ThisProc->xid = InvalidTransactionId;
+
+ /*
+ * The backendid of the recovery proc stays at InvalidBackendId. There
+ * is a direct 1:1 correspondence between a master backendid and this
+ * proc, but that same backendid may also be in use during recovery,
+ * so if we set this field we would have duplicate backendids.
+ */
+ ThisProc->backendId = InvalidBackendId;
+
+ /*
+ * The following are not used in recovery
+ */
+ ThisProc->pid = 0;
+
+ SHMQueueElemInit(&(ThisProc->links));
+ ThisProc->waitStatus = STATUS_OK;
+ ThisProc->lxid = InvalidLocalTransactionId;
+ ThisProc->xmin = InvalidTransactionId;
+ ThisProc->databaseId = InvalidOid;
+ ThisProc->roleId = InvalidOid;
+ ThisProc->inCommit = false;
+ ThisProc->vacuumFlags = 0;
+ ThisProc->lwWaiting = false;
+ ThisProc->lwExclusive = false;
+ ThisProc->lwWaitLink = NULL;
+ ThisProc->waitLock = NULL;
+ ThisProc->waitProcLock = NULL;
+
+ /*
+ * There is little else to do. The recovery proc is never used to
+ * acquire buffers, nor will we ever acquire LWlocks using the proc.
+ * Deadlock checker is not active during recovery.
+ */
+ return ThisProc;
+ }
+
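For illustration, the lifecycle of a recovery proc looks like this (a sketch
with assumed variable names; the real call sites are in the recovery snapshot
code elsewhere in this patch):

    PGPROC *shadow = InitRecoveryProcess();

    /* publish an xid observed in WAL so snapshots see it as running */
    shadow->xid = xid_from_wal;         /* xid_from_wal: assumed variable */

    /* ... replay continues until the matching commit/abort arrives ... */

    shadow->xid = InvalidTransactionId;
    FreeRecoveryProcess(shadow);        /* back onto the freelist */
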
/*
* InitProcessPhase2 -- make MyProc visible in the shared ProcArray.
*
***************
*** 363,368 **** InitProcessPhase2(void)
--- 479,489 ----
* to the ProcArray or the sinval messaging mechanism, either. They also
* don't get a VXID assigned, since this is only useful when we actually
* hold lockmgr locks.
+ *
+ * Startup process however uses locks but never waits for them in the
+ * normal backend sense. Startup process also takes part in sinval messaging
+ * as a sendOnly process, so never reads messages from sinval queue. So
+ * Startup process does have a VXID and does show up in pg_locks.
*/
void
InitAuxiliaryProcess(void)
***************
*** 452,457 **** InitAuxiliaryProcess(void)
--- 573,595 ----
}
/*
+ * Additional initialisation for the Startup process
+ */
+ void
+ PublishStartupProcessInformation(void)
+ {
+ /* use volatile pointer to prevent code rearrangement */
+ volatile PROC_HDR *procglobal = ProcGlobal;
+
+ SpinLockAcquire(ProcStructLock);
+
+ procglobal->startupProc = MyProc;
+ procglobal->startupProcPid = MyProcPid;
+
+ SpinLockRelease(ProcStructLock);
+ }
+
+ /*
* Check whether there are at least N free PGPROC objects.
*
* Note: this is designed on the assumption that N will generally be small.
***************
*** 1271,1277 **** ProcWaitForSignal(void)
void
ProcSendSignal(int pid)
{
! PGPROC *proc = BackendPidGetProc(pid);
if (proc != NULL)
PGSemaphoreUnlock(&proc->sem);
--- 1409,1438 ----
void
ProcSendSignal(int pid)
{
! PGPROC *proc = NULL;
!
! /*
! * Check to see whether it is the Startup process we wish to signal.
! * We could initialise this elsewhere, but then have a function in
! * proc.c calling a function in procarray.c calling a function in
! * proc.c, which is more confusing and error-prone than just putting
! * this code where it's needed.
! */
! if (IsRecoveryProcessingMode())
! {
! /* use volatile pointer to prevent code rearrangement */
! volatile PROC_HDR *procglobal = ProcGlobal;
!
! SpinLockAcquire(ProcStructLock);
!
! if (pid == procglobal->startupProcPid)
! proc = procglobal->startupProc;
!
! SpinLockRelease(ProcStructLock);
! }
!
! if (proc == NULL)
! proc = BackendPidGetProc(pid);
if (proc != NULL)
PGSemaphoreUnlock(&proc->sem);
*** src/backend/tcop/utility.c
--- src/backend/tcop/utility.c
***************
*** 287,296 **** ProcessUtility(Node *parsetree,
--- 287,308 ----
SetPGVariable("transaction_isolation",
list_make1(item->arg),
true);
+
else if (strcmp(item->defname, "transaction_read_only") == 0)
+ {
+ A_Const *con;
+
+ Assert(IsA(item->arg, A_Const));
+ con = (A_Const *) item->arg;
+ Assert(nodeTag(&con->val) == T_Integer);
+
+ if (!intVal(&con->val))
+ PreventCommandDuringRecovery();
+
SetPGVariable("transaction_read_only",
list_make1(item->arg),
true);
+ }
}
}
break;
***************
*** 305,310 **** ProcessUtility(Node *parsetree,
--- 317,323 ----
break;
case TRANS_STMT_PREPARE:
+ PreventCommandDuringRecovery();
if (!PrepareTransactionBlock(stmt->gid))
{
/* report unsuccessful commit in completionTag */
***************
*** 314,324 **** ProcessUtility(Node *parsetree,
--- 327,339 ----
break;
case TRANS_STMT_COMMIT_PREPARED:
+ PreventCommandDuringRecovery();
PreventTransactionChain(isTopLevel, "COMMIT PREPARED");
FinishPreparedTransaction(stmt->gid, true);
break;
case TRANS_STMT_ROLLBACK_PREPARED:
+ PreventCommandDuringRecovery();
PreventTransactionChain(isTopLevel, "ROLLBACK PREPARED");
FinishPreparedTransaction(stmt->gid, false);
break;
***************
*** 676,681 **** ProcessUtility(Node *parsetree,
--- 691,697 ----
break;
case T_GrantStmt:
+ PreventCommandDuringRecovery();
ExecuteGrantStmt((GrantStmt *) parsetree);
break;
***************
*** 846,851 **** ProcessUtility(Node *parsetree,
--- 862,868 ----
case T_NotifyStmt:
{
NotifyStmt *stmt = (NotifyStmt *) parsetree;
+ PreventCommandDuringRecovery();
Async_Notify(stmt->conditionname);
}
***************
*** 854,859 **** ProcessUtility(Node *parsetree,
--- 871,877 ----
case T_ListenStmt:
{
ListenStmt *stmt = (ListenStmt *) parsetree;
+ PreventCommandDuringRecovery();
Async_Listen(stmt->conditionname);
}
***************
*** 862,867 **** ProcessUtility(Node *parsetree,
--- 880,886 ----
case T_UnlistenStmt:
{
UnlistenStmt *stmt = (UnlistenStmt *) parsetree;
+ PreventCommandDuringRecovery();
if (stmt->conditionname)
Async_Unlisten(stmt->conditionname);
***************
*** 881,890 **** ProcessUtility(Node *parsetree,
--- 900,911 ----
break;
case T_ClusterStmt:
+ PreventCommandDuringRecovery();
cluster((ClusterStmt *) parsetree, isTopLevel);
break;
case T_VacuumStmt:
+ PreventCommandDuringRecovery();
vacuum((VacuumStmt *) parsetree, InvalidOid, true, NULL, false,
isTopLevel);
break;
***************
*** 1000,1011 **** ProcessUtility(Node *parsetree,
--- 1021,1034 ----
ereport(ERROR,
(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
errmsg("must be superuser to do CHECKPOINT")));
+ PreventCommandDuringRecovery();
RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_FORCE | CHECKPOINT_WAIT);
break;
case T_ReindexStmt:
{
ReindexStmt *stmt = (ReindexStmt *) parsetree;
+ PreventCommandDuringRecovery();
switch (stmt->kind)
{
***************
*** 2490,2492 **** GetCommandLogLevel(Node *parsetree)
--- 2513,2524 ----
return lev;
}
+
+ void
+ PreventCommandDuringRecovery(void)
+ {
+ if (IsRecoveryProcessingMode())
+ ereport(ERROR,
+ (errcode(ERRCODE_READ_ONLY_SQL_TRANSACTION),
+ errmsg("cannot be run until recovery completes")));
+ }
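The guard is deliberately trivial so it can be dropped into any command path.
As a sketch, a hypothetical new write-path case in ProcessUtility() would
follow the same pattern as the cases above (T_MyNewWriteStmt and
ExecMyNewWriteStmt are illustrative names, not part of this patch):

    case T_MyNewWriteStmt:      /* hypothetical statement type */
        PreventCommandDuringRecovery();     /* ereport(ERROR) on a standby */
        ExecMyNewWriteStmt((MyNewWriteStmt *) parsetree);
        break;
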
*** src/backend/utils/adt/txid.c
--- src/backend/utils/adt/txid.c
***************
*** 338,343 **** txid_current(PG_FUNCTION_ARGS)
--- 338,349 ----
txid val;
TxidEpoch state;
+ if (IsRecoveryProcessingMode())
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("cannot assign txid while recovery is in progress"),
+ errhint("only read only queries can execute during recovery")));
+
load_xid_epoch(&state);
val = convert_xid(GetTopTransactionId(), &state);
*** src/backend/utils/cache/inval.c
--- src/backend/utils/cache/inval.c
***************
*** 86,95 ****
--- 86,100 ----
*/
#include "postgres.h"
+ #include <signal.h>
+
+ #include "access/transam.h"
#include "access/twophase_rmgr.h"
#include "access/xact.h"
#include "catalog/catalog.h"
#include "miscadmin.h"
+ #include "storage/lmgr.h"
+ #include "storage/procarray.h"
#include "storage/sinval.h"
#include "storage/smgr.h"
#include "utils/inval.h"
***************
*** 155,160 **** typedef struct TransInvalidationInfo
--- 160,173 ----
static TransInvalidationInfo *transInvalInfo = NULL;
+ static SharedInvalidationMessage *SharedInvalidMessagesArray;
+ static int numSharedInvalidMessagesArray;
+ static int maxSharedInvalidMessagesArray;
+
+ static List *RecoveryLockList;
+ static MemoryContext RelationLockContext;
+
+
/*
* Dynamically-registered callback functions. Current implementation
* assumes there won't be very many of these at once; could improve if needed.
***************
*** 741,746 **** AtStart_Inval(void)
--- 754,761 ----
MemoryContextAllocZero(TopTransactionContext,
sizeof(TransInvalidationInfo));
transInvalInfo->my_level = GetCurrentTransactionNestLevel();
+ SharedInvalidMessagesArray = NULL;
+ numSharedInvalidMessagesArray = 0;
}
/*
***************
*** 851,856 **** inval_twophase_postcommit(TransactionId xid, uint16 info,
--- 866,991 ----
}
}
+ static void
+ MakeSharedInvalidMessagesArray(const SharedInvalidationMessage *msgs, int n)
+ {
+ /*
+ * Initialise array first time through in each commit
+ */
+ if (SharedInvalidMessagesArray == NULL)
+ {
+ maxSharedInvalidMessagesArray = FIRSTCHUNKSIZE;
+ numSharedInvalidMessagesArray = 0;
+
+ /*
+ * Although this is being palloc'd we don't actually free it directly.
+ * We're so close to EOXact that we know we're going to lose it anyhow.
+ */
+ SharedInvalidMessagesArray = palloc(maxSharedInvalidMessagesArray
+ * sizeof(SharedInvalidationMessage));
+ }
+
+ if ((numSharedInvalidMessagesArray + n) > maxSharedInvalidMessagesArray)
+ {
+ while ((numSharedInvalidMessagesArray + n) > maxSharedInvalidMessagesArray)
+ maxSharedInvalidMessagesArray *= 2;
+
+ SharedInvalidMessagesArray = repalloc(SharedInvalidMessagesArray,
+ maxSharedInvalidMessagesArray
+ * sizeof(SharedInvalidationMessage));
+ }
+
+ /*
+ * Append the next chunk onto the array
+ */
+ memcpy(SharedInvalidMessagesArray + numSharedInvalidMessagesArray,
+ msgs, n * sizeof(SharedInvalidationMessage));
+ numSharedInvalidMessagesArray += n;
+ }
+
+ /*
+ * xactGetCommittedInvalidationMessages() is executed by
+ * RecordTransactionCommit() to add invalidation messages onto the
+ * commit record. This applies only to commit message types, never to
+ * abort records. Must always run before AtEOXact_Inval(), since that
+ * removes the data we need to see.
+ *
+ * Remember that this runs before we have officially committed, so we
+ * must not do anything here to change what might occur *if* we should
+ * fail between here and the actual commit.
+ *
+ * Note that transactional invalidation does *not* write an invalidation
+ * WAL message using XLOG_RELATION_INVAL messages. Those are only used
+ * by non-transactional invalidation. See comments in
+ * EndNonTransactionalInvalidation().
+ *
+ * see also xact_redo_commit() and xact_desc_commit()
+ */
+ int
+ xactGetCommittedInvalidationMessages(SharedInvalidationMessage **msgs,
+ bool *RelcacheInitFileInval)
+ {
+ MemoryContext oldcontext;
+
+ /* Must be at top of stack */
+ Assert(transInvalInfo != NULL && transInvalInfo->parent == NULL);
+
+ /*
+ * Relcache init file invalidation requires processing both before and
+ * after we send the SI messages. However, we need not do anything
+ * unless we committed.
+ */
+ if (transInvalInfo->RelcacheInitFileInval)
+ *RelcacheInitFileInval = true;
+ else
+ *RelcacheInitFileInval = false;
+
+ /*
+ * Walk through TransInvalidationInfo to collect all the messages
+ * into a single contiguous array of invalidation messages. It must
+ * be contiguous so we can copy directly into WAL message. Maintain the
+ * order that they would be processed in by AtEOXact_Inval(), to ensure
+ * emulated behaviour in redo is as similar as possible to original.
+ * We want the same bugs, if any, not new ones.
+ */
+ oldcontext = MemoryContextSwitchTo(CurTransactionContext);
+
+ ProcessInvalidationMessagesMulti(&transInvalInfo->CurrentCmdInvalidMsgs,
+ MakeSharedInvalidMessagesArray);
+ ProcessInvalidationMessagesMulti(&transInvalInfo->PriorCmdInvalidMsgs,
+ MakeSharedInvalidMessagesArray);
+ MemoryContextSwitchTo(oldcontext);
+
+ #ifdef STANDBY_INVAL_DEBUG
+ if (numSharedInvalidMessagesArray > 0)
+ {
+ int i;
+
+ elog(LOG, "numSharedInvalidMessagesArray = %d", numSharedInvalidMessagesArray);
+
+ Assert(SharedInvalidMessagesArray != NULL);
+
+ for (i = 0; i < numSharedInvalidMessagesArray; i++)
+ {
+ SharedInvalidationMessage *msg = SharedInvalidMessagesArray + i;
+
+ if (msg->id >= 0)
+ elog(LOG, "catcache id %d", msg->id);
+ else if (msg->id == SHAREDINVALRELCACHE_ID)
+ elog(LOG, "relcache id %d", msg->id);
+ else if (msg->id == SHAREDINVALSMGR_ID)
+ elog(LOG, "smgr cache id %d", msg->id);
+ }
+ }
+ #endif
+
+ if (numSharedInvalidMessagesArray > 0)
+ Assert(SharedInvalidMessagesArray != NULL);
+
+ *msgs = SharedInvalidMessagesArray;
+
+ return numSharedInvalidMessagesArray;
+ }
/*
* AtEOXact_Inval
***************
*** 1041,1046 **** BeginNonTransactionalInvalidation(void)
--- 1176,1217 ----
Assert(transInvalInfo->CurrentCmdInvalidMsgs.cclist == NULL);
Assert(transInvalInfo->CurrentCmdInvalidMsgs.rclist == NULL);
Assert(transInvalInfo->RelcacheInitFileInval == false);
+
+ SharedInvalidMessagesArray = NULL;
+ numSharedInvalidMessagesArray = 0;
+ }
+
+ /*
+ * General function to log the SharedInvalidMessagesArray. Only current
+ * caller is EndNonTransactionalInvalidation(), but that may change.
+ */
+ static void
+ LogSharedInvalidMessagesArray(void)
+ {
+ XLogRecData rdata[2];
+ xl_rel_inval xlrec;
+
+ if (numSharedInvalidMessagesArray == 0)
+ return;
+
+ START_CRIT_SECTION();
+
+ xlrec.nmsgs = numSharedInvalidMessagesArray;
+
+ rdata[0].data = (char *) (&xlrec);
+ rdata[0].len = MinSizeOfRelationInval;
+ rdata[0].buffer = InvalidBuffer;
+
+ rdata[0].next = &(rdata[1]);
+ rdata[1].data = (char *) SharedInvalidMessagesArray;
+ rdata[1].len = numSharedInvalidMessagesArray *
+ sizeof(SharedInvalidationMessage);
+ rdata[1].buffer = InvalidBuffer;
+ rdata[1].next = NULL;
+
+ (void) XLogInsert(RM_RELATION_ID, XLOG_RELATION_INVAL, rdata);
+
+ END_CRIT_SECTION();
}
/*
***************
*** 1081,1087 **** EndNonTransactionalInvalidation(void)
--- 1252,1278 ----
ProcessInvalidationMessagesMulti(&transInvalInfo->CurrentCmdInvalidMsgs,
SendSharedInvalidMessages);
+ /*
+ * Write invalidation messages to WAL. This is not required for
+ * crash recovery; it is only required for standby servers. The
+ * overhead is fairly low, so we do it unconditionally. It allows
+ * us to trigger inval messages on the standby as soon as we see
+ * these records. See relation_redo_inval().
+ *
+ * Note that transactional invalidation uses an array attached to
+ * a WAL commit record, so these messages are rare.
+ */
+ ProcessInvalidationMessagesMulti(&transInvalInfo->CurrentCmdInvalidMsgs,
+ MakeSharedInvalidMessagesArray);
+ LogSharedInvalidMessagesArray();
+
/* Clean up and release memory */
+
+ /* XXX: some questions and thoughts here:
+ * not sure where/how to allocate memory correctly in this case
+ * and how to free it afterwards. Think some more on this.
+ */
+
for (chunk = transInvalInfo->CurrentCmdInvalidMsgs.cclist;
chunk != NULL;
chunk = next)
***************
*** 1235,1237 **** CacheRegisterRelcacheCallback(RelcacheCallbackFunction func,
--- 1426,1808 ----
++relcache_callback_count;
}
+
+ /*
+ * -----------------------------------------------------
+ * Standby wait timers and backend cancel logic
+ * -----------------------------------------------------
+ */
+
+ static void
+ InitStandbyDelayTimers(int *currentDelay_ms, int *standbyWait_ms)
+ {
+ *currentDelay_ms = GetLatestReplicationDelay();
+
+ /*
+ * If replication delay is enormously huge (it comes back as a
+ * negative value), just treat it as zero and work up from there.
+ * This prevents us from acting foolishly when replaying old log files.
+ */
+ if (*currentDelay_ms < 0)
+ *currentDelay_ms = 0;
+
+ #define STANDBY_INITIAL_WAIT_MS 1
+ *standbyWait_ms = STANDBY_INITIAL_WAIT_MS;
+ }
+
+ /*
+ * Standby wait logic for ResolveRecoveryConflictWithVirtualXIDs().
+ * We wait here for a while then return. If we decide we can't wait any
+ * more then we return true; if we can wait some more, we return false.
+ */
+ static bool
+ WaitExceedsMaxStandbyDelay(int *currentDelay_ms, int *standbyWait_ms)
+ {
+ int maxStandbyDelay_ms = maxStandbyDelay * 1000;
+
+ /*
+ * If the server is already further behind than we would
+ * like then no need to wait or do more complex logic.
+ * max_standby_delay < 0 means wait forever, if necessary.
+ */
+ if (maxStandbyDelay >= 0 &&
+ *currentDelay_ms > maxStandbyDelay_ms)
+ return true;
+
+ /*
+ * Sleep, then do bookkeeping.
+ */
+ pg_usleep(*standbyWait_ms * 1000L);
+ *currentDelay_ms += *standbyWait_ms;
+
+ /*
+ * Progressively increase the sleep times.
+ */
+ *standbyWait_ms *= 2;
+ if (*standbyWait_ms > 1000)
+ *standbyWait_ms = 1000;
+
+ /*
+ * Re-test our exit criteria
+ */
+ if (maxStandbyDelay >= 0 &&
+ *currentDelay_ms > maxStandbyDelay_ms)
+ return true;
+
+ return false;
+ }
+
+ void
+ ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist,
+ char *reason)
+ {
+ int standbyWait_ms;
+ int currentDelay_ms;
+ bool logged;
+ int wontDieWait = 1;
+
+ InitStandbyDelayTimers(&currentDelay_ms, &standbyWait_ms);
+ logged = false;
+
+ while (VirtualTransactionIdIsValid(*waitlist))
+ {
+ /*
+ * log that we have been waiting for a while now...
+ */
+ if (!logged && standbyWait_ms > 500)
+ {
+ elog(trace_recovery(DEBUG5),
+ "virtual transaction %u/%u is blocking %s",
+ waitlist->backendId,
+ waitlist->localTransactionId,
+ reason);
+ logged = true;
+ }
+
+ if (ConditionalVirtualXactLockTableWait(*waitlist))
+ {
+ waitlist++;
+ InitStandbyDelayTimers(&currentDelay_ms, &standbyWait_ms);
+ logged = false;
+ }
+ else if (WaitExceedsMaxStandbyDelay(&currentDelay_ms,
+ &standbyWait_ms))
+ {
+ /*
+ * Now find out who to throw out of the balloon.
+ */
+ int pid;
+
+ Assert(VirtualTransactionIdIsValid(*waitlist));
+ pid = VirtualTransactionIdGetPid(*waitlist);
+
+ /*
+ * Kill the pid if it's still here. If not, that's what we wanted
+ * so ignore any errors.
+ */
+ if (pid != 0)
+ {
+ elog(LOG,
+ "recovery cancels activity of virtual transaction %u/%u pid %d "
+ "because it blocks %s (current delay now %d secs)",
+ waitlist->backendId,
+ waitlist->localTransactionId,
+ pid, reason,
+ currentDelay_ms / 1000);
+ kill(pid, SIGINT);
+
+ /* wait awhile for it to die */
+ pg_usleep(wontDieWait * 5000L);
+ wontDieWait *= 2;
+ }
+ }
+ }
+ }
+
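To make the backoff concrete: standbyWait_ms starts at 1 ms and doubles after
each failed recheck, capped at 1 s, so the first ten sleeps total roughly
1+2+4+...+512 = 1023 ms; after that the standby polls once per second until
currentDelay_ms exceeds max_standby_delay (or forever when max_standby_delay
is negative). A sketch of the canonical wait loop built from these two
helpers (ConditionSatisfied() is a placeholder predicate;
ResolveRecoveryConflictWithVirtualXIDs() above is the real example):

    int currentDelay_ms;
    int standbyWait_ms;

    InitStandbyDelayTimers(&currentDelay_ms, &standbyWait_ms);
    while (!ConditionSatisfied())
    {
        if (WaitExceedsMaxStandbyDelay(&currentDelay_ms, &standbyWait_ms))
        {
            /* out of patience: cancel the blocker, e.g. kill(pid, SIGINT) */
            break;
        }
    }
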
+ /*
+ * Locking in Recovery Mode
+ *
+ * All locks are held by the Startup process using a single virtual
+ * transaction. This implementation is both simpler and in some senses,
+ * more correct. The locks held mean "some original transaction held
+ * this lock, so query access is not allowed at this time". So the Startup
+ * process is the proxy by which the original locks are implemented.
+ *
+ * We only keep track of AccessExclusiveLocks, which are only ever held by
+ * one transaction on one relation. So we don't worry too much about keeping
+ * track of which xid holds which lock, we just track which slot holds the
+ * lock. This makes this scheme self-cleaning in case lock holders die
+ * without leaving a trace in the WAL.
+ *
+ * We keep a single dynamically expandable lock list in local memory.
+ * List elements use type xl_rel_lock, since the WAL record type exactly
+ * matches the information that we need to keep track of.
+ *
+ * We use session locks rather than normal locks so we don't need owners.
+ */
+
+ /* called by relation_redo_lock() */
+ static void
+ RelationAddRecoveryLock(xl_rel_lock *lockRequest)
+ {
+ xl_rel_lock *newlock;
+ LOCKTAG locktag;
+ MemoryContext old_context;
+
+ elog(trace_recovery(DEBUG4),
+ "adding recovery lock: db %d rel %d",
+ lockRequest->dbOid, lockRequest->relOid);
+
+ /*
+ * dbOid is InvalidOid when we are locking a shared relation.
+ */
+ Assert(OidIsValid(lockRequest->relOid));
+
+ if (RelationLockContext == NULL)
+ RelationLockContext = AllocSetContextCreate(TopMemoryContext,
+ "RelationLocks",
+ ALLOCSET_DEFAULT_MINSIZE,
+ ALLOCSET_DEFAULT_INITSIZE,
+ ALLOCSET_DEFAULT_MAXSIZE);
+
+ old_context = MemoryContextSwitchTo(RelationLockContext);
+ newlock = palloc(sizeof(xl_rel_lock));
+ MemoryContextSwitchTo(old_context);
+
+ newlock->xid = lockRequest->xid;
+ newlock->dbOid = lockRequest->dbOid;
+ newlock->relOid = lockRequest->relOid;
+ RecoveryLockList = lappend(RecoveryLockList, newlock);
+
+ /*
+ * Attempt to acquire the lock as requested.
+ */
+ SET_LOCKTAG_RELATION(locktag, newlock->dbOid, newlock->relOid);
+
+ /*
+ * Wait for the lock to clear, or kill anyone in our way. Not a
+ * completely foolproof way of getting the lock, but we cannot
+ * afford to sit and wait for the lock indefinitely. This is
+ * one reason to reduce the strength of various locks in 8.4.
+ */
+ while (LockAcquire(&locktag, AccessExclusiveLock, true, true)
+ == LOCKACQUIRE_NOT_AVAIL)
+ {
+ VirtualTransactionId *old_lockholders;
+
+ old_lockholders = GetLockConflicts(&locktag, AccessExclusiveLock);
+ ResolveRecoveryConflictWithVirtualXIDs(old_lockholders,
+ "exclusive locks");
+ }
+ }
+
+ static void
+ RelationRemoveRecoveryLocks(TransactionId xid)
+ {
+ ListCell *l;
+ LOCKTAG locktag;
+ List *deletionList = NIL;
+
+ /*
+ * Release all matching locks and identify list elements to remove
+ */
+ foreach(l, RecoveryLockList)
+ {
+ xl_rel_lock *lock = (xl_rel_lock *) lfirst(l);
+
+ elog(trace_recovery(DEBUG4),
+ "releasing recovery lock: xid %u db %d rel %d",
+ lock->xid, lock->dbOid, lock->relOid);
+
+ if (!TransactionIdIsValid(xid) || lock->xid == xid)
+ {
+ SET_LOCKTAG_RELATION(locktag, lock->dbOid, lock->relOid);
+ if (!LockRelease(&locktag, AccessExclusiveLock, true))
+ elog(trace_recovery(LOG),
+ "RecoveryLockList contains entry for lock "
+ "no longer recorded by lock manager "
+ "xid %u database %d relation %d",
+ lock->xid, lock->dbOid, lock->relOid);
+ deletionList = lappend(deletionList, lock);
+ }
+ }
+
+ /*
+ * Now remove the elements from RecoveryLockList. We can't navigate
+ * the list at the same time as deleting multiple elements from it.
+ */
+ foreach(l, deletionList)
+ {
+ xl_rel_lock *lock = (xl_rel_lock *) lfirst(l);
+
+ RecoveryLockList = list_delete_ptr(RecoveryLockList, lock);
+ pfree(lock);
+ }
+ }
+
+
+ /*
+ * Called during xact_redo_commit() and xact_redo_abort() when InArchiveRecovery
+ * to remove any AccessExclusiveLocks requested by a transaction.
+ *
+ * Remove all locks for this xid from the RecoveryLockList.
+ */
+ void
+ RelationReleaseRecoveryLocks(TransactionId xid)
+ {
+ RelationRemoveRecoveryLocks(xid);
+ }
+
+ /*
+ * Called at end of recovery and when we see a shutdown checkpoint.
+ */
+ void
+ RelationClearRecoveryLocks(void)
+ {
+ elog(trace_recovery(DEBUG1), "clearing recovery locks");
+ RelationRemoveRecoveryLocks(InvalidTransactionId);
+ }
+
+ /*
+ * --------------------------------------------------
+ * Recovery handling for Rmgr RM_RELATION_ID
+ * --------------------------------------------------
+ */
+
+ /*
+ * Redo for relation lock messages
+ */
+ static void
+ relation_redo_lock(xl_rel_lock *xlrec)
+ {
+ RelationAddRecoveryLock(xlrec);
+ }
+
+ /*
+ * Redo for relation invalidation messages
+ */
+ static void
+ relation_redo_inval(xl_rel_inval *xlrec)
+ {
+ SharedInvalidationMessage *msgs = &(xlrec->msgs[0]);
+ int nmsgs = xlrec->nmsgs;
+
+ Assert(nmsgs > 0); /* else we should not have written a record */
+
+ /*
+ * Smack them straight onto the queue and we're done. This is safe
+ * because the only writer of these messages is non-transactional
+ * invalidation.
+ */
+ SendSharedInvalidMessages(msgs, nmsgs);
+ }
+
+ void
+ relation_redo(XLogRecPtr lsn, XLogRecord *record)
+ {
+ uint8 info = record->xl_info & ~XLR_INFO_MASK;
+
+ if (info == XLOG_RELATION_INVAL)
+ {
+ xl_rel_inval *xlrec = (xl_rel_inval *) XLogRecGetData(record);
+
+ relation_redo_inval(xlrec);
+ }
+ else if (info == XLOG_RELATION_LOCK)
+ {
+ xl_rel_lock *xlrec = (xl_rel_lock *) XLogRecGetData(record);
+
+ relation_redo_lock(xlrec);
+ }
+ else
+ elog(PANIC, "relation_redo: unknown op code %u", info);
+ }
+
+ static void
+ relation_desc_inval(StringInfo buf, xl_rel_inval *xlrec)
+ {
+ SharedInvalidationMessage *msgs = &(xlrec->msgs[0]);
+ int nmsgs = xlrec->nmsgs;
+
+ appendStringInfo(buf, "nmsgs %d;", nmsgs);
+
+ if (nmsgs > 0)
+ {
+ int i;
+
+ for (i = 0; i < nmsgs; i++)
+ {
+ SharedInvalidationMessage *msg = msgs + i;
+
+ if (msg->id >= 0)
+ appendStringInfo(buf, "catcache id %d", msg->id);
+ else if (msg->id == SHAREDINVALRELCACHE_ID)
+ appendStringInfo(buf, "relcache ");
+ else if (msg->id == SHAREDINVALSMGR_ID)
+ appendStringInfo(buf, "smgr ");
+ }
+ }
+ }
+
+ void
+ relation_desc(StringInfo buf, uint8 xl_info, char *rec)
+ {
+ uint8 info = xl_info & ~XLR_INFO_MASK;
+
+ if (info == XLOG_RELATION_INVAL)
+ {
+ xl_rel_inval *xlrec = (xl_rel_inval *) rec;
+
+ appendStringInfo(buf, "inval: ");
+ relation_desc_inval(buf, xlrec);
+ }
+ else if (info == XLOG_RELATION_LOCK)
+ {
+ xl_rel_lock *xlrec = (xl_rel_lock *) rec;
+
+ appendStringInfo(buf, "exclusive relation lock: xid %u db %d rel %d",
+ xlrec->xid, xlrec->dbOid, xlrec->relOid);
+ }
+ else
+ appendStringInfo(buf, "UNKNOWN");
+ }
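For reference, relation_desc() renders the two record types like this
(values illustrative):

    inval: nmsgs 2;relcache smgr
    exclusive relation lock: xid 1234 db 16384 rel 16385
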
*** src/backend/utils/error/elog.c
--- src/backend/utils/error/elog.c
***************
*** 2579,2581 **** is_log_level_output(int elevel, int log_min_level)
--- 2579,2598 ----
return false;
}
+
+ /*
+ * If trace_recovery_messages is set to make this visible, then promote the
+ * message to LOG; otherwise emit it at the requested level. It may still
+ * be shown then, but only if log_min_messages is set low enough.
+ *
+ * The intention is to keep this for at least the whole of the 8.4 production
+ * release, so we can more easily diagnose production problems in the field.
+ */
+ int
+ trace_recovery(int trace_level)
+ {
+ if (trace_level >= trace_recovery_messages)
+ return LOG;
+
+ return trace_level;
+ }
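Usage note: callers pass the level they would normally give to elog(), as the
patch itself does in inval.c. A message logged this way is promoted to LOG
when trace_recovery_messages is set at or below that level, and otherwise
falls through to the normal log_min_messages filtering:

    /* emitted at DEBUG4 normally; promoted to LOG when
     * trace_recovery_messages is DEBUG4 or lower */
    elog(trace_recovery(DEBUG4),
         "adding recovery lock: db %d rel %d",
         lockRequest->dbOid, lockRequest->relOid);
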
*** src/backend/utils/init/flatfiles.c
--- src/backend/utils/init/flatfiles.c
***************
*** 678,686 **** write_auth_file(Relation rel_authid, Relation rel_authmem)
/*
* This routine is called once during database startup, after completing
* WAL replay if needed. Its purpose is to sync the flat files with the
! * current state of the database tables. This is particularly important
! * during PITR operation, since the flat files will come from the
! * base backup which may be far out of sync with the current state.
*
* In theory we could skip rebuilding the flat files if no WAL replay
* occurred, but it seems best to just do it always. We have to
--- 678,687 ----
/*
* This routine is called once during database startup, after completing
* WAL replay if needed. Its purpose is to sync the flat files with the
! * current state of the database tables.
! *
! * In 8.4 we also run this during xact_redo_commit() if the transaction
! * wrote a new database or auth flat file.
*
* In theory we could skip rebuilding the flat files if no WAL replay
* occurred, but it seems best to just do it always. We have to
***************
*** 716,723 **** BuildFlatFiles(bool database_only)
/*
* We don't have any hope of running a real relcache, but we can use the
* same fake-relcache facility that WAL replay uses.
- *
- * No locking is needed because no one else is alive yet.
*/
rel_db = CreateFakeRelcacheEntry(rnode);
write_database_file(rel_db, true);
--- 717,722 ----
***************
*** 832,845 **** AtEOXact_UpdateFlatFiles(bool isCommit)
/* Okay to write the files */
if (database_file_update_subid != InvalidSubTransactionId)
{
! database_file_update_subid = InvalidSubTransactionId;
write_database_file(drel, false);
heap_close(drel, NoLock);
}
if (auth_file_update_subid != InvalidSubTransactionId)
{
! auth_file_update_subid = InvalidSubTransactionId;
write_auth_file(arel, mrel);
heap_close(arel, NoLock);
heap_close(mrel, NoLock);
--- 831,844 ----
/* Okay to write the files */
if (database_file_update_subid != InvalidSubTransactionId)
{
! /* reset database_file_update_subid later during commit */
write_database_file(drel, false);
heap_close(drel, NoLock);
}
if (auth_file_update_subid != InvalidSubTransactionId)
{
! /* reset auth_file_update_subid later during commit */
write_auth_file(arel, mrel);
heap_close(arel, NoLock);
heap_close(mrel, NoLock);
***************
*** 859,864 **** AtEOXact_UpdateFlatFiles(bool isCommit)
--- 858,887 ----
ForceSyncCommit();
}
+ /*
+ * Exported to allow transaction commit to set flags to perform flat file
+ * update in redo. Reset per-transaction flags. For abort case they were
+ * already set during AtEOXact_UpdateFlatFiles().
+ */
+ bool
+ AtEOXact_Database_FlatFile_Update_Needed(void)
+ {
+ bool result = TransactionIdIsValid(database_file_update_subid);
+
+ database_file_update_subid = InvalidSubTransactionId;
+
+ return result;
+ }
+
+ bool
+ AtEOXact_Auth_FlatFile_Update_Needed(void)
+ {
+ bool result = TransactionIdIsValid(auth_file_update_subid);
+
+ auth_file_update_subid = InvalidSubTransactionId;
+
+ return result;
+ }
/*
* This routine is called during transaction prepare.
*** src/backend/utils/init/postinit.c
--- src/backend/utils/init/postinit.c
***************
*** 440,446 **** InitPostgres(const char *in_dbname, Oid dboid, const char *username,
*/
MyBackendId = InvalidBackendId;
! SharedInvalBackendInit();
if (MyBackendId > MaxBackends || MyBackendId <= 0)
elog(FATAL, "bad backend id: %d", MyBackendId);
--- 440,446 ----
*/
MyBackendId = InvalidBackendId;
! SharedInvalBackendInit(false);
if (MyBackendId > MaxBackends || MyBackendId <= 0)
elog(FATAL, "bad backend id: %d", MyBackendId);
***************
*** 489,497 **** InitPostgres(const char *in_dbname, Oid dboid, const char *username,
--- 489,503 ----
* Start a new transaction here before first access to db, and get a
* snapshot. We don't have a use for the snapshot itself, but we're
* interested in the secondary effect that it sets RecentGlobalXmin.
+ * If we are connecting during recovery, make sure the initial
+ * transaction is read only and force all subsequent transactions
+ * that way also.
*/
if (!bootstrap)
{
+ if (IsRecoveryProcessingMode())
+ SetConfigOption("default_transaction_read_only", "true",
+ PGC_POSTMASTER, PGC_S_OVERRIDE);
StartTransactionCommand();
(void) GetTransactionSnapshot();
}
***************
*** 515,521 **** InitPostgres(const char *in_dbname, Oid dboid, const char *username,
*/
if (!bootstrap)
LockSharedObject(DatabaseRelationId, MyDatabaseId, 0,
! RowExclusiveLock);
/*
* Recheck the flat file copy of pg_database to make sure the target
--- 521,527 ----
*/
if (!bootstrap)
LockSharedObject(DatabaseRelationId, MyDatabaseId, 0,
! (IsRecoveryProcessingMode() ? AccessShareLock : RowExclusiveLock));
/*
* Recheck the flat file copy of pg_database to make sure the target
*** src/backend/utils/misc/guc.c
--- src/backend/utils/misc/guc.c
***************
*** 114,119 **** extern char *temp_tablespaces;
--- 114,121 ----
extern bool synchronize_seqscans;
extern bool fullPageWrites;
+ int trace_recovery_messages = DEBUG1;
+
#ifdef TRACE_SORT
extern bool trace_sort;
#endif
***************
*** 2609,2614 **** static struct config_enum ConfigureNamesEnum[] =
--- 2611,2626 ----
},
{
+ {"trace_recovery_messages", PGC_SUSET, LOGGING_WHEN,
+ gettext_noop("Sets the message levels that are logged during recovery."),
+ gettext_noop("Each level includes all the levels that follow it. The later"
+ " the level, the fewer messages are sent.")
+ },
+ &trace_recovery_messages,
+ DEBUG1, server_message_level_options, NULL, NULL
+ },
+
+ {
{"track_functions", PGC_SUSET, STATS_COLLECTOR,
gettext_noop("Collects function-level statistics on database activity."),
NULL
***************
*** 5475,5482 **** ExecSetVariableStmt(VariableSetStmt *stmt)
--- 5487,5505 ----
SetPGVariable("transaction_isolation",
list_make1(item->arg), stmt->is_local);
else if (strcmp(item->defname, "transaction_read_only") == 0)
+ {
+ A_Const *con;
+
+ Assert(IsA(item->arg, A_Const));
+ con = (A_Const *) item->arg;
+ Assert(nodeTag(&con->val) == T_Integer);
+
+ if (!intVal(&con->val))
+ PreventCommandDuringRecovery();
+
SetPGVariable("transaction_read_only",
list_make1(item->arg), stmt->is_local);
+ }
else
elog(ERROR, "unexpected SET TRANSACTION element: %s",
item->defname);
***************
*** 5494,5501 **** ExecSetVariableStmt(VariableSetStmt *stmt)
--- 5517,5535 ----
SetPGVariable("default_transaction_isolation",
list_make1(item->arg), stmt->is_local);
else if (strcmp(item->defname, "transaction_read_only") == 0)
+ {
+ A_Const *con;
+
+ Assert(IsA(item->arg, A_Const));
+ con = (A_Const *) item->arg;
+ Assert(nodeTag(&con->val) == T_Integer);
+
+ if (!intVal(&con->val))
+ PreventCommandDuringRecovery();
+
SetPGVariable("default_transaction_read_only",
list_make1(item->arg), stmt->is_local);
+ }
else
elog(ERROR, "unexpected SET SESSION element: %s",
item->defname);
*** src/backend/utils/time/tqual.c
--- src/backend/utils/time/tqual.c
***************
*** 86,92 **** static inline void
SetHintBits(HeapTupleHeader tuple, Buffer buffer,
uint16 infomask, TransactionId xid)
{
! if (TransactionIdIsValid(xid))
{
/* NB: xid must be known committed here! */
XLogRecPtr commitLSN = TransactionIdGetCommitLSN(xid);
--- 86,92 ----
SetHintBits(HeapTupleHeader tuple, Buffer buffer,
uint16 infomask, TransactionId xid)
{
! if (!IsRecoveryProcessingMode() && TransactionIdIsValid(xid))
{
/* NB: xid must be known committed here! */
XLogRecPtr commitLSN = TransactionIdGetCommitLSN(xid);
***************
*** 1238,1263 **** XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot)
return true;
/*
! * If the snapshot contains full subxact data, the fastest way to check
! * things is just to compare the given XID against both subxact XIDs and
! * top-level XIDs. If the snapshot overflowed, we have to use pg_subtrans
! * to convert a subxact XID to its parent XID, but then we need only look
! * at top-level XIDs not subxacts.
*/
- if (snapshot->subxcnt >= 0)
- {
- /* full data, so search subxip */
- int32 j;
! for (j = 0; j < snapshot->subxcnt; j++)
! {
! if (TransactionIdEquals(xid, snapshot->subxip[j]))
return true;
}
! /* not there, fall through to search xip[] */
! }
! else
{
/* overflowed, so convert xid to top-level */
xid = SubTransGetTopmostTransaction(xid);
--- 1238,1289 ----
return true;
/*
! * Our strategy for checking xids changed in 8.4. Prior to 8.4
! * we either checked the subxid cache on the snapshot or we
! * checked subtrans. That was much more efficient than just using
! * subtrans, but it had some problems. First, as soon as *any*
! * transaction had more than 64 subtransactions we forced *all*
! * snapshots to check against subtrans, giving a sharp modal
! * change in behaviour. Second because we either checked subtrans
! * or the snapshot, we were forced to place entries in subtrans
! * in case the snapshot later overflowed, even if we never
! * actually checked subtrans.
! *
! * In 8.4 we improve on that scheme in a number of ways. As before
! * we check subtrans if the snapshot has overflowed. We *also*
! * check the subxid cache. This has two benefits: first the
! * behaviour degrades gracefully when the cache overflows, so we
! * retain much of its benefit if it has only just overflowed.
! * Second, a transaction doesn't need to insert entries into
! * subtrans until its own personal subxid cache overflows. This
! * means entries into subtrans become significantly rarer,
! * perhaps less than 1% of the previous insert rate, giving
! * considerable benefit for transactions using only a few
! * subtransactions.
! *
! * This behaviour is also necessary for allowing snapshots to work
! * correctly on a standby server. By this subtle change of behaviour
! * we can now utilise the subxid cache to store "unobserved xids",
! * whose existence we infer from watching the arrival sequence
! * of newly observed transaction ids in the WAL.
*/
! /*
! * First, compare the given XID against cached subxact XIDs.
! */
! for (i = 0; i < snapshot->subxcnt; i++)
! {
! if (TransactionIdEquals(xid, snapshot->subxip[i]))
return true;
}
! /*
! * If the snapshot overflowed and we haven't already located the xid
! * we also have to consult pg_subtrans. We use subtrans to convert a
! * subxact XID to its parent XID, so that we can then check the status
! * of the top-level TransactionId.
! */
! if (snapshot->suboverflowed)
{
/* overflowed, so convert xid to top-level */
xid = SubTransGetTopmostTransaction(xid);
***************
*** 1270,1275 **** XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot)
--- 1296,1305 ----
return false;
}
+ /*
+ * By now xid is either not present, or a top-level xid. So now
+ * we just need to check the main transaction ids.
+ */
for (i = 0; i < snapshot->xcnt; i++)
{
if (TransactionIdEquals(xid, snapshot->xip[i]))
*** src/bin/pg_controldata/pg_controldata.c
--- src/bin/pg_controldata/pg_controldata.c
***************
*** 197,202 **** main(int argc, char *argv[])
--- 197,205 ----
printf(_("Minimum recovery ending location: %X/%X\n"),
ControlFile.minRecoveryPoint.xlogid,
ControlFile.minRecoveryPoint.xrecoff);
+ printf(_("Minimum safe starting location: %X/%X\n"),
+ ControlFile.minSafeStartPoint.xlogid,
+ ControlFile.minSafeStartPoint.xrecoff);
printf(_("Maximum data alignment: %u\n"),
ControlFile.maxAlign);
/* we don't print floatFormat since can't say much useful about it */
*** src/bin/pg_resetxlog/pg_resetxlog.c
--- src/bin/pg_resetxlog/pg_resetxlog.c
***************
*** 603,608 **** RewriteControlFile(void)
--- 603,610 ----
ControlFile.prevCheckPoint.xrecoff = 0;
ControlFile.minRecoveryPoint.xlogid = 0;
ControlFile.minRecoveryPoint.xrecoff = 0;
+ ControlFile.minSafeStartPoint.xlogid = 0;
+ ControlFile.minSafeStartPoint.xrecoff = 0;
/* Now we can force the recorded xlog seg size to the right thing. */
ControlFile.xlog_seg_size = XLogSegSize;
*** src/include/access/heapam.h
--- src/include/access/heapam.h
***************
*** 130,140 **** extern void heap2_desc(StringInfo buf, uint8 xl_info, char *rec);
extern XLogRecPtr log_heap_move(Relation reln, Buffer oldbuf,
ItemPointerData from,
Buffer newbuf, HeapTuple newtup);
extern XLogRecPtr log_heap_clean(Relation reln, Buffer buffer,
OffsetNumber *redirected, int nredirected,
OffsetNumber *nowdead, int ndead,
OffsetNumber *nowunused, int nunused,
! bool redirect_move);
extern XLogRecPtr log_heap_freeze(Relation reln, Buffer buffer,
TransactionId cutoff_xid,
OffsetNumber *offsets, int offcnt);
--- 130,142 ----
extern XLogRecPtr log_heap_move(Relation reln, Buffer oldbuf,
ItemPointerData from,
Buffer newbuf, HeapTuple newtup);
+ extern XLogRecPtr log_heap_cleanup_info(RelFileNode rnode,
+ TransactionId latestRemovedXid);
extern XLogRecPtr log_heap_clean(Relation reln, Buffer buffer,
OffsetNumber *redirected, int nredirected,
OffsetNumber *nowdead, int ndead,
OffsetNumber *nowunused, int nunused,
! TransactionId latestRemovedXid, bool redirect_move);
extern XLogRecPtr log_heap_freeze(Relation reln, Buffer buffer,
TransactionId cutoff_xid,
OffsetNumber *offsets, int offcnt);
*** src/include/access/htup.h
--- src/include/access/htup.h
***************
*** 580,585 **** typedef HeapTupleData *HeapTuple;
--- 580,586 ----
#define XLOG_HEAP2_FREEZE 0x00
#define XLOG_HEAP2_CLEAN 0x10
#define XLOG_HEAP2_CLEAN_MOVE 0x20
+ #define XLOG_HEAP2_CLEANUP_INFO 0x30
/*
* All what we need to find changed tuple
***************
*** 668,673 **** typedef struct xl_heap_clean
--- 669,675 ----
{
RelFileNode node;
BlockNumber block;
+ TransactionId latestRemovedXid;
uint16 nredirected;
uint16 ndead;
/* OFFSET NUMBERS FOLLOW */
***************
*** 675,680 **** typedef struct xl_heap_clean
--- 677,695 ----
#define SizeOfHeapClean (offsetof(xl_heap_clean, ndead) + sizeof(uint16))
+ /*
+ * Cleanup_info is required in some cases during a lazy VACUUM.
+ * Used for reporting the results of HeapTupleHeaderAdvanceLatestRemovedXid();
+ * see vacuumlazy.c for a full explanation.
+ */
+ typedef struct xl_heap_cleanup_info
+ {
+ RelFileNode node;
+ TransactionId latestRemovedXid;
+ } xl_heap_cleanup_info;
+
+ #define SizeOfHeapCleanupInfo (sizeof(xl_heap_cleanup_info))
+
/* This is for replacing a page's contents in toto */
/* NB: this is used for indexes as well as heaps */
typedef struct xl_heap_newpage
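Putting these pieces together, the intended flow in lazy VACUUM is roughly the
following sketch (onerel and tuple are assumed locals; vacuumlazy.c holds the
real logic, per the comment above):

    TransactionId latestRemovedXid = InvalidTransactionId;

    /* while collecting dead tuples: advance the horizon we will remove */
    HeapTupleHeaderAdvanceLatestRemovedXid(tuple->t_data, &latestRemovedXid);

    /* before actually removing heap tuples, log how far we will go, so a
     * standby can resolve snapshot conflicts first */
    if (TransactionIdIsValid(latestRemovedXid))
        (void) log_heap_cleanup_info(onerel->rd_node, latestRemovedXid);
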
***************
*** 718,723 **** typedef struct xl_heap_freeze
--- 733,741 ----
#define SizeOfHeapFreeze (offsetof(xl_heap_freeze, cutoff_xid) + sizeof(TransactionId))
+ extern void HeapTupleHeaderAdvanceLatestRemovedXid(HeapTupleHeader tuple,
+ TransactionId *latestRemovedXid);
+
/* HeapTupleHeader functions implemented in utils/time/combocid.c */
extern CommandId HeapTupleHeaderGetCmin(HeapTupleHeader tup);
extern CommandId HeapTupleHeaderGetCmax(HeapTupleHeader tup);
*** src/include/access/nbtree.h
--- src/include/access/nbtree.h
***************
*** 214,225 **** typedef struct BTMetaPageData
#define XLOG_BTREE_SPLIT_R 0x40 /* as above, new item on right */
#define XLOG_BTREE_SPLIT_L_ROOT 0x50 /* add tuple with split of root */
#define XLOG_BTREE_SPLIT_R_ROOT 0x60 /* as above, new item on right */
! #define XLOG_BTREE_DELETE 0x70 /* delete leaf index tuple */
#define XLOG_BTREE_DELETE_PAGE 0x80 /* delete an entire page */
#define XLOG_BTREE_DELETE_PAGE_META 0x90 /* same, and update metapage */
#define XLOG_BTREE_NEWROOT 0xA0 /* new root page */
#define XLOG_BTREE_DELETE_PAGE_HALF 0xB0 /* page deletion that makes
* parent half-dead */
/*
* All that we need to find changed index tuple
--- 214,226 ----
#define XLOG_BTREE_SPLIT_R 0x40 /* as above, new item on right */
#define XLOG_BTREE_SPLIT_L_ROOT 0x50 /* add tuple with split of root */
#define XLOG_BTREE_SPLIT_R_ROOT 0x60 /* as above, new item on right */
! #define XLOG_BTREE_DELETE 0x70 /* delete leaf index tuples for a page */
#define XLOG_BTREE_DELETE_PAGE 0x80 /* delete an entire page */
#define XLOG_BTREE_DELETE_PAGE_META 0x90 /* same, and update metapage */
#define XLOG_BTREE_NEWROOT 0xA0 /* new root page */
#define XLOG_BTREE_DELETE_PAGE_HALF 0xB0 /* page deletion that makes
* parent half-dead */
+ #define XLOG_BTREE_VACUUM 0xC0 /* delete entries on a page during vacuum */
/*
* All that we need to find changed index tuple
***************
*** 306,321 **** typedef struct xl_btree_split
/*
* This is what we need to know about delete of individual leaf index tuples.
* The WAL record can represent deletion of any number of index tuples on a
! * single index page.
*/
typedef struct xl_btree_delete
{
RelFileNode node;
BlockNumber block;
/* TARGET OFFSET NUMBERS FOLLOW AT THE END */
} xl_btree_delete;
! #define SizeOfBtreeDelete (offsetof(xl_btree_delete, block) + sizeof(BlockNumber))
/*
* This is what we need to know about deletion of a btree page. The target
--- 307,359 ----
/*
* This is what we need to know about delete of individual leaf index tuples.
* The WAL record can represent deletion of any number of index tuples on a
! * single index page when *not* executed by VACUUM.
*/
typedef struct xl_btree_delete
{
RelFileNode node;
BlockNumber block;
+ TransactionId latestRemovedXid;
+ int numItems; /* number of items in the offset array */
+
/* TARGET OFFSET NUMBERS FOLLOW AT THE END */
} xl_btree_delete;
! #define SizeOfBtreeDelete (offsetof(xl_btree_delete, latestRemovedXid) + sizeof(TransactionId))
!
! /*
! * This is what we need to know about vacuum of individual leaf index tuples.
! * The WAL record can represent deletion of any number of index tuples on a
! * single index page when executed by VACUUM.
! *
! * The correctness requirement for applying these changes during recovery is
! * that we must do one of these two things for every block in the index:
! * * lock the block for cleanup and apply any required changes
! * * EnsureBlockUnpinned()
! * The purpose of this is to ensure that no index scans started before we
! * finish scanning the index are still running by the time we begin to remove
! * heap tuples.
! *
! * Any changes to any one block are registered on just one WAL record. All
! * blocks that we need to run EnsureBlockUnpinned() before we touch the changed
! * block are also given on this record as a variable length array. The array
! * is compressed by way of storing an array of block ranges, rather than an
! * actual array of blockids.
! *
! * Note that the *last* WAL record in any vacuum of an index is allowed to
! * have numItems == 0. All other WAL records must have numItems > 0.
! */
! typedef struct xl_btree_vacuum
! {
! RelFileNode node;
! BlockNumber block;
! BlockNumber lastBlockVacuumed;
! int numItems; /* number of items in the offset array */
!
! /* TARGET OFFSET NUMBERS FOLLOW */
! } xl_btree_vacuum;
!
! #define SizeOfBtreeVacuum (offsetof(xl_btree_vacuum, lastBlockVacuumed) + sizeof(BlockNumber))
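A minimal sketch of the redo rule stated above (this is not the patch's
actual btree redo routine; XLogReadBufferForCleanup() is taken from this
patch's xlogutils.h changes, and the block-range array handling is elided):

    xl_btree_vacuum *xlrec = (xl_btree_vacuum *) XLogRecGetData(record);
    BlockNumber blkno;

    /*
     * Take and release a cleanup lock on every block between the last one
     * vacuumed and the target, guaranteeing no pre-existing scan still
     * holds a pin on any of them.
     */
    for (blkno = xlrec->lastBlockVacuumed + 1; blkno < xlrec->block; blkno++)
    {
        Buffer buf = XLogReadBufferForCleanup(xlrec->node, blkno, false);

        if (BufferIsValid(buf))
            UnlockReleaseBuffer(buf);
    }

    /* then lock xlrec->block itself for cleanup and remove the items */
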
/*
* This is what we need to know about deletion of a btree page. The target
***************
*** 498,503 **** typedef BTScanOpaqueData *BTScanOpaque;
--- 536,545 ----
#define SK_BT_DESC (INDOPTION_DESC << SK_BT_INDOPTION_SHIFT)
#define SK_BT_NULLS_FIRST (INDOPTION_NULLS_FIRST << SK_BT_INDOPTION_SHIFT)
+ /* XXX probably needs new RMgr call to do this cleanly */
+ extern bool btree_is_cleanup_record(uint8 info);
+ extern bool btree_needs_cleanup_lock(uint8 info);
+
/*
* prototypes for functions in nbtree.c (external entry points for btree)
*/
***************
*** 537,543 **** extern void _bt_relbuf(Relation rel, Buffer buf);
extern void _bt_pageinit(Page page, Size size);
extern bool _bt_page_recyclable(Page page);
extern void _bt_delitems(Relation rel, Buffer buf,
! OffsetNumber *itemnos, int nitems);
extern int _bt_pagedel(Relation rel, Buffer buf,
BTStack stack, bool vacuum_full);
--- 579,586 ----
extern void _bt_pageinit(Page page, Size size);
extern bool _bt_page_recyclable(Page page);
extern void _bt_delitems(Relation rel, Buffer buf,
! OffsetNumber *itemnos, int nitems, bool isVacuum,
! BlockNumber lastBlockVacuumed);
extern int _bt_pagedel(Relation rel, Buffer buf,
BTStack stack, bool vacuum_full);
*** src/include/access/rmgr.h
--- src/include/access/rmgr.h
***************
*** 23,28 **** typedef uint8 RmgrId;
--- 23,29 ----
#define RM_DBASE_ID 4
#define RM_TBLSPC_ID 5
#define RM_MULTIXACT_ID 6
+ #define RM_RELATION_ID 8
#define RM_HEAP2_ID 9
#define RM_HEAP_ID 10
#define RM_BTREE_ID 11
*** src/include/access/xact.h
--- src/include/access/xact.h
***************
*** 17,22 ****
--- 17,23 ----
#include "access/xlog.h"
#include "nodes/pg_list.h"
#include "storage/relfilenode.h"
+ #include "utils/snapshot.h"
#include "utils/timestamp.h"
***************
*** 84,111 **** typedef void (*SubXactCallback) (SubXactEvent event, SubTransactionId mySubid,
#define XLOG_XACT_ABORT 0x20
#define XLOG_XACT_COMMIT_PREPARED 0x30
#define XLOG_XACT_ABORT_PREPARED 0x40
typedef struct xl_xact_commit
{
! TimestampTz xact_time; /* time of commit */
! int nrels; /* number of RelFileNodes */
! int nsubxacts; /* number of subtransaction XIDs */
! /* Array of RelFileNode(s) to drop at commit */
! RelFileNode xnodes[1]; /* VARIABLE LENGTH ARRAY */
! /* ARRAY OF COMMITTED SUBTRANSACTION XIDs FOLLOWS */
} xl_xact_commit;
#define MinSizeOfXactCommit offsetof(xl_xact_commit, xnodes)
typedef struct xl_xact_abort
{
TimestampTz xact_time; /* time of abort */
int nrels; /* number of RelFileNodes */
int nsubxacts; /* number of subtransaction XIDs */
/* Array of RelFileNode(s) to drop at abort */
RelFileNode xnodes[1]; /* VARIABLE LENGTH ARRAY */
/* ARRAY OF ABORTED SUBTRANSACTION XIDs FOLLOWS */
} xl_xact_abort;
#define MinSizeOfXactAbort offsetof(xl_xact_abort, xnodes)
--- 85,159 ----
#define XLOG_XACT_ABORT 0x20
#define XLOG_XACT_COMMIT_PREPARED 0x30
#define XLOG_XACT_ABORT_PREPARED 0x40
+ #define XLOG_XACT_ASSIGNMENT 0x50
+ #define XLOG_XACT_RUNNING_XACTS 0x60
+ /* 0x70 can also be used, if required */
+
+ typedef struct xl_xact_assignment
+ {
+ TransactionId xassign; /* assigned xid */
+ TransactionId xparent; /* assigned xid's parent, if any */
+ bool isSubXact; /* is a subtransaction */
+ } xl_xact_assignment;
+
+ /*
+ * xl_xact_running_xacts is in utils/snapshot.h, not snapmgr.h, so it can
+ * be passed around to the same places as snapshots.
+ */
typedef struct xl_xact_commit
{
! TimestampTz xact_time; /* time of commit */
! uint32 xinfo; /* info flags */
! int nrels; /* number of RelFileNodes */
! int nsubxacts; /* number of subtransaction XIDs */
! int nmsgs; /* number of shared inval msgs */
! /* Array of RelFileNode(s) to drop at commit */
! RelFileNode xnodes[1]; /* VARIABLE LENGTH ARRAY */
! /* ARRAY OF COMMITTED SUBTRANSACTION XIDs FOLLOWS */
! /* ARRAY OF SHARED INVALIDATION MESSAGES FOLLOWS */
} xl_xact_commit;
#define MinSizeOfXactCommit offsetof(xl_xact_commit, xnodes)
+ #define OffsetSharedInvalInXactCommit() \
+ ( \
+ MinSizeOfXactCommit + \
+ (xlrec->nsubxacts * sizeof(TransactionId)) + \
+ (xlrec->nrels * sizeof(RelFileNode)) \
+ )
+
+ /*
+ * These flags are set in the xinfo fields of transaction
+ * completion WAL records. They indicate a number of actions
+ * that need to occur when emulating transaction completion.
+ * They are named XactCompletion... to differentiate them from
+ * EOXact... routines which run at the end of the original
+ * transaction completion.
+ */
+ #define XACT_COMPLETION_UNMARKED_SUBXIDS 0x01
+
+ /* These next states only occur on commit record types */
+ #define XACT_COMPLETION_UPDATE_DB_FILE 0x02
+ #define XACT_COMPLETION_UPDATE_AUTH_FILE 0x04
+ #define XACT_COMPLETION_UPDATE_RELCACHE_FILE 0x08
+
+ /* Access macros for above flags */
+ #define XactCompletionHasUnMarkedSubxids(xlrec) ((xlrec)->xinfo & XACT_COMPLETION_UNMARKED_SUBXIDS)
+ #define XactCompletionUpdateDBFile(xlrec) ((xlrec)->xinfo & XACT_COMPLETION_UPDATE_DB_FILE)
+ #define XactCompletionUpdateAuthFile(xlrec) ((xlrec)->xinfo & XACT_COMPLETION_UPDATE_AUTH_FILE)
+ #define XactCompletionRelcacheInitFileInval(xlrec) ((xlrec)->xinfo & XACT_COMPLETION_UPDATE_RELCACHE_FILE)
typedef struct xl_xact_abort
{
TimestampTz xact_time; /* time of abort */
+ uint32 xinfo; /* info flags */
int nrels; /* number of RelFileNodes */
int nsubxacts; /* number of subtransaction XIDs */
/* Array of RelFileNode(s) to drop at abort */
RelFileNode xnodes[1]; /* VARIABLE LENGTH ARRAY */
/* ARRAY OF ABORTED SUBTRANSACTION XIDs FOLLOWS */
} xl_xact_abort;
+ /* Note the intentional lack of an invalidation message array, cf. xl_xact_commit */
#define MinSizeOfXactAbort offsetof(xl_xact_abort, xnodes)
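To illustrate the commit-record layout, here is a sketch of reading back the
variable-length arrays (the authoritative consumer is xact_redo_commit();
note that OffsetSharedInvalInXactCommit() above expects a local named xlrec):

    xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
    TransactionId *subxacts;
    SharedInvalidationMessage *msgs;

    /* committed subtransaction xids follow the RelFileNode array */
    subxacts = (TransactionId *) &xlrec->xnodes[xlrec->nrels];

    /* invalidation messages follow the subxact array */
    msgs = (SharedInvalidationMessage *)
        ((char *) xlrec + OffsetSharedInvalInXactCommit());
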
***************
*** 185,190 **** extern TransactionId RecordTransactionCommit(void);
--- 233,245 ----
extern int xactGetCommittedChildren(TransactionId **ptr);
+ extern void LogCurrentRunningXacts(void);
+ extern bool IsRunningXactDataValid(void);
+
+ extern void InitRecoveryTransactionEnvironment(void);
+ extern void XactResolveRecoveryConflicts(TransactionId latestRemovedXid, Oid recDatabaseOid);
+ extern void RecordKnownAssignedTransactionIds(XLogRecPtr lsn, XLogRecord *record);
+
extern void xact_redo(XLogRecPtr lsn, XLogRecord *record);
extern void xact_desc(StringInfo buf, uint8 xl_info, char *rec);
*** src/include/access/xlog.h
--- src/include/access/xlog.h
***************
*** 46,55 **** typedef struct XLogRecord
TransactionId xl_xid; /* xact id */
uint32 xl_tot_len; /* total len of entire record */
uint32 xl_len; /* total len of rmgr data */
! uint8 xl_info; /* flag bits, see below */
RmgrId xl_rmid; /* resource manager for this record */
! /* Depending on MAXALIGN, there are either 2 or 6 wasted bytes here */
/* ACTUAL LOG DATA FOLLOWS AT END OF STRUCT */
--- 46,56 ----
TransactionId xl_xid; /* xact id */
uint32 xl_tot_len; /* total len of entire record */
uint32 xl_len; /* total len of rmgr data */
! uint8 xl_info; /* flag bits, see below (XLR_ entries) */
RmgrId xl_rmid; /* resource manager for this record */
+ TransactionId xl_parentxid; /* parent_xid if XLR2_FIRST_SUBXID_RECORD is set */
! /* XXX Above structure has 8 byte alignment */
/* ACTUAL LOG DATA FOLLOWS AT END OF STRUCT */
***************
*** 133,139 **** typedef struct XLogRecData
} XLogRecData;
extern TimeLineID ThisTimeLineID; /* current TLI */
! extern bool InRecovery;
extern XLogRecPtr XactLastRecEnd;
/* these variables are GUC parameters related to XLOG */
--- 134,148 ----
} XLogRecData;
extern TimeLineID ThisTimeLineID; /* current TLI */
! /*
! * Prior to 8.4, all activity during recovery was carried out by the Startup
! * process. This local variable continues to be used in many parts of the
! * code to indicate actions taken by RecoveryManagers. Other processes that
! * potentially perform work during recovery should check
! * IsRecoveryProcessingMode(); see the XLogCtl notes in xlog.c.
! */
! extern bool InRecovery;
! extern bool InArchiveRecovery;
extern XLogRecPtr XactLastRecEnd;
/* these variables are GUC parameters related to XLOG */
***************
*** 143,148 **** extern bool XLogArchiveMode;
--- 152,158 ----
extern char *XLogArchiveCommand;
extern int XLogArchiveTimeout;
extern bool log_checkpoints;
+ extern int maxStandbyDelay;
#define XLogArchivingActive() (XLogArchiveMode)
#define XLogArchiveCommandSet() (XLogArchiveCommand[0] != '\0')
***************
*** 166,171 **** extern bool XLOG_DEBUG;
--- 176,182 ----
/* These indicate the cause of a checkpoint request */
#define CHECKPOINT_CAUSE_XLOG 0x0010 /* XLOG consumption */
#define CHECKPOINT_CAUSE_TIME 0x0020 /* Elapsed time */
+ #define CHECKPOINT_RESTARTPOINT 0x0040 /* Restartpoint during recovery */
/* Checkpoint statistics */
typedef struct CheckpointStatsData
***************
*** 197,202 **** extern void XLogSetAsyncCommitLSN(XLogRecPtr record);
--- 208,216 ----
extern void xlog_redo(XLogRecPtr lsn, XLogRecord *record);
extern void xlog_desc(StringInfo buf, uint8 xl_info, char *rec);
+ extern bool IsRecoveryProcessingMode(void);
+ extern int GetLatestReplicationDelay(void);
+
extern void UpdateControlFile(void);
extern Size XLOGShmemSize(void);
extern void XLOGShmemInit(void);
*** src/include/access/xlog_internal.h
--- src/include/access/xlog_internal.h
***************
*** 17,22 ****
--- 17,23 ----
#define XLOG_INTERNAL_H
#include "access/xlog.h"
+ #include "catalog/pg_control.h"
#include "fmgr.h"
#include "pgtime.h"
#include "storage/block.h"
***************
*** 71,77 **** typedef struct XLogContRecord
/*
* Each page of XLOG file has a header like this:
*/
! #define XLOG_PAGE_MAGIC 0xD063 /* can be used as WAL version indicator */
typedef struct XLogPageHeaderData
{
--- 72,78 ----
/*
* Each page of XLOG file has a header like this:
*/
! #define XLOG_PAGE_MAGIC 0x5352 /* can be used as WAL version indicator */
typedef struct XLogPageHeaderData
{
***************
*** 245,250 **** extern const RmgrData RmgrTable[];
--- 246,254 ----
extern pg_time_t GetLastSegSwitchTime(void);
extern XLogRecPtr RequestXLogSwitch(void);
+ extern void CreateRestartPoint(const XLogRecPtr ReadPtr,
+ const CheckPoint *restartPoint, int flags);
+
/*
* These aren't in xlog.h because I'd rather not include fmgr.h there.
*/
***************
*** 255,259 **** extern Datum pg_current_xlog_location(PG_FUNCTION_ARGS);
--- 259,273 ----
extern Datum pg_current_xlog_insert_location(PG_FUNCTION_ARGS);
extern Datum pg_xlogfile_name_offset(PG_FUNCTION_ARGS);
extern Datum pg_xlogfile_name(PG_FUNCTION_ARGS);
+ extern Datum pg_recovery_continue(PG_FUNCTION_ARGS);
+ extern Datum pg_recovery_pause(PG_FUNCTION_ARGS);
+ extern Datum pg_recovery_pause_cleanup(PG_FUNCTION_ARGS);
+ extern Datum pg_recovery_pause_xid(PG_FUNCTION_ARGS);
+ extern Datum pg_recovery_pause_time(PG_FUNCTION_ARGS);
+ extern Datum pg_recovery_advance(PG_FUNCTION_ARGS);
+ extern Datum pg_recovery_stop(PG_FUNCTION_ARGS);
+ extern Datum pg_is_in_recovery(PG_FUNCTION_ARGS);
+ extern Datum pg_last_completed_xact_timestamp(PG_FUNCTION_ARGS);
+ extern Datum pg_last_completed_xid(PG_FUNCTION_ARGS);
#endif /* XLOG_INTERNAL_H */
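
The XLOG_PAGE_MAGIC bump above is what allows WAL written by an
incompatible server version to be rejected. A hedged sketch of the kind of
page-header validation this supports (the real check lives in xlog.c):

    XLogPageHeader hdr = (XLogPageHeader) page;

    if (hdr->xlp_magic != XLOG_PAGE_MAGIC)
        ereport(PANIC,
                (errmsg("unexpected WAL page magic %04X, expected %04X",
                        hdr->xlp_magic, XLOG_PAGE_MAGIC)));
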
*** src/include/access/xlogutils.h
--- src/include/access/xlogutils.h
***************
*** 26,33 **** extern void XLogTruncateRelation(RelFileNode rnode, ForkNumber forkNum,
BlockNumber nblocks);
extern Buffer XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init);
extern Buffer XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
! BlockNumber blkno, ReadBufferMode mode);
extern Relation CreateFakeRelcacheEntry(RelFileNode rnode);
extern void FreeFakeRelcacheEntry(Relation fakerel);
--- 26,34 ----
BlockNumber nblocks);
extern Buffer XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init);
+ extern Buffer XLogReadBufferForCleanup(RelFileNode rnode, BlockNumber blkno, bool init);
extern Buffer XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
! BlockNumber blkno, ReadBufferMode mode, int lockmode);
extern Relation CreateFakeRelcacheEntry(RelFileNode rnode);
extern void FreeFakeRelcacheEntry(Relation fakerel);
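
A sketch of how a redo routine might use the new cleanup variant (assumed
usage; rnode and blkno come from the WAL record being replayed). Taking the
cleanup lock during replay ensures that, under Hot Standby, tuples cannot
be removed from a page while a standby query still holds a pin on it:

    Buffer  buffer = XLogReadBufferForCleanup(rnode, blkno, false);

    if (BufferIsValid(buffer))
    {
        Page    page = (Page) BufferGetPage(buffer);

        /* ... remove dead tuples from page, as the record dictates ... */

        MarkBufferDirty(buffer);
        UnlockReleaseBuffer(buffer);
    }
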
*** src/include/catalog/pg_control.h
--- src/include/catalog/pg_control.h
***************
*** 21,27 ****
/* Version identifier for this pg_control format */
! #define PG_CONTROL_VERSION 843
/*
* Body of CheckPoint XLOG records. This is declared here because we keep
--- 21,28 ----
/* Version identifier for this pg_control format */
! #define PG_CONTROL_VERSION 847
! /* XXX change me */
/*
* Body of CheckPoint XLOG records. This is declared here because we keep
***************
*** 46,52 **** typedef struct CheckPoint
#define XLOG_NOOP 0x20
#define XLOG_NEXTOID 0x30
#define XLOG_SWITCH 0x40
!
/* System status indicator */
typedef enum DBState
--- 47,58 ----
#define XLOG_NOOP 0x20
#define XLOG_NEXTOID 0x30
#define XLOG_SWITCH 0x40
! /*
! * Prior to 8.4 we wrote a shutdown checkpoint when recovery completed.
! * Now we write an XLOG_RECOVERY_END record instead, which lets us
! * differentiate a genuine checkpoint-at-shutdown from the end of
! * recovery at startup.
! */
! #define XLOG_RECOVERY_END 0x50
/* System status indicator */
typedef enum DBState
***************
*** 101,107 **** typedef struct ControlFileData
--- 107,118 ----
CheckPoint checkPointCopy; /* copy of last check point record */
+ /*
+ * The next two fields sound very similar, yet both are distinct and
+ * necessary. See the comments in xlog.c for a full explanation that
+ * is not easily repeated here.
+ */
XLogRecPtr minRecoveryPoint; /* must replay xlog to here */
+ XLogRecPtr minSafeStartPoint; /* safe point after recovery crashes */
/*
* This data is used to check for hardware-architecture compatibility of
*** src/include/catalog/pg_proc.h
--- src/include/catalog/pg_proc.h
***************
*** 3230,3235 **** DESCR("xlog filename and byte offset, given an xlog location");
--- 3230,3257 ----
DATA(insert OID = 2851 ( pg_xlogfile_name PGNSP PGUID 12 1 0 0 f f f t f i 1 0 25 "25" _null_ _null_ _null_ _null_ pg_xlogfile_name _null_ _null_ _null_ ));
DESCR("xlog filename, given an xlog location");
+ DATA(insert OID = 3801 ( pg_recovery_continue PGNSP PGUID 12 1 0 0 f f f t f v 0 0 2278 "" _null_ _null_ _null_ _null_ pg_recovery_continue _null_ _null_ _null_ ));
+ DESCR("if recovery is paused, continue with recovery");
+ DATA(insert OID = 3802 ( pg_recovery_pause PGNSP PGUID 12 1 0 0 f f f t f v 0 0 2278 "" _null_ _null_ _null_ _null_ pg_recovery_pause _null_ _null_ _null_ ));
+ DESCR("pause recovery until recovery target reset");
+ DATA(insert OID = 3803 ( pg_recovery_pause_cleanup PGNSP PGUID 12 1 0 0 f f f t f v 0 0 2278 "" _null_ _null_ _null_ _null_ pg_recovery_pause_cleanup _null_ _null_ _null_ ));
+ DESCR("continue recovery until cleanup record arrives, then pause recovery");
+ DATA(insert OID = 3804 ( pg_recovery_pause_xid PGNSP PGUID 12 1 0 0 f f f t f v 1 0 2278 "23" _null_ _null_ _null_ _null_ pg_recovery_pause_xid _null_ _null_ _null_ ));
+ DESCR("continue recovery until specified xid completes, if ever seen, then pause recovery");
+ DATA(insert OID = 3805 ( pg_recovery_pause_time PGNSP PGUID 12 1 0 0 f f f t f v 1 0 2278 "1184" _null_ _null_ _null_ _null_ pg_recovery_pause_time _null_ _null_ _null_ ));
+ DESCR("continue recovery until a transaction with specified timestamp completes, if ever seen, then pause recovery");
+ DATA(insert OID = 3806 ( pg_recovery_advance PGNSP PGUID 12 1 0 0 f f f t f v 1 0 2278 "23" _null_ _null_ _null_ _null_ pg_recovery_advance _null_ _null_ _null_ ));
+ DESCR("continue recovery exactly specified number of records, then pause recovery");
+ DATA(insert OID = 3807 ( pg_recovery_stop PGNSP PGUID 12 1 0 0 f f f t f v 0 0 2278 "" _null_ _null_ _null_ _null_ pg_recovery_stop _null_ _null_ _null_ ));
+ DESCR("stop recovery immediately");
+
+ DATA(insert OID = 3810 ( pg_is_in_recovery PGNSP PGUID 12 1 0 0 f f f t f v 0 0 16 "" _null_ _null_ _null_ _null_ pg_is_in_recovery _null_ _null_ _null_ ));
+ DESCR("true if server is in recovery");
+ DATA(insert OID = 3811 ( pg_last_completed_xact_timestamp PGNSP PGUID 12 1 0 0 f f f t f v 0 0 1184 "" _null_ _null_ _null_ _null_ pg_last_completed_xact_timestamp _null_ _null_ _null_ ));
+ DESCR("timestamp of last commit or abort record that arrived during recovery, if any");
+ DATA(insert OID = 3812 ( pg_last_completed_xid PGNSP PGUID 12 1 0 0 f f f t f v 0 0 28 "" _null_ _null_ _null_ _null_ pg_last_completed_xid _null_ _null_ _null_ ));
+ DESCR("xid of last commit or abort record that arrived during recovery, if any");
+
DATA(insert OID = 2621 ( pg_reload_conf PGNSP PGUID 12 1 0 0 f f f t f v 0 0 16 "" _null_ _null_ _null_ _null_ pg_reload_conf _null_ _null_ _null_ ));
DESCR("reload configuration files");
DATA(insert OID = 2622 ( pg_rotate_logfile PGNSP PGUID 12 1 0 0 f f f t f v 0 0 16 "" _null_ _null_ _null_ _null_ pg_rotate_logfile _null_ _null_ _null_ ));
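
For illustration, the simplest of the new entries plausibly reduces to a
one-line fmgr wrapper over the test declared in xlog.h; a sketch only,
since the function bodies are not shown in these header hunks:

    Datum
    pg_is_in_recovery(PG_FUNCTION_ARGS)
    {
        PG_RETURN_BOOL(IsRecoveryProcessingMode());
    }
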
*** src/include/miscadmin.h
--- src/include/miscadmin.h
***************
*** 235,240 **** extern bool VacuumCostActive;
--- 235,246 ----
/* in tcop/postgres.c */
extern void check_stack_depth(void);
+ /* in tcop/utility.c */
+ extern void PreventCommandDuringRecovery(void);
+
+ /* in utils/misc/guc.c */
+ extern int trace_recovery_messages;
+ extern int trace_recovery(int trace_level);
/*****************************************************************************
* pdir.h -- *
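
The expected call pattern for trace_recovery(), inferred from its pairing
with the trace_recovery_messages GUC: recovery-related modules pass their
intended elog level through it, so the GUC can promote otherwise-suppressed
DEBUG output without changing log_min_messages globally. For example
(message text illustrative):

    elog(trace_recovery(DEBUG2), "updating recovery snapshot");
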
*** src/include/postmaster/bgwriter.h
--- src/include/postmaster/bgwriter.h
***************
*** 12,17 ****
--- 12,18 ----
#ifndef _BGWRITER_H
#define _BGWRITER_H
+ #include "catalog/pg_control.h"
#include "storage/block.h"
#include "storage/relfilenode.h"
***************
*** 25,30 **** extern double CheckPointCompletionTarget;
--- 26,36 ----
extern void BackgroundWriterMain(void);
extern void RequestCheckpoint(int flags);
+ extern void RequestRestartPoint(const XLogRecPtr ReadPtr, const CheckPoint *restartPoint, bool sendToBGWriter);
+ extern void RequestRestartPointCompletion(void);
+ extern XLogRecPtr GetRedoLocationForArchiveCheckpoint(void);
+ extern bool SetRedoLocationForArchiveCheckpoint(XLogRecPtr redo);
+
extern void CheckpointWriteDelay(int flags, double progress);
extern bool ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum,
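
A hedged sketch of the division of labour these declarations imply
(variable names hypothetical): on replaying a checkpoint record, the
startup process can hand the restartpoint to the bgwriter, so buffer
flushing proceeds concurrently with further WAL apply instead of stalling
it:

    /* in the startup process, after decoding a checkpoint record */
    RequestRestartPoint(ReadRecPtr, &checkPoint, true);

Presumably passing false for sendToBGWriter performs the restartpoint
inline, preserving the pre-8.4 behaviour when no bgwriter is running.
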
*** src/include/storage/bufmgr.h
--- src/include/storage/bufmgr.h
***************
*** 67,72 **** extern PGDLLIMPORT int32 *LocalRefCount;
--- 67,75 ----
#define BUFFER_LOCK_SHARE 1
#define BUFFER_LOCK_EXCLUSIVE 2
+ /* Not accepted by LockBuffer(), but used by the XLogReadBuffer family */
+ #define BUFFER_LOCK_CLEANUP 3
+
/*
* These routines are beaten on quite heavily, hence the macroization.
*/
***************
*** 197,202 **** extern bool ConditionalLockBuffer(Buffer buffer);
--- 200,209 ----
extern void LockBufferForCleanup(Buffer buffer);
extern bool ConditionalLockBufferForCleanup(Buffer buffer);
+ extern void StartCleanupDelayStats(void);
+ extern void EndCleanupDelayStats(void);
+ extern void ReportCleanupDelayStats(void);
+
extern void AbortBufferIO(void);
extern void BufmgrCommit(void);
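
One plausible way the new instrumentation brackets a wait; the call sites
are not shown in these hunks, so this is an assumption about intended use:

    StartCleanupDelayStats();
    LockBufferForCleanup(buffer);   /* may block behind standby queries */
    EndCleanupDelayStats();

    /* later, at a convenient reporting point */
    ReportCleanupDelayStats();
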
*** src/include/storage/pmsignal.h
--- src/include/storage/pmsignal.h
***************
*** 22,27 ****
--- 22,28 ----
*/
typedef enum
{
+ PMSIGNAL_RECOVERY_START, /* move to PM_RECOVERY state */
PMSIGNAL_PASSWORD_CHANGE, /* pg_auth file has changed */
PMSIGNAL_WAKEN_ARCHIVER, /* send a NOTIFY signal to xlog archiver */
PMSIGNAL_ROTATE_LOGFILE, /* send SIGUSR1 to syslogger to rotate logfile */
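
The new reason slots into the existing postmaster-signalling mechanism. A
sketch of the assumed caller, the startup process once recovery has
progressed far enough to allow connections:

    SendPostmasterSignal(PMSIGNAL_RECOVERY_START);
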
*** src/include/storage/proc.h
--- src/include/storage/proc.h
***************
*** 14,19 ****
--- 14,20 ----
#ifndef _PROC_H_
#define _PROC_H_
+ #include "access/xlog.h"
#include "storage/lock.h"
#include "storage/pg_sema.h"
***************
*** 93,98 **** struct PGPROC
--- 94,107 ----
uint8 vacuumFlags; /* vacuum-related flags, see above */
+ /*
+ * The lsn field exists to allow procs to be used during recovery for
+ * managing snapshot data for standby servers. Stamping each update
+ * with the WAL location that produced it disambiguates incoming
+ * information, so we always respect the latest state.
+ */
+ XLogRecPtr lsn; /* LSN of last record to update this recovery proc */
+
/* Info about LWLock the process is currently waiting for, if any. */
bool lwWaiting; /* true if waiting for an LW lock */
bool lwExclusive; /* true if waiting for exclusive access */
***************
*** 133,138 **** typedef struct PROC_HDR
--- 142,150 ----
PGPROC *autovacFreeProcs;
/* Current shared estimate of appropriate spins_per_delay value */
int spins_per_delay;
+ /* PGPROC for the startup process, which is not in the ProcArray */
+ PGPROC *startupProc;
+ int startupProcPid;
} PROC_HDR;
/*
***************
*** 157,164 **** extern int ProcGlobalSemas(void);
--- 169,180 ----
extern Size ProcGlobalShmemSize(void);
extern void InitProcGlobal(void);
extern void InitProcess(void);
+ extern PGPROC *InitRecoveryProcess(void);
+ extern void FreeRecoveryProcess(PGPROC *proc);
extern void InitProcessPhase2(void);
extern void InitAuxiliaryProcess(void);
+ extern void PublishStartupProcessInformation(void);
+
extern bool HaveNFreeProcs(int n);
extern void ProcReleaseLocks(bool isCommit);
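
A sketch of the disambiguation rule the new lsn field supports (not code
from the patch): state carried by a WAL record is applied to a recovery
proc only if it is newer than what the proc already holds, so out-of-order
processing cannot regress the state:

    if (XLByteLT(proc->lsn, lsn))
    {
        /* ... overwrite the proc's recovery snapshot state ... */
        proc->lsn = lsn;
    }
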
*** src/include/storage/procarray.h
--- src/include/storage/procarray.h
***************
*** 14,19 ****
--- 14,20 ----
#ifndef PROCARRAY_H
#define PROCARRAY_H
+ #include "access/xact.h"
#include "storage/lock.h"
#include "utils/snapshot.h"
***************
*** 23,31 **** extern void CreateSharedProcArray(void);
extern void ProcArrayAdd(PGPROC *proc);
extern void ProcArrayRemove(PGPROC *proc, TransactionId latestXid);
! extern void ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid);
extern void ProcArrayClearTransaction(PGPROC *proc);
extern Snapshot GetSnapshotData(Snapshot snapshot);
extern bool TransactionIdIsInProgress(TransactionId xid);
--- 24,40 ----
extern void ProcArrayAdd(PGPROC *proc);
extern void ProcArrayRemove(PGPROC *proc, TransactionId latestXid);
! extern void ProcArrayInitRecoveryEnvironment(void);
! extern void ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid,
! int nsubxids, TransactionId *subxids);
extern void ProcArrayClearTransaction(PGPROC *proc);
+ extern void ProcArrayClearRecoveryTransactions(void);
+ extern bool XidInRecoveryProcs(TransactionId xid);
+ extern void ProcArrayDisplay(int trace_level);
+ extern void ProcArrayUpdateRecoveryTransactions(XLogRecPtr lsn,
+ xl_xact_running_xacts *xlrec);
+ extern RunningTransactions GetRunningTransactionData(void);
extern Snapshot GetSnapshotData(Snapshot snapshot);
extern bool TransactionIdIsInProgress(TransactionId xid);
***************
*** 36,46 **** extern int GetTransactionsInCommit(TransactionId **xids_p);
extern bool HaveTransactionsInCommit(TransactionId *xids, int nxids);
extern PGPROC *BackendPidGetProc(int pid);
extern int BackendXidGetPid(TransactionId xid);
extern bool IsBackendPid(int pid);
! extern VirtualTransactionId *GetCurrentVirtualXIDs(TransactionId limitXmin,
! bool allDbs, int excludeVacuum);
extern int CountActiveBackends(void);
extern int CountDBBackends(Oid databaseid);
extern int CountUserBackends(Oid roleid);
--- 45,58 ----
extern bool HaveTransactionsInCommit(TransactionId *xids, int nxids);
extern PGPROC *BackendPidGetProc(int pid);
+ extern PGPROC *BackendXidGetProc(TransactionId xid);
extern int BackendXidGetPid(TransactionId xid);
extern bool IsBackendPid(int pid);
! extern VirtualTransactionId *GetCurrentVirtualXIDs(TransactionId limitXmin,
! Oid dbOid, int excludeVacuum);
! extern int VirtualTransactionIdGetPid(VirtualTransactionId vxid);
!
extern int CountActiveBackends(void);
extern int CountDBBackends(Oid databaseid);
extern int CountUserBackends(Oid roleid);
***************
*** 51,54 **** extern void XidCacheRemoveRunningXids(TransactionId xid,
--- 63,76 ----
int nxids, const TransactionId *xids,
TransactionId latestXid);
+ /* Primitives for UnobservedXids array handling for standby */
+ extern void UnobservedTransactionsAddXids(TransactionId firstXid,
+ TransactionId lastXid);
+ extern void UnobservedTransactionsRemoveXid(TransactionId xid,
+ bool missing_is_error);
+ extern void UnobservedTransactionsPruneXids(TransactionId limitXid);
+ extern void UnobservedTransactionsClearXids(void);
+ extern void UnobservedTransactionsDisplay(int trace_level);
+ extern bool XidInUnobservedTransactions(TransactionId xid);
+
#endif /* PROCARRAY_H */
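
A hedged sketch of the life cycle the UnobservedXids primitives suggest
(the exact range semantics of AddXids are an assumption): WAL may reference
an xid before any record for that xid has been replayed, so such xids are
tracked as unobserved and treated as still running when snapshots are
taken:

    /* replay observes an xid beyond the latest seen so far */
    if (TransactionIdFollows(xid, latestObservedXid))
    {
        /* assumed: adds the intervening xids; boundaries per the patch */
        UnobservedTransactionsAddXids(latestObservedXid, xid);
        latestObservedXid = xid;
    }

    /* a later commit/abort/assignment record resolves the xid */
    UnobservedTransactionsRemoveXid(xid, false);
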
*** src/include/storage/sinval.h
--- src/include/storage/sinval.h
***************
*** 89,94 **** extern void ReceiveSharedInvalidMessages(
--- 89,132 ----
void (*invalFunction) (SharedInvalidationMessage *msg),
void (*resetFunction) (void));
+ extern int xactGetCommittedInvalidationMessages(SharedInvalidationMessage **msgs,
+ bool *RelcacheInitFileInval);
+
+ /*
+ * Relation Rmgr (RM_RELATION_ID)
+ *
+ * The relation recovery manager exists to allow locks and certain kinds
+ * of invalidation messages to be passed across to a standby server.
+ */
+
+ extern void RelationReleaseRecoveryLocks(TransactionId xid);
+ extern void RelationClearRecoveryLocks(void);
+
+ /* Recovery handlers for the Relation Rmgr (RM_RELATION_ID) */
+ extern void relation_redo(XLogRecPtr lsn, XLogRecord *record);
+ extern void relation_desc(StringInfo buf, uint8 xl_info, char *rec);
+
+ /*
+ * XLOG message types
+ */
+ #define XLOG_RELATION_INVAL 0x00
+ #define XLOG_RELATION_LOCK 0x10
+
+ typedef struct xl_rel_inval
+ {
+ int nmsgs; /* number of shared inval msgs */
+ SharedInvalidationMessage msgs[1]; /* VARIABLE LENGTH ARRAY */
+ } xl_rel_inval;
+
+ #define MinSizeOfRelationInval offsetof(xl_rel_inval, msgs)
+
+ typedef struct xl_rel_lock
+ {
+ TransactionId xid; /* xid of the *parent* transaction. XXX why parent? */
+ Oid dbOid;
+ Oid relOid;
+ } xl_rel_lock;
+
/* signal handler for catchup events (SIGUSR1) */
extern void CatchupInterruptHandler(SIGNAL_ARGS);
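
A sketch of assembling an XLOG_RELATION_INVAL record from these
definitions (local variable names are hypothetical): the fixed-size header
is measured with MinSizeOfRelationInval, and the message array rides along
as a second rdata chunk:

    XLogRecData  rdata[2];
    xl_rel_inval xlrec;

    xlrec.nmsgs = nmsgs;

    rdata[0].data = (char *) &xlrec;
    rdata[0].len = MinSizeOfRelationInval;
    rdata[0].buffer = InvalidBuffer;
    rdata[0].next = &rdata[1];

    rdata[1].data = (char *) msgs;
    rdata[1].len = nmsgs * sizeof(SharedInvalidationMessage);
    rdata[1].buffer = InvalidBuffer;
    rdata[1].next = NULL;

    (void) XLogInsert(RM_RELATION_ID, XLOG_RELATION_INVAL, rdata);
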
*** src/include/storage/sinvaladt.h
--- src/include/storage/sinvaladt.h
***************
*** 29,35 ****
*/
extern Size SInvalShmemSize(void);
extern void CreateSharedInvalidationState(void);
! extern void SharedInvalBackendInit(void);
extern bool BackendIdIsActive(int backendID);
extern void SIInsertDataEntries(const SharedInvalidationMessage *data, int n);
--- 29,35 ----
*/
extern Size SInvalShmemSize(void);
extern void CreateSharedInvalidationState(void);
! extern void SharedInvalBackendInit(bool sendOnly);
extern bool BackendIdIsActive(int backendID);
extern void SIInsertDataEntries(const SharedInvalidationMessage *data, int n);
*** src/include/utils/flatfiles.h
--- src/include/utils/flatfiles.h
***************
*** 27,32 **** extern void AtEOSubXact_UpdateFlatFiles(bool isCommit,
--- 27,39 ----
SubTransactionId mySubid,
SubTransactionId parentSubid);
+ /*
+ * Called by RecordTransactionCommit to allow it to set xinfo flags
+ * on the commit record. Used for standby invalidation of flat files.
+ */
+ extern bool AtEOXact_Database_FlatFile_Update_Needed(void);
+ extern bool AtEOXact_Auth_FlatFile_Update_Needed(void);
+
extern Datum flatfile_update_trigger(PG_FUNCTION_ARGS);
extern void flatfile_twophase_postcommit(TransactionId xid, uint16 info,
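
A sketch of the commit-time usage the comment above describes; the xinfo
flag names here are invented for illustration, since the hunks do not show
them:

    uint32 xinfo = 0;

    if (AtEOXact_Database_FlatFile_Update_Needed())
        xinfo |= XACT_COMPLETION_UPDATE_DB_FILE;    /* hypothetical flag */
    if (AtEOXact_Auth_FlatFile_Update_Needed())
        xinfo |= XACT_COMPLETION_UPDATE_AUTH_FILE;  /* hypothetical flag */

    /* xinfo is then carried on the commit record for the standby to act on */
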
*** src/include/utils/inval.h
--- src/include/utils/inval.h
***************
*** 15,20 ****
--- 15,21 ----
#define INVAL_H
#include "access/htup.h"
+ #include "storage/lock.h"
#include "utils/relcache.h"
***************
*** 60,63 **** extern void CacheRegisterRelcacheCallback(RelcacheCallbackFunction func,
--- 61,67 ----
extern void inval_twophase_postcommit(TransactionId xid, uint16 info,
void *recdata, uint32 len);
+ extern void ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist,
+ char *reason);
+
#endif /* INVAL_H */
*** src/include/utils/snapshot.h
--- src/include/utils/snapshot.h
***************
*** 49,55 **** typedef struct SnapshotData
uint32 xcnt; /* # of xact ids in xip[] */
TransactionId *xip; /* array of xact IDs in progress */
/* note: all ids in xip[] satisfy xmin <= xip[i] < xmax */
! int32 subxcnt; /* # of xact ids in subxip[], -1 if overflow */
TransactionId *subxip; /* array of subxact IDs in progress */
/*
--- 49,65 ----
uint32 xcnt; /* # of xact ids in xip[] */
TransactionId *xip; /* array of xact IDs in progress */
/* note: all ids in xip[] satisfy xmin <= xip[i] < xmax */
!
! /*
! * Prior to 8.4 we represented an overflowed subxid cache with
! * subxcnt = -1. In 8.4+ the two concepts are separate, because when
! * the subxid cache has overflowed, checking xids against the snapshot
! * must consult *both* the subxid cache and subtrans; so we still need
! * the count even in the overflowed case. This also allows unobserved
! * xids to be placed into the snapshot even when it has overflowed,
! * and is a performance gain besides.
! */
! uint32 subxcnt; /* # of xact ids in subxip[] */
! bool suboverflowed; /* true if at least one subxid cache overflowed */
TransactionId *subxip; /* array of subxact IDs in progress */
/*
***************
*** 63,68 **** typedef struct SnapshotData
--- 73,147 ----
} SnapshotData;
/*
+ * Declarations for GetRunningTransactionData(). Similar to a Snapshot,
+ * but not quite: this has nothing at all to do with visibility on this
+ * server, so it is kept completely separate from snapmgr.c and snapmgr.h.
+ * This data is used to create the initial snapshot state on a standby
+ * server. We need considerably more information than a normal snapshot
+ * provides, hence the dedicated data structure. The data is written to
+ * WAL as a separate record immediately after each checkpoint, so that
+ * wherever a standby starts from, it will almost immediately see the
+ * data it needs to begin executing queries.
+ */
+ typedef struct RunningXact
+ {
+ /* Items matching PGPROC entries */
+ TransactionId xid; /* xact ID in progress */
+ int pid; /* backend's process id, or 0 */
+ Oid databaseId; /* OID of database this backend is using */
+ Oid roleId; /* OID of role using this backend */
+ uint8 vacuumFlags; /* vacuum-related flags, see above */
+
+ /* Items matching XidCache */
+ bool overflowed;
+ int nsubxids; /* # of subxact ids for this xact only */
+
+ /* Additional info */
+ uint32 subx_offset; /* array offset of start of subxip,
+ * zero if nsubxids == 0
+ */
+ } RunningXact;
+
+ typedef struct RunningXactsData
+ {
+ uint32 xcnt; /* # of xact ids in xrun[] */
+ uint32 subxcnt; /* total # of xact ids in subxip[] */
+ TransactionId latestRunningXid; /* Initial setting of LatestObservedXid */
+ TransactionId latestCompletedXid;
+
+ RunningXact *xrun; /* array of RunningXact structs */
+
+ /*
+ * subxip is held as a single contiguous array, so no space is wasted,
+ * plus it helps the data fit into one XLogRecord. We keep track of
+ * which subxids go with each top-level xid by storing the start
+ * offset in each RunningXact struct.
+ */
+ TransactionId *subxip; /* array of subxact IDs in progress */
+
+ } RunningXactsData;
+
+ typedef RunningXactsData *RunningTransactions;
+
+ /*
+ * When we write running xact data to WAL, we use this structure.
+ */
+ typedef struct xl_xact_running_xacts
+ {
+ int xcnt; /* # of xact ids in xrun[] */
+ int subxcnt; /* # of xact ids in subxip[] */
+ TransactionId latestRunningXid; /* Initial setting of LatestObservedXid */
+ TransactionId latestCompletedXid;
+
+ /* Array of RunningXact(s) */
+ RunningXact xrun[1]; /* VARIABLE LENGTH ARRAY */
+
+ /* ARRAY OF RUNNING SUBTRANSACTION XIDs FOLLOWS */
+ } xl_xact_running_xacts;
+
+ #define MinSizeOfXactRunningXacts offsetof(xl_xact_running_xacts, xrun)
+
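
Because subxip is one contiguous array, finding the subxids of a
particular running transaction is just an offset lookup. A minimal sketch,
assuming the subxid array begins immediately after the variable-length
xrun[] array (the layout implied by the comments above):

    TransactionId *subxip_base = (TransactionId *) &xlrec->xrun[xlrec->xcnt];
    RunningXact   *rx = &xlrec->xrun[i];

    if (rx->nsubxids > 0)
    {
        TransactionId *subxids = &subxip_base[rx->subx_offset];

        /* subxids[0 .. rx->nsubxids - 1] belong to top-level xid rx->xid */
    }
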
+ /*
* Result codes for HeapTupleSatisfiesUpdate. This should really be in
* tqual.h, but we want to avoid including that file elsewhere.
*/