*** doc/src/sgml/config.sgml --- doc/src/sgml/config.sgml *************** *** 5335,5340 **** plruby.use_strict = true # generates error: unknown class name --- 5335,5366 ---- + + trace_recovery_messages (string) + + trace_recovery_messages configuration parameter + + + + Controls which message levels are written to the server log + for system modules needed for recovery processing. This allows + the user to override the normal setting of log_min_messages, + but only for specific messages. This is intended for use in + debugging Hot Standby. + Valid values are DEBUG5, DEBUG4, + DEBUG3, DEBUG2, DEBUG1, + INFO, NOTICE, WARNING, + ERROR, LOG, FATAL, and + PANIC. Each level includes all the levels that + follow it. The later the level, the fewer messages are sent + to the log. The default is WARNING. Note that + LOG has a different rank here than in + client_min_messages. + This parameter can only be set in the postgresql.conf file. + + + + zero_damaged_pages (boolean) *** doc/src/sgml/func.sgml --- doc/src/sgml/func.sgml *************** *** 12893,12898 **** postgres=# select * from pg_xlogfile_name_offset(pg_stop_backup()); --- 12893,13089 ---- . + + pg_is_in_recovery + + + pg_last_completed_xact_timestamp + + + pg_last_completed_xid + + + pg_recovery_pause + + + pg_recovery_continue + + + pg_recovery_pause_cleanup + + + pg_recovery_pause_xid + + + pg_recovery_pause_time + + + pg_recovery_pause_advance + + + pg_recovery_stop + + + + The functions shown in assist in archive recovery. + Except for the first three functions, these are restricted to superusers. + All of these functions can only be executed during recovery. + + + + Recovery Control Functions + + + Name Return Type Description + + + + + + + pg_is_in_recovery() + + bool + True if recovery is still in progress. + + + + pg_last_completed_xact_timestamp() + + timestamp with time zone + Returns the original completion timestamp with timezone of the + last completed transaction in the current recovery. + + + + + pg_last_completed_xid() + + integer + Returns the transaction id (32-bit) of the last completed transaction + in the current recovery. Later-numbered transaction ids may already have + completed. This is unrelated to transactions on the source server. + + + + + + pg_recovery_pause() + + void + Pause recovery processing, unconditionally. + + + + pg_recovery_continue() + + void + If recovery is paused, continue processing. + + + + pg_recovery_stop() + + void + End recovery and begin normal processing. + + + + pg_recovery_pause_xid() + + void + Continue recovery until the specified xid completes, if it is ever + seen, then pause recovery. + + + + + pg_recovery_pause_time() + + void + Continue recovery until a transaction with the specified timestamp + completes, if one is ever seen, then pause recovery. + + + + + pg_recovery_pause_cleanup() + + void + Continue recovery until the next cleanup record, then pause. + + + + pg_recovery_pause_advance() + + void + Advance recovery by the specified number of records, then pause. + + +
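+ + + As an illustration only (this example is not part of the patch itself), a + client can poll these functions to track a standby's progress. The libpq + sketch below assumes a reachable server and uses a placeholder connection + string: + + #include <stdio.h> + #include <string.h> + #include "libpq-fe.h" + + int + main(void) + { + PGconn *conn = PQconnectdb("dbname=postgres"); /* placeholder */ + PGresult *res; + + if (PQstatus(conn) != CONNECTION_OK) + { + fprintf(stderr, "connection failed: %s", PQerrorMessage(conn)); + PQfinish(conn); + return 1; + } + + res = PQexec(conn, "SELECT pg_is_in_recovery()"); + if (PQresultStatus(res) == PGRES_TUPLES_OK && + strcmp(PQgetvalue(res, 0, 0), "t") == 0) + { + PQclear(res); + /* these functions may only be executed during recovery */ + res = PQexec(conn, "SELECT pg_last_completed_xid()"); + if (PQresultStatus(res) == PGRES_TUPLES_OK) + printf("last completed xid: %s\n", PQgetvalue(res, 0, 0)); + } + + PQclear(res); + PQfinish(conn); + return 0; + } +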
+ + + pg_recovery_pause and pg_recovery_continue allow + a superuser to control the progress of recovery on the database server. + While recovery is paused, queries can be executed to determine how far + forwards recovery should progress. Recovery can never go backwards, + because replay overwrites previous versions of the data. If the superuser + wishes recovery to complete and normal processing mode to start, execute + pg_recovery_stop. + + + + Variations of the pause function exist, mainly to allow PITR to dynamically + control where it should progress to. pg_recovery_pause_xid and + pg_recovery_pause_time allow the specification of a trial + recovery target, similarly to . + Recovery will then progress to the specified point and then pause, rather + than stopping permanently, allowing assessment of whether this is the + desired stopping point for recovery. + + + + pg_recovery_pause_cleanup allows recovery to progress only + as far as the next cleanup record. This is useful where a longer-running + query needs to access the database in a consistent state and it is + more important that the query executes than it is that we keep processing + new WAL records. This can be used as shown: + + select pg_recovery_pause_cleanup(); + + -- run very important query + select ... + from big_table1 join big_table2 + on ... + where ... + + select pg_recovery_continue(); + + + + + pg_recovery_pause_advance allows recovery to progress record by + record, for very careful analysis or debugging. The step size can be one or + more records. If recovery is not yet paused then pg_recovery_pause_advance + will process the specified number of records and then pause. If recovery + is already paused, recovery will continue for another N records before + pausing again. + + + + If you pause recovery while the server is waiting for a WAL file when + operating in standby mode, the pause will apparently have no effect until the + file arrives. Once the server begins processing WAL records again it + will notice the pause request and will act upon it. This is not a bug. + + + + Pausing recovery will also prevent restartpoints from starting, since they + are triggered by events in the WAL stream. In all other ways processing + will continue; for example, the background writer will continue to clean + shared_buffers while recovery is paused. + + The functions shown in calculate the actual disk space usage of database objects. *** src/backend/access/heap/heapam.c --- src/backend/access/heap/heapam.c *************** *** 3814,3832 **** heap_restrpos(HeapScanDesc scan) } /* * Perform XLogInsert for a heap-clean operation. Caller must already * have modified the buffer and marked it dirty. * * Note: prior to Postgres 8.3, the entries in the nowunused[] array were * zero-based tuple indexes. Now they are one-based like other uses * of OffsetNumber. */ XLogRecPtr log_heap_clean(Relation reln, Buffer buffer, OffsetNumber *redirected, int nredirected, OffsetNumber *nowdead, int ndead, OffsetNumber *nowunused, int nunused, ! bool redirect_move) { xl_heap_clean xlrec; uint8 info; --- 3814,3891 ---- } /* + * Update the latestRemovedXid for the current VACUUM. This gets called + * only rarely, since we probably already removed rows earlier. + * See comments for vacuum_log_cleanup_info().
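+ * + * As a sketch only (the loop is schematic; the function names are the + * ones introduced by this patch), a VACUUM-style caller accumulates the + * value across the dead tuples it collects and then emits a single WAL + * message: + * + * TransactionId latestRemovedXid = InvalidTransactionId; + * + * ... for each dead tuple header htup on the page ... + * HeapTupleHeaderAdvanceLatestRemovedXid(htup, &latestRemovedXid); + * + * (void) log_heap_cleanup_info(rel->rd_node, latestRemovedXid);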
+ */ + void + HeapTupleHeaderAdvanceLatestRemovedXid(HeapTupleHeader tuple, + TransactionId *latestRemovedXid) + { + TransactionId xmin = HeapTupleHeaderGetXmin(tuple); + TransactionId xmax = HeapTupleHeaderGetXmax(tuple); + TransactionId xvac = HeapTupleHeaderGetXvac(tuple); + + if (tuple->t_infomask & HEAP_MOVED_OFF || + tuple->t_infomask & HEAP_MOVED_IN) + { + if (TransactionIdPrecedes(*latestRemovedXid, xvac)) + *latestRemovedXid = xvac; + } + + if (TransactionIdPrecedes(*latestRemovedXid, xmax)) + *latestRemovedXid = xmax; + + if (TransactionIdPrecedes(*latestRemovedXid, xmin)) + *latestRemovedXid = xmin; + + Assert(TransactionIdIsValid(*latestRemovedXid)); + } + + /* + * Perform XLogInsert to register a heap cleanup info message. These + * messages are sent once per VACUUM and are required because + * of the phasing of removal operations during a lazy VACUUM. + * See comments for vacuum_log_cleanup_info(). + */ + XLogRecPtr + log_heap_cleanup_info(RelFileNode rnode, TransactionId latestRemovedXid) + { + xl_heap_cleanup_info xlrec; + XLogRecPtr recptr; + XLogRecData rdata; + + xlrec.node = rnode; + xlrec.latestRemovedXid = latestRemovedXid; + + rdata.data = (char *) &xlrec; + rdata.len = SizeOfHeapCleanupInfo; + rdata.buffer = InvalidBuffer; + rdata.next = NULL; + + recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_CLEANUP_INFO, &rdata); + + return recptr; + } + + /* * Perform XLogInsert for a heap-clean operation. Caller must already * have modified the buffer and marked it dirty. * * Note: prior to Postgres 8.3, the entries in the nowunused[] array were * zero-based tuple indexes. Now they are one-based like other uses * of OffsetNumber. + * + * For 8.4 we also include the latestRemovedXid, which allows recovery + * processing to abort standby queries whose results could otherwise be + * changed by applying these changes. */ XLogRecPtr log_heap_clean(Relation reln, Buffer buffer, OffsetNumber *redirected, int nredirected, OffsetNumber *nowdead, int ndead, OffsetNumber *nowunused, int nunused, ! TransactionId latestRemovedXid, bool redirect_move) { xl_heap_clean xlrec; uint8 info; *************** *** 3838,3843 **** log_heap_clean(Relation reln, Buffer buffer, --- 3897,3903 ---- xlrec.node = reln->rd_node; xlrec.block = BufferGetBlockNumber(buffer); + xlrec.latestRemovedXid = latestRemovedXid; xlrec.nredirected = nredirected; xlrec.ndead = ndead; *************** *** 4129,4135 **** heap_xlog_clean(XLogRecPtr lsn, XLogRecord *record, bool clean_move) if (record->xl_info & XLR_BKP_BLOCK_1) return; ! buffer = XLogReadBuffer(xlrec->node, xlrec->block, false); if (!BufferIsValid(buffer)) return; page = (Page) BufferGetPage(buffer); --- 4189,4195 ---- if (record->xl_info & XLR_BKP_BLOCK_1) return; ! buffer = XLogReadBufferForCleanup(xlrec->node, xlrec->block, false); if (!BufferIsValid(buffer)) return; page = (Page) BufferGetPage(buffer); *************** *** 4189,4195 **** heap_xlog_freeze(XLogRecPtr lsn, XLogRecord *record) if (record->xl_info & XLR_BKP_BLOCK_1) return; ! buffer = XLogReadBuffer(xlrec->node, xlrec->block, false); if (!BufferIsValid(buffer)) return; page = (Page) BufferGetPage(buffer); --- 4249,4255 ---- if (record->xl_info & XLR_BKP_BLOCK_1) return; !
buffer = XLogReadBufferForCleanup(xlrec->node, xlrec->block, false); if (!BufferIsValid(buffer)) return; page = (Page) BufferGetPage(buffer); *************** *** 4824,4829 **** heap2_redo(XLogRecPtr lsn, XLogRecord *record) --- 4884,4897 ---- case XLOG_HEAP2_CLEAN_MOVE: heap_xlog_clean(lsn, record, true); break; + case XLOG_HEAP2_CLEANUP_INFO: + /* + * Actual operation is a no-op. Record type exists to + * provide information to recovery record pre-processing, + * so the actions for this record have already been taken. + * See ResolveRedoVisibilityConflicts() + */ + break; default: elog(PANIC, "heap2_redo: unknown op code %u", info); } *************** *** 4953,4969 **** heap2_desc(StringInfo buf, uint8 xl_info, char *rec) { xl_heap_clean *xlrec = (xl_heap_clean *) rec; ! appendStringInfo(buf, "clean: rel %u/%u/%u; blk %u", xlrec->node.spcNode, xlrec->node.dbNode, ! xlrec->node.relNode, xlrec->block); } else if (info == XLOG_HEAP2_CLEAN_MOVE) { xl_heap_clean *xlrec = (xl_heap_clean *) rec; ! appendStringInfo(buf, "clean_move: rel %u/%u/%u; blk %u", xlrec->node.spcNode, xlrec->node.dbNode, ! xlrec->node.relNode, xlrec->block); } else appendStringInfo(buf, "UNKNOWN"); --- 5021,5046 ---- { xl_heap_clean *xlrec = (xl_heap_clean *) rec; ! appendStringInfo(buf, "clean: rel %u/%u/%u; blk %u remxid %u", xlrec->node.spcNode, xlrec->node.dbNode, ! xlrec->node.relNode, xlrec->block, ! xlrec->latestRemovedXid); } else if (info == XLOG_HEAP2_CLEAN_MOVE) { xl_heap_clean *xlrec = (xl_heap_clean *) rec; ! appendStringInfo(buf, "clean_move: rel %u/%u/%u; blk %u remxid %u", xlrec->node.spcNode, xlrec->node.dbNode, ! xlrec->node.relNode, xlrec->block, ! xlrec->latestRemovedXid); ! } ! else if (info == XLOG_HEAP2_CLEANUP_INFO) ! { ! xl_heap_cleanup_info *xlrec = (xl_heap_cleanup_info *) rec; ! ! appendStringInfo(buf, "cleanup info: remxid %u", ! xlrec->latestRemovedXid); } else appendStringInfo(buf, "UNKNOWN"); *** src/backend/access/heap/pruneheap.c --- src/backend/access/heap/pruneheap.c *************** *** 30,35 **** --- 30,36 ---- typedef struct { TransactionId new_prune_xid; /* new prune hint value for page */ + TransactionId latestRemovedXid; /* latest xid to be removed by this prune */ int nredirected; /* numbers of entries in arrays below */ int ndead; int nunused; *************** *** 85,90 **** heap_page_prune_opt(Relation relation, Buffer buffer, TransactionId OldestXmin) --- 86,99 ---- return; /* + * We can't write WAL in recovery mode, so there's no point trying to + * clean the page. The master will likely issue a cleaning WAL record + * soon anyway, so this is no particular loss. + */ + if (IsRecoveryProcessingMode()) + return; + + /* * We prune when a previous UPDATE failed to find enough space on the page * for a new tuple version, or when free space falls below the relation's * fill-factor target (but not less than 10%). *************** *** 176,181 **** heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin, --- 185,191 ---- * Also initialize the rest of our working state. */ prstate.new_prune_xid = InvalidTransactionId; + prstate.latestRemovedXid = InvalidTransactionId; prstate.nredirected = prstate.ndead = prstate.nunused = 0; memset(prstate.marked, 0, sizeof(prstate.marked)); *************** *** 258,264 **** heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin, prstate.redirected, prstate.nredirected, prstate.nowdead, prstate.ndead, prstate.nowunused, prstate.nunused, ! 
redirect_move); PageSetLSN(BufferGetPage(buffer), recptr); PageSetTLI(BufferGetPage(buffer), ThisTimeLineID); --- 268,274 ---- prstate.redirected, prstate.nredirected, prstate.nowdead, prstate.ndead, prstate.nowunused, prstate.nunused, ! prstate.latestRemovedXid, redirect_move); PageSetLSN(BufferGetPage(buffer), recptr); PageSetTLI(BufferGetPage(buffer), ThisTimeLineID); *************** *** 396,401 **** heap_prune_chain(Relation relation, Buffer buffer, OffsetNumber rootoffnum, --- 406,413 ---- == HEAPTUPLE_DEAD && !HeapTupleHeaderIsHotUpdated(htup)) { heap_prune_record_unused(prstate, rootoffnum); + HeapTupleHeaderAdvanceLatestRemovedXid(htup, + &prstate->latestRemovedXid); ndeleted++; } *************** *** 521,527 **** heap_prune_chain(Relation relation, Buffer buffer, OffsetNumber rootoffnum, --- 533,543 ---- * find another DEAD tuple is a fairly unusual corner case.) */ if (tupdead) + { latestdead = offnum; + HeapTupleHeaderAdvanceLatestRemovedXid(htup, + &prstate->latestRemovedXid); + } else if (!recent_dead) break; *** src/backend/access/index/genam.c --- src/backend/access/index/genam.c *************** *** 89,96 **** RelationGetIndexScan(Relation indexRelation, else scan->keyData = NULL; scan->kill_prior_tuple = false; ! scan->ignore_killed_tuples = true; /* default setting */ scan->opaque = NULL; --- 89,104 ---- else scan->keyData = NULL; + /* + * During recovery we ignore killed tuples and don't bother to kill them + * either. We do this because the xmin on the primary node could easily + * be later than the xmin on the standby node, so that what the primary + * thinks is killed is supposed to be visible on standby. So for correct + * MVCC for queries during recovery we must ignore these hints and check + * all tuples. + */ scan->kill_prior_tuple = false; ! scan->ignore_killed_tuples = IsRecoveryProcessingMode(); /* default setting */ scan->opaque = NULL; *** src/backend/access/index/indexam.c --- src/backend/access/index/indexam.c *************** *** 455,463 **** index_getnext(IndexScanDesc scan, ScanDirection direction) /* * If we scanned a whole HOT chain and found only dead tuples, ! * tell index AM to kill its entry for that TID. */ ! scan->kill_prior_tuple = scan->xs_hot_dead; /* * The AM's gettuple proc finds the next index entry matching the --- 455,466 ---- /* * If we scanned a whole HOT chain and found only dead tuples, ! * tell index AM to kill its entry for that TID. We do not do ! * this when in recovery because it may violate MVCC to do so. ! * see comments in RelationGetIndexScan(). */ ! if (!IsRecoveryProcessingMode()) ! scan->kill_prior_tuple = scan->xs_hot_dead; /* * The AM's gettuple proc finds the next index entry matching the *** src/backend/access/nbtree/nbtinsert.c --- src/backend/access/nbtree/nbtinsert.c *************** *** 1924,1930 **** _bt_vacuum_one_page(Relation rel, Buffer buffer) } if (ndeletable > 0) ! _bt_delitems(rel, buffer, deletable, ndeletable); /* * Note: if we didn't find any LP_DEAD items, then the page's --- 1924,1930 ---- } if (ndeletable > 0) ! _bt_delitems(rel, buffer, deletable, ndeletable, false, 0); /* * Note: if we didn't find any LP_DEAD items, then the page's *** src/backend/access/nbtree/nbtpage.c --- src/backend/access/nbtree/nbtpage.c *************** *** 652,658 **** _bt_page_recyclable(Page page) */ void _bt_delitems(Relation rel, Buffer buf, ! OffsetNumber *itemnos, int nitems) { Page page = BufferGetPage(buf); BTPageOpaque opaque; --- 652,659 ---- */ void _bt_delitems(Relation rel, Buffer buf, ! 
OffsetNumber *itemnos, int nitems, bool isVacuum, ! BlockNumber lastBlockVacuumed) { Page page = BufferGetPage(buf); BTPageOpaque opaque; *************** *** 684,698 **** _bt_delitems(Relation rel, Buffer buf, /* XLOG stuff */ if (!rel->rd_istemp) { - xl_btree_delete xlrec; XLogRecPtr recptr; XLogRecData rdata[2]; ! xlrec.node = rel->rd_node; ! xlrec.block = BufferGetBlockNumber(buf); - rdata[0].data = (char *) &xlrec; - rdata[0].len = SizeOfBtreeDelete; rdata[0].buffer = InvalidBuffer; rdata[0].next = &(rdata[1]); --- 685,721 ---- /* XLOG stuff */ if (!rel->rd_istemp) { XLogRecPtr recptr; XLogRecData rdata[2]; ! /* We don't need both, but it simplies the code to have both here */ ! xl_btree_delete xlrec_delete; ! xl_btree_vacuum xlrec_vacuum; ! ! if (isVacuum) ! { ! xlrec_vacuum.node = rel->rd_node; ! xlrec_vacuum.block = BufferGetBlockNumber(buf); ! ! xlrec_vacuum.lastBlockVacuumed = lastBlockVacuumed; ! rdata[0].data = (char *) &xlrec_vacuum; ! rdata[0].len = SizeOfBtreeVacuum; ! } ! else ! { ! xlrec_delete.node = rel->rd_node; ! xlrec_delete.block = BufferGetBlockNumber(buf); ! ! /* ! * We would like to set an accurate latestRemovedXid, but there ! * is no easy way of obtaining a useful value. So we use the ! * probably far too conservative value of RecentGlobalXmin instead. ! */ ! xlrec_delete.latestRemovedXid = RecentGlobalXmin; ! rdata[0].data = (char *) &xlrec_delete; ! rdata[0].len = SizeOfBtreeDelete; ! } rdata[0].buffer = InvalidBuffer; rdata[0].next = &(rdata[1]); *************** *** 715,721 **** _bt_delitems(Relation rel, Buffer buf, rdata[1].buffer_std = true; rdata[1].next = NULL; ! recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE, rdata); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); --- 738,747 ---- rdata[1].buffer_std = true; rdata[1].next = NULL; ! if (isVacuum) ! recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM, rdata); ! else ! recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE, rdata); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); *** src/backend/access/nbtree/nbtree.c --- src/backend/access/nbtree/nbtree.c *************** *** 58,64 **** typedef struct IndexBulkDeleteCallback callback; void *callback_state; BTCycleId cycleid; ! BlockNumber lastUsedPage; BlockNumber totFreePages; /* true total # of free pages */ MemoryContext pagedelcontext; } BTVacState; --- 58,65 ---- IndexBulkDeleteCallback callback; void *callback_state; BTCycleId cycleid; ! BlockNumber lastBlockVacuumed; /* last blkno reached by Vacuum scan */ ! BlockNumber lastUsedPage; /* blkno of last page that is in use */ BlockNumber totFreePages; /* true total # of free pages */ MemoryContext pagedelcontext; } BTVacState; *************** *** 626,631 **** btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, --- 627,633 ---- vstate.callback = callback; vstate.callback_state = callback_state; vstate.cycleid = cycleid; + vstate.lastBlockVacuumed = BTREE_METAPAGE; /* Initialise at first block */ vstate.lastUsedPage = BTREE_METAPAGE; vstate.totFreePages = 0; *************** *** 855,861 **** restart: */ if (ndeletable > 0) { ! _bt_delitems(rel, buf, deletable, ndeletable); stats->tuples_removed += ndeletable; /* must recompute maxoff */ maxoff = PageGetMaxOffsetNumber(page); --- 857,875 ---- */ if (ndeletable > 0) { ! BlockNumber lastBlockVacuumed = BufferGetBlockNumber(buf); ! ! _bt_delitems(rel, buf, deletable, ndeletable, true, vstate->lastBlockVacuumed); ! ! /* ! * Keep track of the block number of the lastBlockVacuumed, so ! 
* we can scan those blocks as well during WAL replay. This then ! * provides concurrency protection and allows btrees to be used ! * while in recovery. ! */ ! if (lastBlockVacuumed > vstate->lastBlockVacuumed) ! vstate->lastBlockVacuumed = lastBlockVacuumed; ! stats->tuples_removed += ndeletable; /* must recompute maxoff */ maxoff = PageGetMaxOffsetNumber(page); *** src/backend/access/nbtree/nbtxlog.c --- src/backend/access/nbtree/nbtxlog.c *************** *** 459,464 **** btree_xlog_split(bool onleft, bool isroot, --- 459,534 ---- } static void + btree_xlog_vacuum(XLogRecPtr lsn, XLogRecord *record) + { + xl_btree_vacuum *xlrec; + Buffer buffer; + Page page; + BTPageOpaque opaque; + + if (record->xl_info & XLR_BKP_BLOCK_1) + return; + + xlrec = (xl_btree_vacuum *) XLogRecGetData(record); + + /* + * We need to ensure every block is pinned between the + * lastBlockVacuumed and the current block, if there are any. + * This ensures that every block in the index is touched during + * VACUUM as required to ensure scans work correctly. + */ + if ((xlrec->lastBlockVacuumed + 1) != xlrec->block) + { + BlockNumber blkno = xlrec->lastBlockVacuumed + 1; + + for (; blkno < xlrec->block; blkno++) + { + buffer = XLogReadBufferForCleanup(xlrec->node, blkno, false); + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); + } + } + + /* + * We need to take a cleanup lock to apply these changes. + * See nbtree/README for details. + */ + buffer = XLogReadBufferForCleanup(xlrec->node, xlrec->block, false); + if (!BufferIsValid(buffer)) + return; + page = (Page) BufferGetPage(buffer); + + if (XLByteLE(lsn, PageGetLSN(page))) + { + UnlockReleaseBuffer(buffer); + return; + } + + if (record->xl_len > SizeOfBtreeVacuum) + { + OffsetNumber *unused; + OffsetNumber *unend; + + unused = (OffsetNumber *) ((char *) xlrec + SizeOfBtreeVacuum); + unend = (OffsetNumber *) ((char *) xlrec + record->xl_len); + + PageIndexMultiDelete(page, unused, unend - unused); + } + + /* + * Mark the page as not containing any LP_DEAD items --- see comments in + * _bt_delitems(). + */ + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + opaque->btpo_flags &= ~BTP_HAS_GARBAGE; + + PageSetLSN(page, lsn); + PageSetTLI(page, ThisTimeLineID); + MarkBufferDirty(buffer); + UnlockReleaseBuffer(buffer); + } + + static void btree_xlog_delete(XLogRecPtr lsn, XLogRecord *record) { xl_btree_delete *xlrec; *************** *** 470,475 **** btree_xlog_delete(XLogRecPtr lsn, XLogRecord *record) --- 540,550 ---- return; xlrec = (xl_btree_delete *) XLogRecGetData(record); + + /* + * We don't need to take a cleanup lock to apply these changes. + * See nbtree/README for details. 
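+ * + * For contrast, in sketch form (standard buffer-manager semantics + * assumed), the two redo routines differ only in how the buffer is + * grabbed: + * + * vacuum: buf = XLogReadBufferForCleanup(node, blk, false); + * -- waits until a cleanup lock (sole pin) can be taken + * delete: buf = XLogReadBuffer(node, blk, false); + * -- an ordinary exclusive lock suffices; conflicts with + * standby queries are handled via latestRemovedXid instead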
+ */ buffer = XLogReadBuffer(xlrec->node, xlrec->block, false); if (!BufferIsValid(buffer)) return; *************** *** 737,742 **** btree_redo(XLogRecPtr lsn, XLogRecord *record) --- 812,819 ---- case XLOG_BTREE_SPLIT_R_ROOT: btree_xlog_split(false, true, lsn, record); break; + case XLOG_BTREE_VACUUM: + btree_xlog_vacuum(lsn, record); + break; case XLOG_BTREE_DELETE: btree_xlog_delete(lsn, record); break; *************** *** 753,758 **** btree_redo(XLogRecPtr lsn, XLogRecord *record) --- 830,899 ---- } } + bool + btree_is_cleanup_record(uint8 info) + { + switch (info) + { + case XLOG_BTREE_VACUUM: + case XLOG_BTREE_DELETE: + return true; + + case XLOG_BTREE_INSERT_LEAF: + case XLOG_BTREE_INSERT_UPPER: + case XLOG_BTREE_INSERT_META: + case XLOG_BTREE_SPLIT_L: + case XLOG_BTREE_SPLIT_R: + case XLOG_BTREE_SPLIT_L_ROOT: + case XLOG_BTREE_SPLIT_R_ROOT: + case XLOG_BTREE_DELETE_PAGE: + case XLOG_BTREE_DELETE_PAGE_META: + case XLOG_BTREE_DELETE_PAGE_HALF: + case XLOG_BTREE_NEWROOT: + return false; + + default: + elog(PANIC, "btree_is_cleanup_record: unknown op code %u", info); + } + + /* never reached */ + return false; + } + + bool + btree_needs_cleanup_lock(uint8 info) + { + switch (info) + { + case XLOG_BTREE_VACUUM: + return true; + + case XLOG_BTREE_INSERT_LEAF: + case XLOG_BTREE_INSERT_UPPER: + case XLOG_BTREE_INSERT_META: + case XLOG_BTREE_SPLIT_L: + case XLOG_BTREE_SPLIT_R: + case XLOG_BTREE_SPLIT_L_ROOT: + case XLOG_BTREE_SPLIT_R_ROOT: + case XLOG_BTREE_DELETE: + case XLOG_BTREE_DELETE_PAGE: + case XLOG_BTREE_DELETE_PAGE_META: + case XLOG_BTREE_DELETE_PAGE_HALF: + case XLOG_BTREE_NEWROOT: + return false; + + default: + elog(PANIC, "btree_needs_cleanup_lock: unknown op code %u", info); + } + + /* never reached */ + return false; + } + static void out_target(StringInfo buf, xl_btreetid *target) { *************** *** 841,853 **** btree_desc(StringInfo buf, uint8 xl_info, char *rec) xlrec->level, xlrec->firstright); break; } case XLOG_BTREE_DELETE: { xl_btree_delete *xlrec = (xl_btree_delete *) rec; ! appendStringInfo(buf, "delete: rel %u/%u/%u; blk %u", xlrec->node.spcNode, xlrec->node.dbNode, ! xlrec->node.relNode, xlrec->block); break; } case XLOG_BTREE_DELETE_PAGE: --- 982,1005 ---- xlrec->level, xlrec->firstright); break; } + case XLOG_BTREE_VACUUM: + { + xl_btree_vacuum *xlrec = (xl_btree_vacuum *) rec; + + appendStringInfo(buf, "vacuum: rel %u/%u/%u; blk %u, lastBlockVacuumed %u", + xlrec->node.spcNode, xlrec->node.dbNode, + xlrec->node.relNode, xlrec->block, + xlrec->lastBlockVacuumed); + break; + } case XLOG_BTREE_DELETE: { xl_btree_delete *xlrec = (xl_btree_delete *) rec; ! appendStringInfo(buf, "delete: rel %u/%u/%u; blk %u, latestRemovedXid %u", xlrec->node.spcNode, xlrec->node.dbNode, ! xlrec->node.relNode, xlrec->block, ! xlrec->latestRemovedXid); break; } case XLOG_BTREE_DELETE_PAGE: *** src/backend/access/transam/clog.c --- src/backend/access/transam/clog.c *************** *** 475,480 **** ZeroCLOGPage(int pageno, bool writeXlog) --- 475,483 ---- /* * This must be called ONCE during postmaster or standalone-backend startup, * after StartupXLOG has initialized ShmemVariableCache->nextXid. + * + * We access just a single clog page, so this action is atomic and safe + * for use if other processes are active during recovery.
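+ * + * Illustrative arithmetic: with two status bits per transaction and the + * default BLCKSZ of 8192, one clog page covers 32768 xids, so only the + * page containing ShmemVariableCache->nextXid needs to be touched here.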
*/ void StartupCLOG(void) *** src/backend/access/transam/multixact.c --- src/backend/access/transam/multixact.c *************** *** 1413,1420 **** ZeroMultiXactMemberPage(int pageno, bool writeXlog) * MultiXactSetNextMXact and/or MultiXactAdvanceNextMXact. Note that we * may already have replayed WAL data into the SLRU files. * ! * We don't need any locks here, really; the SLRU locks are taken ! * only because slru.c expects to be called with locks held. */ void StartupMultiXact(void) --- 1413,1423 ---- * MultiXactSetNextMXact and/or MultiXactAdvanceNextMXact. Note that we * may already have replayed WAL data into the SLRU files. * ! * We want this operation to be atomic to ensure that other processes can ! * use MultiXact while we complete recovery. We access one page only from the ! * offset and members buffers, so once locks are acquired they will not be ! * dropped and re-acquired by SLRU code. So we take both locks at start, then ! * hold them all the way to the end. */ void StartupMultiXact(void) *************** *** 1426,1431 **** StartupMultiXact(void) --- 1429,1435 ---- /* Clean up offsets state */ LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE); + LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE); /* * Initialize our idea of the latest page number. *************** *** 1452,1461 **** StartupMultiXact(void) MultiXactOffsetCtl->shared->page_dirty[slotno] = true; } - LWLockRelease(MultiXactOffsetControlLock); - /* And the same for members */ - LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE); /* * Initialize our idea of the latest page number. --- 1456,1462 ---- *************** *** 1483,1488 **** StartupMultiXact(void) --- 1484,1490 ---- } LWLockRelease(MultiXactMemberControlLock); + LWLockRelease(MultiXactOffsetControlLock); /* * Initialize lastTruncationPoint to invalid, ensuring that the first *************** *** 1542,1549 **** CheckPointMultiXact(void) * isn't valid (because StartupMultiXact hasn't been called yet) and so * SimpleLruTruncate would get confused. It seems best not to risk * removing any data during recovery anyway, so don't truncate. */ ! if (!InRecovery) TruncateMultiXact(); TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(true); --- 1544,1552 ---- * isn't valid (because StartupMultiXact hasn't been called yet) and so * SimpleLruTruncate would get confused. It seems best not to risk * removing any data during recovery anyway, so don't truncate. + * We are executing in the bgwriter, so we must access shared status. */ ! if (!IsRecoveryProcessingMode()) TruncateMultiXact(); TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(true); *** src/backend/access/transam/rmgr.c --- src/backend/access/transam/rmgr.c *************** *** 20,25 **** --- 20,26 ---- #include "commands/dbcommands.h" #include "commands/sequence.h" #include "commands/tablespace.h" + #include "storage/sinval.h" #include "storage/freespace.h" *************** *** 32,38 **** const RmgrData RmgrTable[RM_MAX_ID + 1] = { {"Tablespace", tblspc_redo, tblspc_desc, NULL, NULL, NULL}, {"MultiXact", multixact_redo, multixact_desc, NULL, NULL, NULL}, {"Reserved 7", NULL, NULL, NULL, NULL, NULL}, ! 
{"Reserved 8", NULL, NULL, NULL, NULL, NULL}, {"Heap2", heap2_redo, heap2_desc, NULL, NULL, NULL}, {"Heap", heap_redo, heap_desc, NULL, NULL, NULL}, {"Btree", btree_redo, btree_desc, btree_xlog_startup, btree_xlog_cleanup, btree_safe_restartpoint}, --- 33,39 ---- {"Tablespace", tblspc_redo, tblspc_desc, NULL, NULL, NULL}, {"MultiXact", multixact_redo, multixact_desc, NULL, NULL, NULL}, {"Reserved 7", NULL, NULL, NULL, NULL, NULL}, ! {"Relation", relation_redo, relation_desc, NULL, NULL, NULL}, {"Heap2", heap2_redo, heap2_desc, NULL, NULL, NULL}, {"Heap", heap_redo, heap_desc, NULL, NULL, NULL}, {"Btree", btree_redo, btree_desc, btree_xlog_startup, btree_xlog_cleanup, btree_safe_restartpoint}, *** src/backend/access/transam/slru.c --- src/backend/access/transam/slru.c *************** *** 598,604 **** SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno) * commands to set the commit status of transactions whose bits are in * already-truncated segments of the commit log (see notes in * SlruPhysicalWritePage). Hence, if we are InRecovery, allow the case ! * where the file doesn't exist, and return zeroes instead. */ fd = BasicOpenFile(path, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR); if (fd < 0) --- 598,605 ---- * commands to set the commit status of transactions whose bits are in * already-truncated segments of the commit log (see notes in * SlruPhysicalWritePage). Hence, if we are InRecovery, allow the case ! * where the file doesn't exist, and return zeroes instead. We also ! * return a zeroed page when seek and read fails. */ fd = BasicOpenFile(path, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR); if (fd < 0) *************** *** 619,624 **** SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno) --- 620,633 ---- if (lseek(fd, (off_t) offset, SEEK_SET) < 0) { + if (InRecovery) + { + ereport(LOG, + (errmsg("file \"%s\" doesn't exist, reading as zeroes", + path))); + MemSet(shared->page_buffer[slotno], 0, BLCKSZ); + return true; + } slru_errcause = SLRU_SEEK_FAILED; slru_errno = errno; close(fd); *************** *** 628,633 **** SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno) --- 637,650 ---- errno = 0; if (read(fd, shared->page_buffer[slotno], BLCKSZ) != BLCKSZ) { + if (InRecovery) + { + ereport(LOG, + (errmsg("file \"%s\" doesn't exist, reading as zeroes", + path))); + MemSet(shared->page_buffer[slotno], 0, BLCKSZ); + return true; + } slru_errcause = SLRU_READ_FAILED; slru_errno = errno; close(fd); *** src/backend/access/transam/subtrans.c --- src/backend/access/transam/subtrans.c *************** *** 223,255 **** ZeroSUBTRANSPage(int pageno) /* * This must be called ONCE during postmaster or standalone-backend startup, * after StartupXLOG has initialized ShmemVariableCache->nextXid. - * - * oldestActiveXID is the oldest XID of any prepared transaction, or nextXid - * if there are none. */ void StartupSUBTRANS(TransactionId oldestActiveXID) { ! int startPage; ! int endPage; - /* - * Since we don't expect pg_subtrans to be valid across crashes, we - * initialize the currently-active page(s) to zeroes during startup. - * Whenever we advance into a new page, ExtendSUBTRANS will likewise zero - * the new page without regard to whatever was previously on disk. - */ LWLockAcquire(SubtransControlLock, LW_EXCLUSIVE); ! startPage = TransactionIdToPage(oldestActiveXID); ! endPage = TransactionIdToPage(ShmemVariableCache->nextXid); ! ! while (startPage != endPage) ! { ! (void) ZeroSUBTRANSPage(startPage); ! startPage++; ! } ! 
(void) ZeroSUBTRANSPage(startPage); LWLockRelease(SubtransControlLock); } --- 223,241 ---- /* * This must be called ONCE during postmaster or standalone-backend startup, * after StartupXLOG has initialized ShmemVariableCache->nextXid. */ void StartupSUBTRANS(TransactionId oldestActiveXID) { ! TransactionId xid = ShmemVariableCache->nextXid; ! int pageno = TransactionIdToPage(xid); LWLockAcquire(SubtransControlLock, LW_EXCLUSIVE); ! /* ! * Initialize our idea of the latest page number. ! */ ! SubTransCtl->shared->latest_page_number = pageno; LWLockRelease(SubtransControlLock); } *** src/backend/access/transam/twophase.c --- src/backend/access/transam/twophase.c *************** *** 1719,1724 **** RecordTransactionCommitPrepared(TransactionId xid, --- 1719,1725 ---- /* Emit the XLOG commit record */ xlrec.xid = xid; xlrec.crec.xact_time = GetCurrentTimestamp(); + xlrec.crec.xinfo = 0; xlrec.crec.nrels = nrels; xlrec.crec.nsubxacts = nchildren; rdata[0].data = (char *) (&xlrec); *************** *** 1797,1802 **** RecordTransactionAbortPrepared(TransactionId xid, --- 1798,1804 ---- /* Emit the XLOG abort record */ xlrec.xid = xid; xlrec.arec.xact_time = GetCurrentTimestamp(); + xlrec.arec.xinfo = 0; xlrec.arec.nrels = nrels; xlrec.arec.nsubxacts = nchildren; rdata[0].data = (char *) (&xlrec); *** src/backend/access/transam/xact.c --- src/backend/access/transam/xact.c *************** *** 40,45 **** --- 40,46 ---- #include "storage/fd.h" #include "storage/lmgr.h" #include "storage/procarray.h" + #include "storage/sinval.h" #include "storage/sinvaladt.h" #include "storage/smgr.h" #include "utils/combocid.h" *************** *** 141,146 **** typedef struct TransactionStateData --- 142,149 ---- Oid prevUser; /* previous CurrentUserId setting */ bool prevSecDefCxt; /* previous SecurityDefinerContext setting */ bool prevXactReadOnly; /* entry-time xact r/o state */ + bool xidMarkedInWAL; /* is this xid present in WAL yet? */ + bool hasUnMarkedSubXids; /* had unmarked subxids */ struct TransactionStateData *parent; /* back link to parent */ } TransactionStateData; *************** *** 169,174 **** static TransactionStateData TopTransactionStateData = { --- 172,179 ---- InvalidOid, /* previous CurrentUserId setting */ false, /* previous SecurityDefinerContext setting */ false, /* entry-time xact r/o state */ + false, /* initial state for xidMarkedInWAL */ + false, /* hasUnMarkedSubXids */ NULL /* link to parent state block */ }; *************** *** 212,217 **** static bool forceSyncCommit = false; --- 217,232 ---- static MemoryContext TransactionAbortContext = NULL; /* + * Bookkeeping for tracking emulated transactions in Recovery Procs. + */ + static TransactionId latestObservedXid = InvalidTransactionId; + + /* + * Local state to optimise XactResolveRecoveryConflicts() + */ + static TransactionId localLatestRemovedXid = InvalidTransactionId; + + /* * List of add-on start- and end-of-xact callbacks */ typedef struct XactCallbackItem *************** *** 237,243 **** static SubXactCallbackItem *SubXact_callbacks = NULL; /* local function prototypes */ ! static void AssignTransactionId(TransactionState s); static void AbortTransaction(void); static void AtAbort_Memory(void); static void AtCleanup_Memory(void); --- 252,258 ---- /* local function prototypes */ ! 
static void AssignTransactionId(TransactionState s, int recursion_level); static void AbortTransaction(void); static void AtAbort_Memory(void); static void AtCleanup_Memory(void); *************** *** 331,337 **** TransactionId GetTopTransactionId(void) { if (!TransactionIdIsValid(TopTransactionStateData.transactionId)) ! AssignTransactionId(&TopTransactionStateData); return TopTransactionStateData.transactionId; } --- 346,352 ---- GetTopTransactionId(void) { if (!TransactionIdIsValid(TopTransactionStateData.transactionId)) ! AssignTransactionId(&TopTransactionStateData, 0); return TopTransactionStateData.transactionId; } *************** *** 361,367 **** GetCurrentTransactionId(void) TransactionState s = CurrentTransactionState; if (!TransactionIdIsValid(s->transactionId)) ! AssignTransactionId(s); return s->transactionId; } --- 376,382 ---- TransactionState s = CurrentTransactionState; if (!TransactionIdIsValid(s->transactionId)) ! AssignTransactionId(s, 0); return s->transactionId; } *************** *** 389,399 **** GetCurrentTransactionIdIfAny(void) * following its parent's. */ static void ! AssignTransactionId(TransactionState s) { bool isSubXact = (s->parent != NULL); ResourceOwner currentOwner; /* Assert that caller didn't screw up */ Assert(!TransactionIdIsValid(s->transactionId)); Assert(s->state == TRANS_INPROGRESS); --- 404,417 ---- * following its parent's. */ static void ! AssignTransactionId(TransactionState s, int recursion_level) { bool isSubXact = (s->parent != NULL); ResourceOwner currentOwner; + if (IsRecoveryProcessingMode()) + elog(FATAL, "cannot assign TransactionIds during recovery"); + /* Assert that caller didn't screw up */ Assert(!TransactionIdIsValid(s->transactionId)); Assert(s->state == TRANS_INPROGRESS); *************** *** 403,409 **** AssignTransactionId(TransactionState s) * than its parent. */ if (isSubXact && !TransactionIdIsValid(s->parent->transactionId)) ! AssignTransactionId(s->parent); /* * Generate a new Xid and record it in PG_PROC and pg_subtrans. --- 421,427 ---- * than its parent. */ if (isSubXact && !TransactionIdIsValid(s->parent->transactionId)) ! AssignTransactionId(s->parent, recursion_level + 1); /* * Generate a new Xid and record it in PG_PROC and pg_subtrans. *************** *** 415,421 **** AssignTransactionId(TransactionState s) */ s->transactionId = GetNewTransactionId(isSubXact); ! if (isSubXact) SubTransSetParent(s->transactionId, s->parent->transactionId); /* --- 433,446 ---- */ s->transactionId = GetNewTransactionId(isSubXact); ! /* ! * If we have overflowed the subxid cache then we must mark subtrans ! * with the parent xid. Prior to 8.4 we marked subtrans for each ! * subtransaction, though that is no longer necessary because the ! * way snapshots are searched in XidInMVCCSnapshot() has changed to ! * allow searching of both subxid cache and subtrans, not either/or. ! */ ! if (isSubXact && MyProc->subxids.overflowed) SubTransSetParent(s->transactionId, s->parent->transactionId); /* *************** *** 437,444 **** AssignTransactionId(TransactionState s) } PG_END_TRY(); CurrentResourceOwner = currentOwner; - } /* * GetCurrentSubTransactionId --- 462,534 ---- } PG_END_TRY(); CurrentResourceOwner = currentOwner; + elog(trace_recovery(DEBUG2), + "AssignXactId xid %d nest %d recursion %d xidMarkedInWAL %s hasParent %s", + s->transactionId, + GetCurrentTransactionNestLevel(), + recursion_level, + s->xidMarkedInWAL ? "t" : "f", + s->parent ? 
"t" : "f"); + + /* + * Recovery environment needs to know when a transaction first starts + * making changes to the database. We could issue an assignment WAL + * record for every transaction and subtransaction but that would be + * a large performance hit. So we go to some trouble to optimise this + * by marking the first WAL record with additional information, so we can + * piggyback on the normal flow of processing. There are still some cases + * where we need to write xid assignment WAL records, though these cases + * are rare in most applications. + * + * SO, if needed, WAL log this assignment. We can mark an xid and its + * immediate parent on a single WAL record, so if we recursively assign + * more than two xids at the same time we need to write some assignment + * log records. + */ + if (recursion_level > 1 || (recursion_level == 1 && isSubXact)) + { + XLogRecData rdata; + xl_xact_assignment xlrec; + + xlrec.xassign = s->transactionId; + xlrec.isSubXact = (s->parent != NULL); + + if (xlrec.isSubXact) + xlrec.xparent = s->parent->transactionId; + else + xlrec.xparent = InvalidTransactionId; + + START_CRIT_SECTION(); + + rdata.data = (char *) (&xlrec); + rdata.len = sizeof(xl_xact_assignment); + rdata.buffer = InvalidBuffer; + rdata.next = NULL; + + /* + * These WAL records look like no other. We are assigning a + * TransactionId to upper levels of the transaction stack. The + * transaction level we are looking may *not* be the *current* + * transaction. We have not yet assigned the xid for the current + * transaction, so the xid of this WAL record will be + * InvalidTransactionId, even though we are in a transaction. + * Got that? + * + * So we stuff the newly assigned xid into the WAL record and + * let WAL replay sort it out later. + */ + (void) XLogInsert(RM_XACT_ID, XLOG_XACT_ASSIGNMENT, &rdata); + + END_CRIT_SECTION(); + + /* + * Mark this transaction level, so we can avoid issuing WAL records + * for later subtransactions also. + */ + s->xidMarkedInWAL = true; + } + } /* * GetCurrentSubTransactionId *************** *** 824,834 **** RecordTransactionCommit(void) bool haveNonTemp; int nchildren; TransactionId *children; /* Get data needed for commit record */ nrels = smgrGetPendingDeletes(true, &rels, &haveNonTemp); nchildren = xactGetCommittedChildren(&children); ! /* * If we haven't been assigned an XID yet, we neither can, nor do we want * to write a COMMIT record. --- 914,928 ---- bool haveNonTemp; int nchildren; TransactionId *children; + int nmsgs; + SharedInvalidationMessage *invalidationMessages = NULL; + bool RelcacheInitFileInval; /* Get data needed for commit record */ nrels = smgrGetPendingDeletes(true, &rels, &haveNonTemp); nchildren = xactGetCommittedChildren(&children); ! nmsgs = xactGetCommittedInvalidationMessages(&invalidationMessages, ! &RelcacheInitFileInval); /* * If we haven't been assigned an XID yet, we neither can, nor do we want * to write a COMMIT record. *************** *** 862,868 **** RecordTransactionCommit(void) /* * Begin commit critical section and insert the commit XLOG record. */ ! XLogRecData rdata[3]; int lastrdata = 0; xl_xact_commit xlrec; --- 956,962 ---- /* * Begin commit critical section and insert the commit XLOG record. */ ! XLogRecData rdata[4]; int lastrdata = 0; xl_xact_commit xlrec; *************** *** 870,875 **** RecordTransactionCommit(void) --- 964,984 ---- BufmgrCommit(); /* + * Set flags required for recovery processing of commits. 
+ * Nothing too critical here that we would want to include this + * within the critical section following. + */ + xlrec.xinfo = 0; + if (CurrentTransactionState->hasUnMarkedSubXids) + xlrec.xinfo |= XACT_COMPLETION_UNMARKED_SUBXIDS; + if (AtEOXact_Database_FlatFile_Update_Needed()) + xlrec.xinfo |= XACT_COMPLETION_UPDATE_DB_FILE; + if (AtEOXact_Auth_FlatFile_Update_Needed()) + xlrec.xinfo |= XACT_COMPLETION_UPDATE_AUTH_FILE; + if (RelcacheInitFileInval) + xlrec.xinfo |= XACT_COMPLETION_UPDATE_RELCACHE_FILE; + + /* * Mark ourselves as within our "commit critical section". This * forces any concurrent checkpoint to wait until we've updated * pg_clog. Without this, it is possible for the checkpoint to set *************** *** 893,898 **** RecordTransactionCommit(void) --- 1002,1009 ---- xlrec.xact_time = xactStopTimestamp; xlrec.nrels = nrels; xlrec.nsubxacts = nchildren; + xlrec.nmsgs = nmsgs; + rdata[0].data = (char *) (&xlrec); rdata[0].len = MinSizeOfXactCommit; rdata[0].buffer = InvalidBuffer; *************** *** 914,919 **** RecordTransactionCommit(void) --- 1025,1039 ---- rdata[2].buffer = InvalidBuffer; lastrdata = 2; } + /* dump shared cache invalidation messages */ + if (nmsgs > 0) + { + rdata[lastrdata].next = &(rdata[3]); + rdata[3].data = (char *) invalidationMessages; + rdata[3].len = nmsgs * sizeof(SharedInvalidationMessage); + rdata[3].buffer = InvalidBuffer; + lastrdata = 3; + } rdata[lastrdata].next = NULL; (void) XLogInsert(RM_XACT_ID, XLOG_XACT_COMMIT, rdata); *************** *** 1219,1224 **** RecordTransactionAbort(bool isSubXact) --- 1339,1347 ---- } xlrec.nrels = nrels; xlrec.nsubxacts = nchildren; + xlrec.xinfo = 0; + if (CurrentTransactionState->hasUnMarkedSubXids) + xlrec.xinfo |= XACT_COMPLETION_UNMARKED_SUBXIDS; rdata[0].data = (char *) (&xlrec); rdata[0].len = MinSizeOfXactAbort; rdata[0].buffer = InvalidBuffer; *************** *** 1525,1530 **** StartTransaction(void) --- 1648,1655 ---- s->childXids = NULL; s->nChildXids = 0; s->maxChildXids = 0; + s->xidMarkedInWAL = false; + s->hasUnMarkedSubXids = false; GetUserIdAndContext(&s->prevUser, &s->prevSecDefCxt); /* SecurityDefinerContext should never be set outside a transaction */ Assert(!s->prevSecDefCxt); *************** *** 1637,1643 **** CommitTransaction(void) * must be done _before_ releasing locks we hold and _after_ * RecordTransactionCommit. */ ! ProcArrayEndTransaction(MyProc, latestXid); /* * This is all post-commit cleanup. Note that if an error is raised here, --- 1762,1768 ---- * must be done _before_ releasing locks we hold and _after_ * RecordTransactionCommit. */ ! ProcArrayEndTransaction(MyProc, latestXid, 0, NULL); /* * This is all post-commit cleanup. Note that if an error is raised here, *************** *** 2055,2061 **** AbortTransaction(void) * must be done _before_ releasing locks we hold and _after_ * RecordTransactionAbort. */ ! ProcArrayEndTransaction(MyProc, latestXid); /* * Post-abort cleanup. See notes in CommitTransaction() concerning --- 2180,2186 ---- * must be done _before_ releasing locks we hold and _after_ * RecordTransactionAbort. */ ! ProcArrayEndTransaction(MyProc, latestXid, 0, NULL); /* * Post-abort cleanup. See notes in CommitTransaction() concerning *************** *** 3753,3758 **** CommitSubTransaction(void) --- 3878,3889 ---- /* Must CCI to ensure commands of subtransaction are seen as done */ CommandCounterIncrement(); + /* + * Make sure we keep tracking xids that haven't marked WAL. 
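+ * The flag must propagate all the way up the transaction stack, so that + * the eventual top-level commit or abort record can be flagged with + * XACT_COMPLETION_UNMARKED_SUBXIDS (see RecordTransactionCommit) and + * recovery knows to deduce the missing subxids from that record rather + * than from earlier WAL entries.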
+ */ + if (!s->xidMarkedInWAL || s->hasUnMarkedSubXids) + s->parent->hasUnMarkedSubXids = true; + /* * Prior to 8.4 we marked subcommit in clog at this point. We now only * perform that step, if required, as part of the atomic update of the *************** *** 3872,3877 **** AbortSubTransaction(void) --- 4003,4014 ---- s->state = TRANS_ABORT; /* + * Make sure we keep tracking xids that haven't marked WAL. + */ + if (!s->xidMarkedInWAL || s->hasUnMarkedSubXids) + s->parent->hasUnMarkedSubXids = true; + + /* * Reset user ID which might have been changed transiently. (See notes * in AbortTransaction.) */ *************** *** 4214,4244 **** xactGetCommittedChildren(TransactionId **ptr) } /* * XLOG support routines */ static void ! xact_redo_commit(xl_xact_commit *xlrec, TransactionId xid) { TransactionId *sub_xids; TransactionId max_xid; int i; - /* Mark the transaction committed in pg_clog */ - sub_xids = (TransactionId *) &(xlrec->xnodes[xlrec->nrels]); - TransactionIdCommitTree(xid, xlrec->nsubxacts, sub_xids); - /* Make sure nextXid is beyond any XID mentioned in the record */ max_xid = xid; for (i = 0; i < xlrec->nsubxacts; i++) { if (TransactionIdPrecedes(max_xid, sub_xids[i])) max_xid = sub_xids[i]; } if (TransactionIdFollowsOrEquals(max_xid, ShmemVariableCache->nextXid)) { ShmemVariableCache->nextXid = max_xid; TransactionIdAdvance(ShmemVariableCache->nextXid); } --- 4351,4852 ---- } /* + * Record an enhanced snapshot of running transactions into WAL. + */ + void + LogCurrentRunningXacts(void) + { + RunningTransactions CurrRunningXacts = GetRunningTransactionData(); + xl_xact_running_xacts xlrec; + XLogRecData rdata[3]; + int lastrdata = 0; + XLogRecPtr recptr; + + xlrec.xcnt = CurrRunningXacts->xcnt; + xlrec.subxcnt = CurrRunningXacts->subxcnt; + xlrec.latestRunningXid = CurrRunningXacts->latestRunningXid; + xlrec.latestCompletedXid = CurrRunningXacts->latestCompletedXid; + + /* Header */ + rdata[0].data = (char *) (&xlrec); + rdata[0].len = MinSizeOfXactRunningXacts; + rdata[0].buffer = InvalidBuffer; + + /* array of RunningXact */ + if (xlrec.xcnt > 0) + { + rdata[0].next = &(rdata[1]); + rdata[1].data = (char *) CurrRunningXacts->xrun; + rdata[1].len = xlrec.xcnt * sizeof(RunningXact); + rdata[1].buffer = InvalidBuffer; + lastrdata = 1; + } + + /* array of RunningXact */ + if (xlrec.subxcnt > 0) + { + rdata[lastrdata].next = &(rdata[2]); + rdata[2].data = (char *) CurrRunningXacts->subxip; + rdata[2].len = xlrec.subxcnt * sizeof(TransactionId); + rdata[2].buffer = InvalidBuffer; + lastrdata = 2; + } + + rdata[lastrdata].next = NULL; + + START_CRIT_SECTION(); + + recptr = XLogInsert(RM_XACT_ID, XLOG_XACT_RUNNING_XACTS, rdata); + + END_CRIT_SECTION(); + + elog(trace_recovery(DEBUG2), "captured snapshot of running xacts %X/%X", recptr.xlogid, recptr.xrecoff); + } + + /* + * Is the data available to allow valid snapshots? + */ + bool + IsRunningXactDataValid(void) + { + if (TransactionIdIsValid(latestObservedXid)) + return true; + + return false; + } + + /* + * We need to issue shared invalidations and hold locks. Holding locks + * means others may want to wait on us, so we need to make lock table + * inserts to appear like a transaction. We could create and delete + * lock table entries for each transaction but its simpler just to create + * one permanent entry and leave it there all the time. Locks are then + * acquired and released as needed. Yes, this means you can see the + * Startup process in pg_locks once we have run this. 
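+ * + * For example (a sketch, using the standard pg_locks columns), the entry + * should be visible as an ordinary virtualxid lock: + * + * SELECT locktype, virtualtransaction, mode, granted FROM pg_locks;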
+ */ + void + InitRecoveryTransactionEnvironment(void) + { + VirtualTransactionId vxid; + + /* + * Initialise shared invalidation management for Startup process, + * being careful to register ourselves as a sendOnly process so + * we don't need to read messages, nor will we get signalled + * when the queue starts filling up. + */ + SharedInvalBackendInit(true); + + /* + * Additional initialisation tasks. Most of this was performed + * during initial stages of startup. + */ + ProcArrayInitRecoveryEnvironment(); + + /* + * Lock a virtual transaction id for Startup process. + * + * We need to do GetNextLocalTransactionId() because + * SharedInvalBackendInit() leaves localTransactionid invalid and + * the lock manager doesn't like that at all. + * + * Note that we don't need to run XactLockTableInsert() because nobody + * needs to wait on xids. That sounds a little strange, but table locks + * are held by vxids and row level locks are held by xids. All queries + * hold AccessShareLocks so never block while we write or lock new rows. + */ + vxid.backendId = MyBackendId; + vxid.localTransactionId = GetNextLocalTransactionId(); + VirtualXactLockTableInsert(vxid); + + /* + * Now that the database is consistent we can create a valid copy of + * the flat files required for connection and authentication. This + * may already have been executed at appropriate commit points, but + * we cannot trust that those executions were correct, so force it + * again now just to be safe. + */ + BuildFlatFiles(false); + } + + /* + * Called during archive recovery when we already know the WAL record is + * a cleanup record that might remove data that should be visible to + * some currently active snapshot. + * + * * First pull the latestRemovedXid and databaseId out of WAL record. + * * Get all virtual xids whose xmin is earlier than latestRemovedXid + * and who are in the same database + * * Check/Wait until we either give up waiting or vxids end + * * Blow away any backend we gave up waiting for it to complete + */ + void + XactResolveRecoveryConflicts(TransactionId latestRemovedXid, Oid recDatabaseOid) + { + VirtualTransactionId *old_snapshots; + + /* + * Don't bother checking for conflicts for cleanup records earlier than + * we have already tested for. + */ + if (TransactionIdIsValid(localLatestRemovedXid) && + TransactionIdFollowsOrEquals(localLatestRemovedXid, latestRemovedXid)) + return; + + old_snapshots = GetCurrentVirtualXIDs(latestRemovedXid, + recDatabaseOid, + 0 /* no need to exclude vacuum */); + + ResolveRecoveryConflictWithVirtualXIDs(old_snapshots, + "cleanup redo"); + + /* + * Remember how far we've cleaned to avoid some checks in the future, + * since ResolveRecoveryConflictWithVirtualXIDs() accesses the ProcArray + * and is relatively expensive. + */ + localLatestRemovedXid = latestRemovedXid; + } + + /* + * During recovery we maintain ProcArray with incoming xids + * when we first observe them in use. Uses local variables, so + * should only be called by Startup process. + * + * We record all xids that we know have been assigned. That includes + * all the xids on the WAL record, plus all unobserved xids that + * we can deduce have been assigned. We can deduce the existence of + * unobserved xids because we know xids are in sequence, with no gaps. + * + * XXX Be careful of what happens when we use pg_resetxlogs. 
+ */ + void + RecordKnownAssignedTransactionIds(XLogRecPtr lsn, XLogRecord *record) + { + uint8 info = record->xl_info & ~XLR_INFO_MASK; + TransactionId xid, + child_xid, + top_xid; + PGPROC *proc; + bool first_seen; + bool mark_subtrans = false; + + if (!IsRunningXactDataValid()) + return; + + /* + * If its an assignment record, we need to need extract data from + * the body of the record, rather than take header values. This + * is because an assignment record can be issued when + * GetCurrentTransactionIdIfAny() returns InvalidTransactionId. + */ + if (record->xl_rmid == RM_XACT_ID && info == XLOG_XACT_ASSIGNMENT) + { + xl_xact_assignment *xlrec = (xl_xact_assignment *) XLogRecGetData(record); + + child_xid = xlrec->xassign; + top_xid = xlrec->xparent; + } + else + { + child_xid = record->xl_xid; + top_xid = record->xl_parentxid; + } + xid = child_xid; + if (child_xid == top_xid) + child_xid = InvalidTransactionId; + + if (!TransactionIdIsValid(top_xid)) + return; + + /* + elog(trace_recovery(DEBUG4), "RecordKnown xid %d parent %d" + " latestObsvXid %d firstXid %s firstSubXid %s markSubtrans %s", + xid, parent_xid, latestObservedXid, + XLogRecIsFirstXidRecord(record) ? "t" : "f", + XLogRecIsFirstSubXidRecord(record) ? "t" : "f", + XLogRecMustMarkSubtrans(record) ? "t" : "f"); + */ + /* + * Identify the recovery proc that holds replay info for this xid. + * + * XXX: This gets called for every WAL record (with XID). I think we'll + * need a faster version of BackendiXidGetProc, using a hash table or + * something. FWIW, the hash table wouldn't need to be in shared memory, + * because the startup process is the only one doing this. + */ + proc = BackendXidGetProc(top_xid); + + elog(trace_recovery(DEBUG4), + "start recovery top_xid = %u child_xid = %u lsn = %X/%X", + top_xid, child_xid, lsn.xlogid, lsn.xrecoff); + + if (proc == NULL) + { + proc = InitRecoveryProcess(); + proc->xid = top_xid; + ProcArrayAdd(proc); + first_seen = true; + } + else + first_seen = false; + + /* + * Currently, we choose to take ProcArrayLock every time. We don't + * need to do this for every case, since if we know there are no + * UnobservedXids we could just call ProcArrayStartRecoveryTransaction() + * without locks, just is done during normal running. For now, be safe. + * See GetNewTransactionId(). XXX this comment needs updating, there's + * no ProcArrayStartRecoveryTransaction() anymore, for starters. + */ + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + + /* + * Use volatile pointer to prevent code rearrangement; other backends + * could be examining my subxids info concurrently, and we don't want + * them to see an invalid intermediate state, such as incrementing + * nxids before filling the array entry. Note we are assuming that + * TransactionId and int fetch/store are atomic. + * + * XXX Is that a concern when we hold ProcArrayLock? + */ + { + volatile PGPROC *myproc = proc; + + myproc->lsn = lsn; + + if (TransactionIdIsValid(child_xid)) + { + int nxids = myproc->subxids.nxids; + + if (nxids < PGPROC_MAX_CACHED_SUBXIDS) + { + /* XXX: Can we assume that subxids are seen in xid order? */ + if (nxids > 0 && TransactionIdPrecedes(myproc->subxids.xids[nxids - 1], child_xid)) + { + myproc->subxids.xids[nxids] = child_xid; + myproc->subxids.nxids = nxids + 1; + } + } + else + { + myproc->subxids.overflowed = true; + mark_subtrans = true; + } + } + } + + /* + * When a newly observed xid arrives, it is frequently the case + * that it is *not* the next xid in sequence. 
When this occurs, we + * must treat the intervening xids as running also. So we maintain + * a special list of these UnobservedXids, so that snapshots can + * see the missing xids as in-progress. + * + * We maintain both recovery Procs *and* UnobservedXids because we + * need them both. Recovery procs allow us to store top-level xids + * and subtransactions separately, otherwise we wouldn't know + * when to overflow the subxid cache. UnobservedXids allow us to + * make sense of the out-of-order arrival of xids. + * + * Some examples: + * 1) latestObservedXid = 647 + * next xid observed in WAL = 651 (a top-level transaction) + * so we add 648, 649, 650 to UnobservedXids + * + * 2) latestObservedXid = 769 + * next xid observed in WAL = 771 (a subtransaction) + * so we add 770 to UnobservedXids + * + * 3) latestObservedXid = 769 + * next xid observed in WAL = 810 (a subtransaction) + * 810's parent had not yet recorded WAL = 807 + * so we add 770 thru 809 inclusive to UnobservedXids + * then remove 807 + * + * 4) latestObservedXid = 769 + * next xid observed in WAL = 771 (a subtransaction) + * 771's parent had not yet recorded WAL = 770 + * so do nothing + * + * 5) latestObservedXid = 7747 + * next xid observed in WAL = 7748 (a subtransaction) + * 7748's parent had not yet recorded WAL = 7742 + * so we add 7748 and remove 7742 + */ + + /* + * Just remember when reading this logic that by definition we have + * Assert(TransactionIdPrecedes(parent_xid, xid)) + */ + + /* + * Process the top-level xid first, then the child xid, if any. + */ + + for (xid = top_xid; TransactionIdIsValid(xid); xid = child_xid) + { + TransactionId next_expected_xid = latestObservedXid; + TransactionIdAdvance(next_expected_xid); + + if (next_expected_xid == xid) + { + Assert(!XidInUnobservedTransactions(xid)); + /* XXX Assert(!XLogRecIsFirstSubXidRecord(record) || + !XidInUnobservedTransactions(top_xid)); */ + latestObservedXid = xid; + } + else if (TransactionIdPrecedes(next_expected_xid, xid)) + { + UnobservedTransactionsAddXids(next_expected_xid, xid); + latestObservedXid = xid; + } + else if (first_seen) + UnobservedTransactionsRemoveXid(xid, true); + + if (xid == child_xid) + break; + } + + LWLockRelease(ProcArrayLock); + + /* + * Now we've updated the proc, we can update subtrans, if appropriate. + * We must do this step last to avoid race conditions. See comments + * and code for AssignTransactionId(). + */ + if (mark_subtrans) + { + /* Assert(XLogRecIsFirstSubXidRecord(record)); */ + elog(trace_recovery(DEBUG2), + "subtrans setting parent %d for xid %d", top_xid, child_xid); + SubTransSetParent(child_xid, top_xid); + } + } + + /* * XLOG support routines */ + /* + * Before 8.4 this was a fairly short function, but now it performs many + * actions for which the order of execution is critical. + */ static void ! xact_redo_commit(xl_xact_commit *xlrec, TransactionId xid, bool preparedXact) { TransactionId *sub_xids; TransactionId max_xid; + PGPROC *proc; int i; /* Make sure nextXid is beyond any XID mentioned in the record */ max_xid = xid; + sub_xids = (TransactionId *) &(xlrec->xnodes[xlrec->nrels]); + + /* + * Find the highest xid and remove unobserved xids if required. + */ for (i = 0; i < xlrec->nsubxacts; i++) { if (TransactionIdPrecedes(max_xid, sub_xids[i])) max_xid = sub_xids[i]; } + + if (InArchiveRecovery) + { + /* + * If we've just observed some new xids on the commit record, + * make sure they're visible before we update clog.
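+ * The ordering matters: if clog were updated first, a standby snapshot + * taken in between could find a subxid that is in no snapshot's running + * set and yet already committed in clog, making part of the transaction + * tree visible ahead of its parent.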
 /*
  *  XLOG support routines
  */

+ /*
+  * Before 8.4 this was a fairly short function, but now it performs many
+  * actions for which the order of execution is critical.
+  */
 static void
! xact_redo_commit(xl_xact_commit *xlrec, TransactionId xid, bool preparedXact)
 {
     TransactionId *sub_xids;
     TransactionId max_xid;
+     PGPROC     *proc;
     int         i;

     /* Make sure nextXid is beyond any XID mentioned in the record */
     max_xid = xid;
+     sub_xids = (TransactionId *) &(xlrec->xnodes[xlrec->nrels]);
+
+     /*
+      * Find the highest xid and remove unobserved xids if required.
+      */
     for (i = 0; i < xlrec->nsubxacts; i++)
     {
         if (TransactionIdPrecedes(max_xid, sub_xids[i]))
             max_xid = sub_xids[i];
     }
+
+     if (InArchiveRecovery)
+     {
+         /*
+          * If we've just observed some new xids on the commit record,
+          * make sure they're visible before we update clog.
+          */
+         if (XactCompletionHasUnMarkedSubxids(xlrec))
+         {
+             if (!IsRunningXactDataValid())
+                 latestObservedXid = xid;
+
+             if (TransactionIdPrecedes(latestObservedXid, max_xid))
+             {
+                 TransactionId next_expected_xid = latestObservedXid;
+
+                 TransactionIdAdvance(next_expected_xid);
+                 if (TransactionIdPrecedes(next_expected_xid, max_xid))
+                 {
+                     LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+                     UnobservedTransactionsAddXids(next_expected_xid, max_xid);
+                     LWLockRelease(ProcArrayLock);
+                 }
+                 latestObservedXid = max_xid;
+             }
+         }
+     }
+
+     /* Mark the transaction committed in pg_clog */
+     TransactionIdCommitTree(xid, xlrec->nsubxacts, sub_xids);
+
+     if (InArchiveRecovery && (proc = BackendXidGetProc(xid)) != NULL)
+     {
+         /*
+          * We must mark clog before we update the ProcArray. Only update
+          * if we have already initialised the state and we have previously
+          * added an xid to the proc. We need no lock to check xid since it
+          * is controlled by Startup process. It's possible for xids to
+          * appear that haven't been seen before. We don't need to check
+          * UnobservedXids because in the normal case this will already have
+          * happened, but there are cases where they might sneak through.
+          * Leave these for the periodic cleanup by XACT_RUNNING_XACT records.
+          */
+         if (IsRunningXactDataValid() && !preparedXact)
+         {
+             if (XactCompletionHasUnMarkedSubxids(xlrec))
+                 ProcArrayEndTransaction(proc, max_xid, xlrec->nsubxacts, sub_xids);
+             else
+                 ProcArrayEndTransaction(proc, max_xid, 0, NULL);
+             ProcArrayRemove(proc, InvalidTransactionId);
+             FreeRecoveryProcess(proc);
+         }
+
+         /*
+          * If requested, update the flat files for DB and Auth Files by
+          * reading the catalog tables. Needs to be the first action taken
+          * after marking the transaction complete, to minimise race
+          * conditions. This is the opposite way round to the original
+          * actions, which update the files and then mark committed, so
+          * there is a race condition in both places.
+          */
+         if (XactCompletionUpdateDBFile(xlrec))
+         {
+             if (XactCompletionUpdateAuthFile(xlrec))
+                 BuildFlatFiles(false);
+             else
+                 BuildFlatFiles(true);
+         }
+
+         /*
+          * Send any cache invalidations attached to the commit. We must
+          * send invalidations before releasing locks, the same order as
+          * occurs in RecordTransactionCommit.
+          */
+         if (xlrec->nmsgs > 0)
+         {
+             int         offset = OffsetSharedInvalInXactCommit();
+             SharedInvalidationMessage *msgs = (SharedInvalidationMessage *)
+                 (((char *) xlrec) + offset);
+
+             SendSharedInvalidMessages(msgs, xlrec->nmsgs);
+         }
+
+         /*
+          * Release locks, if any.
+          */
+         RelationReleaseRecoveryLocks(xid);
+     }
+
+     /* Make sure nextXid is beyond any XID mentioned in the record */
     if (TransactionIdFollowsOrEquals(max_xid, ShmemVariableCache->nextXid))
     {
         ShmemVariableCache->nextXid = max_xid;
+         ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
         TransactionIdAdvance(ShmemVariableCache->nextXid);
     }
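xact_redo_commit() above and xact_desc_commit() further down both recompute
pointers into the commit record's variable-length tail by hand. A minimal
sketch of shared accessors, assuming only the layout visible in this patch
(xnodes first, then the subxact array, then the invalidation messages at
OffsetSharedInvalInXactCommit()); the helper names are hypothetical:

    /* Hypothetical accessors for the variable-length tail of xl_xact_commit */
    static inline TransactionId *
    xact_commit_subxacts(xl_xact_commit *xlrec)
    {
        /* the subxact array begins where the relfilenode array ends */
        return (TransactionId *) &(xlrec->xnodes[xlrec->nrels]);
    }

    static inline SharedInvalidationMessage *
    xact_commit_inval_msgs(xl_xact_commit *xlrec)
    {
        /* third variable-length array, at a computable offset from the start */
        return (SharedInvalidationMessage *)
            (((char *) xlrec) + OffsetSharedInvalInXactCommit());
    }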
***************
*** 4260,4287 **** xact_redo_commit(xl_xact_commit *xlrec, TransactionId xid)
     }
 }

 static void
! xact_redo_abort(xl_xact_abort *xlrec, TransactionId xid)
 {
     TransactionId *sub_xids;
     TransactionId max_xid;
     int         i;

-     /* Mark the transaction aborted in pg_clog */
-     sub_xids = (TransactionId *) &(xlrec->xnodes[xlrec->nrels]);
-     TransactionIdAbortTree(xid, xlrec->nsubxacts, sub_xids);
-
     /* Make sure nextXid is beyond any XID mentioned in the record */
     max_xid = xid;
     for (i = 0; i < xlrec->nsubxacts; i++)
     {
         if (TransactionIdPrecedes(max_xid, sub_xids[i]))
             max_xid = sub_xids[i];
     }
     if (TransactionIdFollowsOrEquals(max_xid, ShmemVariableCache->nextXid))
     {
         ShmemVariableCache->nextXid = max_xid;
         TransactionIdAdvance(ShmemVariableCache->nextXid);
     }
--- 4868,4963 ----
     }
 }

+ /*
+  * Be careful with the order of execution, as with xact_redo_commit().
+  * The two functions are similar but differ in key places.
+  */
 static void
! xact_redo_abort(xl_xact_abort *xlrec, TransactionId xid, bool preparedXact)
 {
+     PGPROC     *proc = NULL;
     TransactionId *sub_xids;
     TransactionId max_xid;
     int         i;

     /* Make sure nextXid is beyond any XID mentioned in the record */
     max_xid = xid;
+     sub_xids = (TransactionId *) &(xlrec->xnodes[xlrec->nrels]);
+
+     /*
+      * Find the highest xid and remove unobserved xids if required.
+      */
     for (i = 0; i < xlrec->nsubxacts; i++)
     {
         if (TransactionIdPrecedes(max_xid, sub_xids[i]))
             max_xid = sub_xids[i];
     }
+
+     if (InArchiveRecovery)
+     {
+         /*
+          * If we've just observed some new xids on the abort record,
+          * make sure they're visible before we update clog.
+          */
+         if (XactCompletionHasUnMarkedSubxids(xlrec))
+         {
+             if (!IsRunningXactDataValid())
+                 latestObservedXid = xid;
+
+             if (TransactionIdPrecedes(latestObservedXid, max_xid))
+             {
+                 TransactionId next_expected_xid = latestObservedXid;
+
+                 TransactionIdAdvance(next_expected_xid);
+                 if (TransactionIdPrecedes(next_expected_xid, max_xid))
+                 {
+                     LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+                     UnobservedTransactionsAddXids(next_expected_xid, max_xid);
+                     LWLockRelease(ProcArrayLock);
+                 }
+                 latestObservedXid = max_xid;
+             }
+         }
+     }
+
+     /* Mark the transaction aborted in pg_clog */
+     TransactionIdAbortTree(xid, xlrec->nsubxacts, sub_xids);
+
+     if (InArchiveRecovery && (proc = BackendXidGetProc(xid)) != NULL)
+     {
+         /*
+          * We must mark clog before we update the ProcArray. Only update
+          * if we have already initialised the state and we have previously
+          * added an xid to the proc. We need no lock to check xid since it
+          * is controlled by Startup process. It's possible for xids to
+          * appear that haven't been seen before. We don't need to check
+          * UnobservedXids because in the normal case this will already have
+          * happened, but there are cases where they might sneak through.
+          * Leave these for the periodic cleanup by XACT_RUNNING_XACT records.
+          */
+         if (IsRunningXactDataValid() &&
+             TransactionIdIsValid(proc->xid) && !preparedXact)
+         {
+             if (XactCompletionHasUnMarkedSubxids(xlrec))
+                 ProcArrayEndTransaction(proc, max_xid, xlrec->nsubxacts, sub_xids);
+             else
+                 ProcArrayEndTransaction(proc, max_xid, 0, NULL);
+             ProcArrayRemove(proc, InvalidTransactionId);
+             FreeRecoveryProcess(proc);
+         }
+
+         /*
+          * Release locks, if any. There are no invalidations to send.
+ */ + RelationReleaseRecoveryLocks(xid); + } + + /* Make sure nextXid is beyond any XID mentioned in the record */ if (TransactionIdFollowsOrEquals(max_xid, ShmemVariableCache->nextXid)) { ShmemVariableCache->nextXid = max_xid; + ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid; TransactionIdAdvance(ShmemVariableCache->nextXid); } *************** *** 4312,4324 **** xact_redo(XLogRecPtr lsn, XLogRecord *record) { xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record); ! xact_redo_commit(xlrec, record->xl_xid); } else if (info == XLOG_XACT_ABORT) { xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record); ! xact_redo_abort(xlrec, record->xl_xid); } else if (info == XLOG_XACT_PREPARE) { --- 4988,5004 ---- { xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record); ! xact_redo_commit(xlrec, record->xl_xid, false); } else if (info == XLOG_XACT_ABORT) { xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record); ! Assert(!XactCompletionUpdateDBFile(xlrec) && ! !XactCompletionUpdateAuthFile(xlrec) && ! !XactCompletionRelcacheInitFileInval(xlrec)); ! ! xact_redo_abort(xlrec, record->xl_xid, false); } else if (info == XLOG_XACT_PREPARE) { *************** *** 4330,4345 **** xact_redo(XLogRecPtr lsn, XLogRecord *record) { xl_xact_commit_prepared *xlrec = (xl_xact_commit_prepared *) XLogRecGetData(record); ! xact_redo_commit(&xlrec->crec, xlrec->xid); RemoveTwoPhaseFile(xlrec->xid, false); } else if (info == XLOG_XACT_ABORT_PREPARED) { xl_xact_abort_prepared *xlrec = (xl_xact_abort_prepared *) XLogRecGetData(record); ! xact_redo_abort(&xlrec->arec, xlrec->xid); RemoveTwoPhaseFile(xlrec->xid, false); } else elog(PANIC, "xact_redo: unknown op code %u", info); } --- 5010,5053 ---- { xl_xact_commit_prepared *xlrec = (xl_xact_commit_prepared *) XLogRecGetData(record); ! xact_redo_commit(&xlrec->crec, xlrec->xid, true); RemoveTwoPhaseFile(xlrec->xid, false); } else if (info == XLOG_XACT_ABORT_PREPARED) { xl_xact_abort_prepared *xlrec = (xl_xact_abort_prepared *) XLogRecGetData(record); ! xact_redo_abort(&xlrec->arec, xlrec->xid, true); RemoveTwoPhaseFile(xlrec->xid, false); } + else if (info == XLOG_XACT_ASSIGNMENT) + { + /* + * This is a no-op since RecordKnownAssignedTransactionIds() + * already did all the work on this record for us. + */ + return; + } + else if (info == XLOG_XACT_RUNNING_XACTS) + { + xl_xact_running_xacts *xlrec = (xl_xact_running_xacts *) XLogRecGetData(record); + + /* + * Initialise if we have a valid snapshot to work with + */ + if (TransactionIdIsValid(xlrec->latestRunningXid) && + (!IsRunningXactDataValid() || + TransactionIdPrecedes(latestObservedXid, xlrec->latestRunningXid))) + { + latestObservedXid = xlrec->latestRunningXid; + ShmemVariableCache->latestCompletedXid = xlrec->latestCompletedXid; + elog(trace_recovery(DEBUG1), + "initial snapshot created; latestObservedXid = %d latestCompletedXid = %d", + latestObservedXid, xlrec->latestCompletedXid); + } + + ProcArrayUpdateRecoveryTransactions(lsn, xlrec); + } else elog(PANIC, "xact_redo: unknown op code %u", info); } *************** *** 4349,4358 **** xact_desc_commit(StringInfo buf, xl_xact_commit *xlrec) { int i; appendStringInfoString(buf, timestamptz_to_str(xlrec->xact_time)); if (xlrec->nrels > 0) { ! 
appendStringInfo(buf, "; rels:");
         for (i = 0; i < xlrec->nrels; i++)
         {
             char       *path = relpath(xlrec->xnodes[i], MAIN_FORKNUM);
--- 5057,5075 ----
 {
     int         i;

+     if (XactCompletionUpdateDBFile(xlrec))
+         appendStringInfo(buf, "; update db file");
+
+     if (XactCompletionUpdateAuthFile(xlrec))
+         appendStringInfo(buf, "; update auth file");
+
+     if (XactCompletionRelcacheInitFileInval(xlrec))
+         appendStringInfo(buf, "; relcache init file inval");
+
     appendStringInfoString(buf, timestamptz_to_str(xlrec->xact_time));
     if (xlrec->nrels > 0)
     {
!         appendStringInfo(buf, "; %d rels:", xlrec->nrels);
         for (i = 0; i < xlrec->nrels; i++)
         {
             char       *path = relpath(xlrec->xnodes[i], MAIN_FORKNUM);
***************
*** 4363,4374 **** xact_desc_commit(StringInfo buf, xl_xact_commit *xlrec)
     if (xlrec->nsubxacts > 0)
     {
         TransactionId *xacts = (TransactionId *)
!         &xlrec->xnodes[xlrec->nrels];
!
!         appendStringInfo(buf, "; subxacts:");
         for (i = 0; i < xlrec->nsubxacts; i++)
             appendStringInfo(buf, " %u", xacts[i]);
     }
 }

 static void
--- 5080,5113 ----
     if (xlrec->nsubxacts > 0)
     {
         TransactionId *xacts = (TransactionId *)
!         &xlrec->xnodes[xlrec->nrels];
!         appendStringInfo(buf, "; %d subxacts:", xlrec->nsubxacts);
         for (i = 0; i < xlrec->nsubxacts; i++)
             appendStringInfo(buf, " %u", xacts[i]);
     }
+     if (xlrec->nmsgs > 0)
+     {
+         /*
+          * The invalidation messages are the third variable length array
+          * from the start of the record. The record header has everything
+          * we need to calculate where that starts.
+          */
+         int         offset = OffsetSharedInvalInXactCommit();
+         SharedInvalidationMessage *msgs = (SharedInvalidationMessage *)
+             (((char *) xlrec) + offset);
+
+         appendStringInfo(buf, "; %d inval msgs:", xlrec->nmsgs);
+         for (i = 0; i < xlrec->nmsgs; i++)
+         {
+             SharedInvalidationMessage *msg = msgs + i;
+
+             if (msg->id >= 0)
+                 appendStringInfo(buf, "catcache id%d ", msg->id);
+             else if (msg->id == SHAREDINVALRELCACHE_ID)
+                 appendStringInfo(buf, "relcache ");
+             else if (msg->id == SHAREDINVALSMGR_ID)
+                 appendStringInfo(buf, "smgr ");
+         }
+     }
 }

 static void
***************
*** 4398,4403 **** xact_desc_abort(StringInfo buf, xl_xact_abort *xlrec)
--- 5137,5179 ----
     }
 }

+ static void
+ xact_desc_running_xacts(StringInfo buf, xl_xact_running_xacts *xlrec)
+ {
+     int         xid_index,
+                 subxid_index;
+     TransactionId *subxip = (TransactionId *) &(xlrec->xrun[xlrec->xcnt]);
+
+     appendStringInfo(buf, "nxids %u nsubxids %u latestRunningXid %d",
+                      xlrec->xcnt,
+                      xlrec->subxcnt,
+                      xlrec->latestRunningXid);
+
+     for (xid_index = 0; xid_index < xlrec->xcnt; xid_index++)
+     {
+         RunningXact *rxact = (RunningXact *) xlrec->xrun;
+
+         appendStringInfo(buf, "; xid %d pid %d db %d role %d "
+                          "vacflag %u nsubxids %u offset %d overflowed %s",
+                          rxact[xid_index].xid,
+                          rxact[xid_index].pid,
+                          rxact[xid_index].databaseId,
+                          rxact[xid_index].roleId,
+                          rxact[xid_index].vacuumFlags,
+                          rxact[xid_index].nsubxids,
+                          rxact[xid_index].subx_offset,
+                          (rxact[xid_index].overflowed ?
"t" : "f")); + + if (rxact[xid_index].nsubxids > 0) + { + appendStringInfo(buf, "; subxacts: "); + for (subxid_index = 0; subxid_index < rxact[xid_index].nsubxids; subxid_index++) + appendStringInfo(buf, " %u", + subxip[subxid_index + rxact[xid_index].subx_offset]); + } + } + } + void xact_desc(StringInfo buf, uint8 xl_info, char *rec) { *************** *** 4435,4440 **** xact_desc(StringInfo buf, uint8 xl_info, char *rec) --- 5211,5231 ---- appendStringInfo(buf, "abort %u: ", xlrec->xid); xact_desc_abort(buf, &xlrec->arec); } + else if (info == XLOG_XACT_ASSIGNMENT) + { + xl_xact_assignment *xlrec = (xl_xact_assignment *) rec; + + /* ignore the main xid, it may be Invalid and misleading */ + appendStringInfo(buf, "assignment: xid %u", + xlrec->xassign); + } + else if (info == XLOG_XACT_RUNNING_XACTS) + { + xl_xact_running_xacts *xlrec = (xl_xact_running_xacts *) rec; + + appendStringInfo(buf, "running xacts: "); + xact_desc_running_xacts(buf, xlrec); + } else appendStringInfo(buf, "UNKNOWN"); } *** src/backend/access/transam/xlog.c --- src/backend/access/transam/xlog.c *************** *** 24,29 **** --- 24,30 ---- #include "access/clog.h" #include "access/multixact.h" + #include "access/nbtree.h" #include "access/subtrans.h" #include "access/transam.h" #include "access/tuptoaster.h" *************** *** 43,48 **** --- 44,50 ---- #include "storage/ipc.h" #include "storage/pmsignal.h" #include "storage/procarray.h" + #include "storage/sinval.h" #include "storage/smgr.h" #include "storage/spin.h" #include "utils/builtins.h" *************** *** 50,55 **** --- 52,58 ---- #include "utils/ps_status.h" #include "pg_trace.h" + #define WAL_DEBUG /* File path names (all relative to $PGDATA) */ #define BACKUP_LABEL_FILE "backup_label" *************** *** 69,75 **** bool log_checkpoints = false; int sync_method = DEFAULT_SYNC_METHOD; #ifdef WAL_DEBUG ! bool XLOG_DEBUG = false; #endif /* --- 72,80 ---- int sync_method = DEFAULT_SYNC_METHOD; #ifdef WAL_DEBUG ! bool XLOG_DEBUG_FLUSH = false; ! bool XLOG_DEBUG_BGFLUSH = false; ! bool XLOG_DEBUG_REDO = true; #endif /* *************** *** 114,120 **** CheckpointStatsData CheckpointStats; /* * ThisTimeLineID will be same in all backends --- it identifies current ! * WAL timeline for the database system. */ TimeLineID ThisTimeLineID = 0; --- 119,126 ---- /* * ThisTimeLineID will be same in all backends --- it identifies current ! * WAL timeline for the database system. Zero is always a bug, so we ! * start with that to allow us to spot any errors. */ TimeLineID ThisTimeLineID = 0; *************** *** 122,141 **** TimeLineID ThisTimeLineID = 0; bool InRecovery = false; /* Are we recovering using offline XLOG archives? */ ! static bool InArchiveRecovery = false; /* Was the last xlog file restored from archive, or local? */ static bool restoredFromArchive = false; /* options taken from recovery.conf */ static char *recoveryRestoreCommand = NULL; - static bool recoveryTarget = false; static bool recoveryTargetExact = false; static bool recoveryTargetInclusive = true; static bool recoveryLogRestartpoints = false; static TransactionId recoveryTargetXid; static TimestampTz recoveryTargetTime; static TimestampTz recoveryLastXTime = 0; /* if recoveryStopsHere returns true, it saves actual stop xid/time here */ static TransactionId recoveryStopXid; --- 128,171 ---- bool InRecovery = false; /* Are we recovering using offline XLOG archives? */ ! bool InArchiveRecovery = false; ! ! /* Local copy of shared RecoveryProcessingMode state */ ! 
static bool LocalRecoveryProcessingMode = true; ! static bool knownProcessingMode = false; ! ! /* is the database proven consistent yet? */ ! bool reachedSafeStartPoint = false; /* Was the last xlog file restored from archive, or local? */ static bool restoredFromArchive = false; /* options taken from recovery.conf */ static char *recoveryRestoreCommand = NULL; static bool recoveryTargetExact = false; static bool recoveryTargetInclusive = true; static bool recoveryLogRestartpoints = false; static TransactionId recoveryTargetXid; static TimestampTz recoveryTargetTime; + static int recoveryTargetAdvance = 0; + + /* recovery target modes */ + #define RECOVERY_TARGET_NONE 0 + #define RECOVERY_TARGET_PAUSE_ALL 1 + #define RECOVERY_TARGET_PAUSE_CLEANUP 2 + #define RECOVERY_TARGET_PAUSE_XID 3 + #define RECOVERY_TARGET_PAUSE_TIME 4 + #define RECOVERY_TARGET_ADVANCE 5 + #define RECOVERY_TARGET_STOP_IMMEDIATE 6 + #define RECOVERY_TARGET_STOP_XID 7 + #define RECOVERY_TARGET_STOP_TIME 8 + static int recoveryTargetMode = RECOVERY_TARGET_NONE; + + #define DEFAULT_MAX_STANDBY_DELAY 300 + int maxStandbyDelay = DEFAULT_MAX_STANDBY_DELAY; + static TimestampTz recoveryLastXTime = 0; + static TransactionId recoveryLastXid = InvalidTransactionId; /* if recoveryStopsHere returns true, it saves actual stop xid/time here */ static TransactionId recoveryStopXid; *************** *** 241,250 **** static XLogRecPtr RedoRecPtr; * ControlFileLock: must be held to read/update control file or create * new log file. * ! * CheckpointLock: must be held to do a checkpoint (ensures only one ! * checkpointer at a time; currently, with all checkpoints done by the ! * bgwriter, this is just pro forma). * *---------- */ --- 271,300 ---- * ControlFileLock: must be held to read/update control file or create * new log file. * ! * CheckpointLock: must be held to do a checkpoint or restartpoint, ensuring ! * we get just one of those at any time. In 8.4+ recovery, both startup and ! * bgwriter processes may take restartpoints, so this locking must be strict ! * to ensure there are no mistakes. * + * In 8.4 we progress through a number of states at startup. Initially, the + * postmaster is in PM_STARTUP state and spawns the Startup process. We then + * progress until the database is in a consistent state, then if we are in + * InArchiveRecovery we go into PM_RECOVERY state. The bgwriter then starts + * up and takes over responsibility for performing restartpoints. We then + * progress until the end of recovery when we enter PM_RUN state upon + * termination of the Startup process. In summary: + * + * PM_STARTUP state: Startup process performs restartpoints + * PM_RECOVERY state: bgwriter process performs restartpoints + * PM_RUN state: bgwriter process performs checkpoints + * + * These transitions are fairly delicate, with many things that need to + * happen at the same time in order to change state successfully throughout + * the system. Changing PM_STARTUP to PM_RECOVERY only occurs when we can + * prove the databases are in a consistent state. Changing from PM_RECOVERY + * to PM_RUN happens whenever recovery ends, which could be forced upon us + * externally or it can occur because of damage or termination of the WAL + * sequence. *---------- */ *************** *** 286,296 **** typedef struct XLogCtlWrite --- 336,353 ---- /* * Total shared-memory state for XLOG. 
+ * + * This small structure is accessed by many backends, so we take care to + * pad out the parts of the structure so they can be accessed by separate + * CPUs without causing false sharing cache flushes. Padding is generous + * to allow for a wide variety of CPU architectures. */ + #define XLOGCTL_BUFFER_SPACING 128 typedef struct XLogCtlData { /* Protected by WALInsertLock: */ XLogCtlInsert Insert; + char InsertPadding[XLOGCTL_BUFFER_SPACING - sizeof(XLogCtlInsert)]; /* Protected by info_lck: */ XLogwrtRqst LogwrtRqst; *************** *** 298,306 **** typedef struct XLogCtlData --- 355,370 ---- uint32 ckptXidEpoch; /* nextXID & epoch of latest checkpoint */ TransactionId ckptXid; XLogRecPtr asyncCommitLSN; /* LSN of newest async commit */ + /* add data structure padding for above info_lck declarations */ + char InfoPadding[XLOGCTL_BUFFER_SPACING - sizeof(XLogwrtRqst) + - sizeof(XLogwrtResult) + - sizeof(uint32) + - sizeof(TransactionId) + - sizeof(XLogRecPtr)]; /* Protected by WALWriteLock: */ XLogCtlWrite Write; + char WritePadding[XLOGCTL_BUFFER_SPACING - sizeof(XLogCtlWrite)]; /* * These values do not change after startup, although the pointed-to pages *************** *** 312,317 **** typedef struct XLogCtlData --- 376,412 ---- int XLogCacheBlck; /* highest allocated xlog buffer index */ TimeLineID ThisTimeLineID; + /* + * IsRecoveryProcessingMode shows whether the postmaster is in a + * postmaster state earlier than PM_RUN, or not. This is a globally + * accessible state to allow EXEC_BACKEND case. + * + * We also retain a local state variable InRecovery. InRecovery=true + * means the code is being executed by Startup process and therefore + * always during Recovery Processing Mode. This allows us to identify + * code executed *during* Recovery Processing Mode but not necessarily + * by Startup process itself. 
+ * + * Protected by mode_lck + */ + bool SharedRecoveryProcessingMode; + slock_t mode_lck; + + /* + * recovery target control information + * + * Protected by info_lck + */ + int recoveryTargetMode; + TransactionId recoveryTargetXid; + TimestampTz recoveryTargetTime; + int recoveryTargetAdvance; + + TimestampTz recoveryLastXTime; + TransactionId recoveryLastXid; + + char InfoLockPadding[XLOGCTL_BUFFER_SPACING]; + slock_t info_lck; /* locks shared variables shown above */ } XLogCtlData; *************** *** 398,405 **** static void XLogArchiveCleanup(const char *xlog); --- 493,502 ---- static void readRecoveryCommandFile(void); static void exitArchiveRecovery(TimeLineID endTLI, uint32 endLogId, uint32 endLogSeg); + static void exitRecovery(void); static bool recoveryStopsHere(XLogRecord *record, bool *includeThis); static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags); + static XLogRecPtr GetRedoLocationForCheckpoint(void); static bool XLogCheckBuffer(XLogRecData *rdata, bool doPageWrites, XLogRecPtr *lsn, BkpBlock *bkpb); *************** *** 482,487 **** XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata) --- 579,592 ---- bool updrqst; bool doPageWrites; bool isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH); + bool isRecoveryEnd = (rmid == RM_XLOG_ID && + (info == XLOG_RECOVERY_END || + info == XLOG_CHECKPOINT_ONLINE)); + + /* cross-check on whether we should be here or not */ + if (IsRecoveryProcessingMode() && !isRecoveryEnd) + elog(FATAL, "cannot make new WAL entries during recovery " + "(RMgrId = %d info = %d)", rmid, info); /* info's high bits are reserved for use by me */ if (info & XLR_INFO_MASK) *************** *** 820,825 **** begin:; --- 925,931 ---- record->xl_len = len; /* doesn't include backup blocks */ record->xl_info = info; record->xl_rmid = rmid; + record->xl_parentxid = GetTopTransactionIdIfAny(); /* Now we can finish computing the record's CRC */ COMP_CRC32(rdata_crc, (char *) record + sizeof(pg_crc32), *************** *** 827,851 **** begin:; FIN_CRC32(rdata_crc); record->xl_crc = rdata_crc; - #ifdef WAL_DEBUG - if (XLOG_DEBUG) - { - StringInfoData buf; - - initStringInfo(&buf); - appendStringInfo(&buf, "INSERT @ %X/%X: ", - RecPtr.xlogid, RecPtr.xrecoff); - xlog_outrec(&buf, record); - if (rdata->data != NULL) - { - appendStringInfo(&buf, " - "); - RmgrTable[record->xl_rmid].rm_desc(&buf, record->xl_info, rdata->data); - } - elog(LOG, "%s", buf.data); - pfree(buf.data); - } - #endif - /* Record begin of record in appropriate places */ ProcLastRecPtr = RecPtr; Insert->PrevRecord = RecPtr; --- 933,938 ---- *************** *** 1728,1735 **** XLogFlush(XLogRecPtr record) XLogRecPtr WriteRqstPtr; XLogwrtRqst WriteRqst; ! /* Disabled during REDO */ ! if (InRedo) return; /* Quick exit if already known flushed */ --- 1815,1821 ---- XLogRecPtr WriteRqstPtr; XLogwrtRqst WriteRqst; ! if (IsRecoveryProcessingMode()) return; /* Quick exit if already known flushed */ *************** *** 1737,1743 **** XLogFlush(XLogRecPtr record) return; #ifdef WAL_DEBUG ! if (XLOG_DEBUG) elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X", record.xlogid, record.xrecoff, LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff, --- 1823,1829 ---- return; #ifdef WAL_DEBUG ! 
if (XLOG_DEBUG_FLUSH)
         elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X",
              record.xlogid, record.xrecoff,
              LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
***************
*** 1817,1825 **** XLogFlush(XLogRecPtr record)
      * the bad page is encountered again during recovery then we would be
      * unable to restart the database at all!  (This scenario has actually
      * happened in the field several times with 7.1 releases. Note that we
!      * cannot get here while InRedo is true, but if the bad page is brought in
!      * and marked dirty during recovery then CreateCheckPoint will try to
!      * flush it at the end of recovery.)
      *
      * The current approach is to ERROR under normal conditions, but only
      * WARNING during recovery, so that the system can be brought up even if
--- 1903,1911 ----
      * the bad page is encountered again during recovery then we would be
      * unable to restart the database at all!  (This scenario has actually
      * happened in the field several times with 7.1 releases. Note that we
!      * cannot get here while IsRecoveryProcessingMode(), but if the bad page
!      * is brought in and marked dirty during recovery, a checkpoint performed
!      * at the end of recovery will try to flush it.
      *
      * The current approach is to ERROR under normal conditions, but only
      * WARNING during recovery, so that the system can be brought up even if
***************
*** 1829,1835 **** XLogFlush(XLogRecPtr record)
      * and so we will not force a restart for a bad LSN on a data page.
      */
     if (XLByteLT(LogwrtResult.Flush, record))
!         elog(InRecovery ? WARNING : ERROR,
          "xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
              record.xlogid, record.xrecoff,
              LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
--- 1915,1921 ----
      * and so we will not force a restart for a bad LSN on a data page.
      */
     if (XLByteLT(LogwrtResult.Flush, record))
!         elog(ERROR,
          "xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
              record.xlogid, record.xrecoff,
              LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
***************
*** 1887,1893 **** XLogBackgroundFlush(void)
         return;

 #ifdef WAL_DEBUG
!     if (XLOG_DEBUG)
         elog(LOG, "xlog bg flush request %X/%X; write %X/%X; flush %X/%X",
              WriteRqstPtr.xlogid, WriteRqstPtr.xrecoff,
              LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
--- 1973,1979 ----
         return;

 #ifdef WAL_DEBUG
!     if (XLOG_DEBUG_BGFLUSH)
         elog(LOG, "xlog bg flush request %X/%X; write %X/%X; flush %X/%X",
              WriteRqstPtr.xlogid, WriteRqstPtr.xrecoff,
              LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
***************
*** 2102,2108 **** XLogFileInit(uint32 log, uint32 seg,
         unlink(tmppath);
     }

!     elog(DEBUG2, "done creating and filling new WAL file");

     /* Set flag to tell caller there was no existent file */
     *use_existent = false;
--- 2188,2195 ----
         unlink(tmppath);
     }

!     XLogFileName(tmppath, ThisTimeLineID, log, seg);
!     elog(DEBUG2, "done creating and filling new WAL file %s", tmppath);

     /* Set flag to tell caller there was no existent file */
     *use_existent = false;
***************
*** 2408,2413 **** XLogFileRead(uint32 log, uint32 seg, int emode)
--- 2495,2522 ----
                  xlogfname);
         set_ps_display(activitymsg, false);

+         /*
+          * Calculate and write out a new safeStartPoint. This defines
+          * the latest LSN that might appear on-disk while we apply
+          * the WAL records in this file. If we crash during recovery
+          * we must reach this point again before we can prove
+          * database consistency. Not a restartpoint! Restart points
+          * define where we should start recovery from, if we crash.
+          */
+         if (InArchiveRecovery)
+         {
+             uint32      nextLog = log;
+             uint32      nextSeg = seg;
+
+             NextLogSeg(nextLog, nextSeg);
+
+             LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+             ControlFile->minSafeStartPoint.xlogid = nextLog;
+             ControlFile->minSafeStartPoint.xrecoff = nextSeg * XLogSegSize;
+             UpdateControlFile();
+             LWLockRelease(ControlFileLock);
+         }
+
         return fd;
     }
     if (errno != ENOENT)        /* unexpected failure? */
***************
*** 2920,2925 **** CleanupBackupHistory(void)
--- 3029,3132 ----
     FreeDir(xldir);
 }

+ static void
+ ResolveRedoVisibilityConflicts(XLogRecPtr lsn, XLogRecord *record)
+ {
+     Oid         recDatabaseOid = 0;
+     TransactionId latestRemovedXid = 0;
+
+     RmgrId      rmid = record->xl_rmid;
+     uint8       info = record->xl_info & ~XLR_INFO_MASK;
+
+     if (rmid == RM_HEAP2_ID &&
+         (info == XLOG_HEAP2_CLEAN || info == XLOG_HEAP2_CLEAN_MOVE))
+     {
+         xl_heap_clean *xlrec = (xl_heap_clean *) XLogRecGetData(record);
+
+         latestRemovedXid = xlrec->latestRemovedXid;
+         recDatabaseOid = xlrec->node.dbNode;
+     }
+     else if (rmid == RM_HEAP2_ID && info == XLOG_HEAP2_CLEANUP_INFO)
+     {
+         xl_heap_clean *xlrec = (xl_heap_clean *) XLogRecGetData(record);
+
+         latestRemovedXid = xlrec->latestRemovedXid;
+         recDatabaseOid = xlrec->node.dbNode;
+     }
+     else if (rmid == RM_BTREE_ID && info == XLOG_BTREE_DELETE)
+     {
+         xl_btree_delete *xlrec = (xl_btree_delete *) XLogRecGetData(record);
+
+         latestRemovedXid = xlrec->latestRemovedXid;
+         recDatabaseOid = xlrec->node.dbNode;
+     }
+     else if (rmid == RM_BTREE_ID && info == XLOG_BTREE_VACUUM)
+     {
+         /*
+          * This action never conflicts with queries. Although we have to
+          * use cleanup locks to apply changes made by this record type, we
+          * are only removing tuples that have xids equal to or prior to the
+          * latestRemovedXid of a prior RM_HEAP2_ID record. That is the main
+          * purpose of a XLOG_HEAP2_CLEANUP_INFO record during lazy vacuum.
+          * VACUUM FULL will always have seen a higher latestRemovedXid via
+          * the other record types. So this record is always a no-op here.
+          */
+         return;
+     }
+     else
+         elog(FATAL, "unrecognized cleanup record");
+
+     XactResolveRecoveryConflicts(latestRemovedXid, recDatabaseOid);
+ }
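XactResolveRecoveryConflicts() is where max_standby_delay (parsed in
readRecoveryCommandFile() below) has to become a wait-or-cancel decision.
That function's body is not shown here, so the following is only an
illustrative sketch of the timing test involved, assuming the units used in
the recovery.conf parsing (0 meaning wait forever); the helper name is
hypothetical:

    /* Hypothetical: has a conflicting query held up replay for too long? */
    static bool
    ConflictWaitExceedsMaxStandbyDelay(TimestampTz waitStart)
    {
        long    delay_secs;
        int     delay_usecs;

        /* 0 means wait forever for conflicting queries to finish */
        if (maxStandbyDelay == 0)
            return false;

        TimestampDifference(waitStart, GetCurrentTimestamp(),
                            &delay_secs, &delay_usecs);

        return delay_secs >= (long) maxStandbyDelay;
    }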
+
+ /*
+  * RecordIsCleanupRecord() determines whether or not the record
+  * will remove rows from data blocks. This is important because
+  * applying these records could affect the validity of MVCC snapshots,
+  * so there are various controls over replaying such records.
+  */
+ static bool
+ RecordIsCleanupRecord(XLogRecord *record)
+ {
+     RmgrId      rmid = record->xl_rmid;
+     uint8       info = record->xl_info & ~XLR_INFO_MASK;
+
+     /*
+      * XXX should we implement this as an additional RMgr API call?
+      * We shouldn't presume we know which Rmgrs have cleanup records,
+      * which we do by including access/nbtree.h and calling an Rmgr
+      * specific function directly by name here.
+      */
+     if ((rmid == RM_HEAP2_ID) ||
+         (rmid == RM_BTREE_ID && btree_is_cleanup_record(info)))
+         return true;
+
+     return false;
+ }
+
+ /*
+  * RecordNeedsCleanupLock() determines whether or not the record
+  * requires a cleanup lock when removing rows from data blocks.
+  */
+ static bool
+ RecordNeedsCleanupLock(XLogRecord *record)
+ {
+     RmgrId      rmid = record->xl_rmid;
+     uint8       info = record->xl_info & ~XLR_INFO_MASK;
+
+     /*
+      * XXX should we implement this as an additional RMgr API call?
+      * We shouldn't presume we know which Rmgrs need cleanup locks,
+      * which we do by including access/nbtree.h and calling an Rmgr
+      * specific function directly by name here.
+      */
+     if ((rmid == RM_HEAP2_ID) ||
+         (rmid == RM_BTREE_ID && btree_needs_cleanup_lock(info)))
+         return true;
+
+     return false;
+ }
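Both XXX comments above pull in the same direction: the resource managers
themselves, not xlog.c, could say which records are cleanup records and which
need cleanup locks. A minimal sketch of that idea, assuming two new optional
callbacks were added to RmgrData (the rm_is_cleanup and rm_needs_cleanup_lock
field names are hypothetical; the existing fields are as in rmgr.h):

    typedef struct RmgrData
    {
        const char *rm_name;
        void        (*rm_redo) (XLogRecPtr lsn, XLogRecord *rptr);
        void        (*rm_desc) (StringInfo buf, uint8 xl_info, char *rec);
        void        (*rm_startup) (void);
        void        (*rm_cleanup) (void);
        bool        (*rm_safe_restartpoint) (void);
        /* hypothetical additions; NULL means "never" */
        bool        (*rm_is_cleanup) (uint8 info);
        bool        (*rm_needs_cleanup_lock) (uint8 info);
    } RmgrData;

    static bool
    RecordIsCleanupRecord(XLogRecord *record)
    {
        bool        (*is_cleanup) (uint8 info);

        is_cleanup = RmgrTable[record->xl_rmid].rm_is_cleanup;
        return is_cleanup != NULL &&
               is_cleanup(record->xl_info & ~XLR_INFO_MASK);
    }

That would remove the access/nbtree.h include from xlog.c and keep each
rmgr's record classification next to its redo code.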
 /*
  * Restore the backup blocks present in an XLOG record, if any.
  *
***************
*** 2942,2947 **** RestoreBkpBlocks(XLogRecord *record, XLogRecPtr lsn)
--- 3149,3163 ----
     BkpBlock    bkpb;
     char       *blk;
     int         i;
+     int         lockmode;
+
+     /*
+      * What kind of lock do we need to apply the backup blocks?
+      */
+     if (RecordNeedsCleanupLock(record))
+         lockmode = BUFFER_LOCK_CLEANUP;
+     else
+         lockmode = BUFFER_LOCK_EXCLUSIVE;

     blk = (char *) XLogRecGetData(record) + record->xl_len;
     for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
***************
*** 2953,2959 **** RestoreBkpBlocks(XLogRecord *record, XLogRecPtr lsn)
         blk += sizeof(BkpBlock);

         buffer = XLogReadBufferExtended(bkpb.node, bkpb.fork, bkpb.block,
!                                         RBM_ZERO);
         Assert(BufferIsValid(buffer));
         page = (Page) BufferGetPage(buffer);
--- 3169,3175 ----
         blk += sizeof(BkpBlock);

         buffer = XLogReadBufferExtended(bkpb.node, bkpb.fork, bkpb.block,
!                                         RBM_ZERO, lockmode);
         Assert(BufferIsValid(buffer));
         page = (Page) BufferGetPage(buffer);
***************
*** 4283,4288 **** XLOGShmemInit(void)
--- 4499,4505 ----
     XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
     XLogCtl->Insert.currpage = (XLogPageHeader) (XLogCtl->pages);
     SpinLockInit(&XLogCtl->info_lck);
+     SpinLockInit(&XLogCtl->mode_lck);

     /*
      * If we are not in bootstrap mode, pg_control should already exist. Read
***************
*** 4366,4371 **** BootStrapXLOG(void)
--- 4583,4589 ----
     record->xl_prev.xlogid = 0;
     record->xl_prev.xrecoff = 0;
     record->xl_xid = InvalidTransactionId;
+     record->xl_parentxid = InvalidTransactionId;
     record->xl_tot_len = SizeOfXLogRecord + sizeof(checkPoint);
     record->xl_len = sizeof(checkPoint);
     record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
***************
*** 4549,4555 **** readRecoveryCommandFile(void)
             ereport(LOG,
                     (errmsg("recovery_target_xid = %u",
                             recoveryTargetXid)));
!             recoveryTarget = true;
             recoveryTargetExact = true;
         }
         else if (strcmp(tok1, "recovery_target_time") == 0)
--- 4767,4773 ----
             ereport(LOG,
                     (errmsg("recovery_target_xid = %u",
                             recoveryTargetXid)));
!             recoveryTargetMode = RECOVERY_TARGET_STOP_XID;
             recoveryTargetExact = true;
         }
         else if (strcmp(tok1, "recovery_target_time") == 0)
***************
*** 4560,4566 **** readRecoveryCommandFile(void)
              */
             if (recoveryTargetExact)
                 continue;
!             recoveryTarget = true;
             recoveryTargetExact = false;

             /*
--- 4778,4784 ----
              */
             if (recoveryTargetExact)
                 continue;
!             recoveryTargetMode = RECOVERY_TARGET_STOP_TIME;
             recoveryTargetExact = false;

             /*
***************
*** 4599,4604 **** readRecoveryCommandFile(void)
--- 4817,4842 ----
             ereport(LOG,
                     (errmsg("log_restartpoints = %s", tok2)));
         }
+         else if (strcmp(tok1, "max_standby_delay") == 0)
+         {
+             errno = 0;
+             maxStandbyDelay = (int) strtoul(tok2, NULL, 0);
+             if (errno == EINVAL || errno == ERANGE)
+                 ereport(FATAL,
+                         (errmsg("max_standby_delay is not a valid number: \"%s\"",
+                                 tok2)));
+             /*
+              * 2E6 seconds is about 23 days. Allows us to measure delay in
+              * milliseconds.
+              */
+             if (maxStandbyDelay > INT_MAX || maxStandbyDelay < 0)
+                 ereport(FATAL,
+                         (errmsg("max_standby_delay must be between 0 (wait forever) and 2 000 000 secs")));
+
+             ereport(LOG,
+                     (errmsg("max_standby_delay = %u",
+                             maxStandbyDelay)));
+         }
         else
             ereport(FATAL,
                     (errmsg("unrecognized recovery parameter \"%s\"",
***************
*** 4733,4755 **** exitArchiveRecovery(TimeLineID endTLI, uint32 endLogId, uint32 endLogSeg)
     unlink(recoveryPath);       /* ignore any error */

     /*
!
* Rename the config file out of the way, so that we don't accidentally ! * re-enter archive recovery mode in a subsequent crash. */ - unlink(RECOVERY_COMMAND_DONE); - if (rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE) != 0) - ereport(FATAL, - (errcode_for_file_access(), - errmsg("could not rename file \"%s\" to \"%s\": %m", - RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE))); ereport(LOG, (errmsg("archive recovery complete"))); } /* ! * For point-in-time recovery, this function decides whether we want to ! * stop applying the XLOG at or after the current record. * * Returns TRUE if we are stopping, FALSE otherwise. On TRUE return, * *includeThis is set TRUE if we should apply this record before stopping. --- 4971,5027 ---- unlink(recoveryPath); /* ignore any error */ /* ! * As of 8.4 we no longer rename the recovery.conf file out of the ! * way until after we have performed a full checkpoint. This ensures ! * that any crash between now and the end of the checkpoint does not ! * attempt to restart from a WAL file that is no longer available to us. ! * As soon as we remove recovery.conf we lose our recovery_command and ! * cannot reaccess WAL files from the archive. */ ereport(LOG, (errmsg("archive recovery complete"))); } + #ifdef DEBUG_RECOVERY_CONTROL + static void + LogRecoveryTargetModeInfo(void) + { + int lrecoveryTargetMode; + TransactionId lrecoveryTargetXid; + TimestampTz lrecoveryTargetTime; + int lrecoveryTargetAdvance; + + TimestampTz lrecoveryLastXTime; + TransactionId lrecoveryLastXid; + + { + /* use volatile pointer to prevent code rearrangement */ + volatile XLogCtlData *xlogctl = XLogCtl; + + SpinLockAcquire(&xlogctl->info_lck); + + lrecoveryTargetMode = xlogctl->recoveryTargetMode; + lrecoveryTargetXid = xlogctl->recoveryTargetXid; + lrecoveryTargetTime = xlogctl->recoveryTargetTime; + lrecoveryTargetAdvance = xlogctl->recoveryTargetAdvance; + lrecoveryLastXTime = xlogctl->recoveryLastXTime; + lrecoveryLastXid = xlogctl->recoveryLastXid; + + SpinLockRelease(&xlogctl->info_lck); + } + + elog(LOG, "mode %d xid %u time %s adv %d", + lrecoveryTargetMode, + lrecoveryTargetXid, + timestamptz_to_str(lrecoveryTargetTime), + lrecoveryTargetAdvance); + } + #endif + /* ! * For archive recovery, this function decides whether we want to ! * pause or stop applying the XLOG at or after the current record. * * Returns TRUE if we are stopping, FALSE otherwise. On TRUE return, * *includeThis is set TRUE if we should apply this record before stopping. *************** *** 4762,4833 **** exitArchiveRecovery(TimeLineID endTLI, uint32 endLogId, uint32 endLogSeg) static bool recoveryStopsHere(XLogRecord *record, bool *includeThis) { ! bool stopsHere; ! uint8 record_info; ! TimestampTz recordXtime; ! /* We only consider stopping at COMMIT or ABORT records */ ! if (record->xl_rmid != RM_XACT_ID) ! return false; ! record_info = record->xl_info & ~XLR_INFO_MASK; ! if (record_info == XLOG_XACT_COMMIT) { ! xl_xact_commit *recordXactCommitData; ! recordXactCommitData = (xl_xact_commit *) XLogRecGetData(record); ! recordXtime = recordXactCommitData->xact_time; ! } ! else if (record_info == XLOG_XACT_ABORT) ! { ! xl_xact_abort *recordXactAbortData; ! recordXactAbortData = (xl_xact_abort *) XLogRecGetData(record); ! recordXtime = recordXactAbortData->xact_time; ! } ! else ! return false; ! /* Do we have a PITR target at all? */ ! if (!recoveryTarget) ! { ! recoveryLastXTime = recordXtime; ! return false; } ! if (recoveryTargetExact) { /* ! * there can be only one transaction end record with this exact ! 
* transactionid
!      *
!      * when testing for an xid, we MUST test for equality only, since
!      * transactions are numbered in the order they start, not the order
!      * they complete. A higher numbered xid will complete before you about
!      * 50% of the time...
!      */
!         stopsHere = (record->xl_xid == recoveryTargetXid);
!         if (stopsHere)
!             *includeThis = recoveryTargetInclusive;
!     }
!     else
!     {
         /*
!          * there can be many transactions that share the same commit time, so
!          * we stop after the last one, if we are inclusive, or stop at the
!          * first one if we are exclusive
          */
!         if (recoveryTargetInclusive)
!             stopsHere = (recordXtime > recoveryTargetTime);
!         else
!             stopsHere = (recordXtime >= recoveryTargetTime);
!         if (stopsHere)
!             *includeThis = false;
     }

     if (stopsHere)
     {
         recoveryStopXid = record->xl_xid;
!         recoveryStopTime = recordXtime;
         recoveryStopAfter = *includeThis;

         if (record_info == XLOG_XACT_COMMIT)
--- 5034,5276 ----
 static bool
 recoveryStopsHere(XLogRecord *record, bool *includeThis)
 {
!     bool        stopsHere = false;
!     bool        pauseHere = false;
!     bool        paused = false;
!     uint8       record_info = 0;    /* valid iff (is_xact_completion_record) */
!     TimestampTz recordXtime = 0;
!     bool        is_xact_completion_record = false;

!     /* We only consider stopping at COMMIT or ABORT records */
!     if (record->xl_rmid == RM_XACT_ID)
     {
!         record_info = record->xl_info & ~XLR_INFO_MASK;
!         if (record_info == XLOG_XACT_COMMIT)
!         {
!             xl_xact_commit *recordXactCommitData;

!             recordXactCommitData = (xl_xact_commit *) XLogRecGetData(record);
!             recordXtime = recordXactCommitData->xact_time;
!             is_xact_completion_record = true;
!         }
!         else if (record_info == XLOG_XACT_ABORT)
!         {
!             xl_xact_abort *recordXactAbortData;

!             recordXactAbortData = (xl_xact_abort *) XLogRecGetData(record);
!             recordXtime = recordXactAbortData->xact_time;
!             is_xact_completion_record = true;
!         }

!         /* Remember the most recent COMMIT/ABORT time for logging purposes */
!         if (is_xact_completion_record)
!         {
!             recoveryLastXTime = recordXtime;
!             recoveryLastXid = record->xl_xid;
!         }
     }

!     do
     {
+         int         prevRecoveryTargetMode = recoveryTargetMode;
+
         /*
!          * Let's see if user has updated our recoveryTargetMode.
          */
!         {
!             /* use volatile pointer to prevent code rearrangement */
!             volatile XLogCtlData *xlogctl = XLogCtl;
!
!             SpinLockAcquire(&xlogctl->info_lck);
!             recoveryTargetMode = xlogctl->recoveryTargetMode;
!             if (recoveryTargetMode != RECOVERY_TARGET_NONE)
!             {
!                 recoveryTargetXid = xlogctl->recoveryTargetXid;
!                 recoveryTargetTime = xlogctl->recoveryTargetTime;
!                 recoveryTargetAdvance = xlogctl->recoveryTargetAdvance;
!             }
!             if (is_xact_completion_record)
!             {
!                 xlogctl->recoveryLastXTime = recordXtime;
!                 xlogctl->recoveryLastXid = record->xl_xid;
!             }
!             SpinLockRelease(&xlogctl->info_lck);
!         }
!
!         /*
!          * If we're paused and the mode has changed, reset to allow the new
!          * settings to apply, and maybe allow us to continue.
!          */
!         if (paused && prevRecoveryTargetMode != recoveryTargetMode)
!             paused = false;
!
!         /* Decide how to act on any pause target */
!         switch (recoveryTargetMode)
!         {
!             case RECOVERY_TARGET_NONE:
!                 /*
!                  * If we aren't paused and we're not looking to stop,
!                  * just exit out quickly and get on with recovery.
!                  */
!                 if (paused)
!                     ereport(LOG,
!                             (errmsg("recovery restarting")));
!                 return false;
!
!             case RECOVERY_TARGET_PAUSE_ALL:
!                 pauseHere = true;
!                 break;
!
!             case RECOVERY_TARGET_ADVANCE:
!                 if (paused)
!                 {
!                     if (recoveryTargetAdvance > 0)
!                         return false;
!                 }
!                 else if (recoveryTargetAdvance-- <= 0)
!                     pauseHere = true;
!                 break;
!
!             case RECOVERY_TARGET_STOP_IMMEDIATE:
!             case RECOVERY_TARGET_STOP_XID:
!             case RECOVERY_TARGET_STOP_TIME:
!                 paused = false;
!                 break;
!
!             case RECOVERY_TARGET_PAUSE_CLEANUP:
!                 /*
!                  * Advance until we see a cleanup record, then pause.
!                  */
!                 if (RecordIsCleanupRecord(record))
!                     pauseHere = true;
!                 break;
!
!             case RECOVERY_TARGET_PAUSE_XID:
!                 /*
!                  * there can be only one transaction end record with this
!                  * exact transactionid
!                  *
!                  * when testing for an xid, we MUST test for equality only,
!                  * since transactions are numbered in the order they start,
!                  * not the order they complete. A higher numbered xid will
!                  * complete before you about 50% of the time...
!                  */
!                 if (is_xact_completion_record)
!                     pauseHere = (record->xl_xid == recoveryTargetXid);
!                 break;
!
!             case RECOVERY_TARGET_PAUSE_TIME:
!                 /*
!                  * there can be many transactions that share the same commit
!                  * time, so we pause after the last one, if we are inclusive,
!                  * or pause at the first one if we are exclusive
!                  */
!                 if (is_xact_completion_record)
!                 {
!                     if (recoveryTargetInclusive)
!                         pauseHere = (recoveryLastXTime > recoveryTargetTime);
!                     else
!                         pauseHere = (recoveryLastXTime >= recoveryTargetTime);
!                 }
!                 break;
!
!             default:
!                 ereport(WARNING,
!                         (errmsg("unknown recovery mode %d, continuing recovery",
!                                 recoveryTargetMode)));
!                 return false;
!         }
!
!         if (pauseHere && !paused)
!         {
!             if (is_xact_completion_record)
!             {
!                 if (record_info == XLOG_XACT_COMMIT)
!                     ereport(LOG,
!                             (errmsg("recovery pausing before commit of transaction %u, time %s",
!                                     record->xl_xid,
!                                     timestamptz_to_str(recoveryLastXTime))));
!                 else
!                     ereport(LOG,
!                             (errmsg("recovery pausing before abort of transaction %u, time %s",
!                                     record->xl_xid,
!                                     timestamptz_to_str(recoveryLastXTime))));
!             }
!             else
!                 ereport(LOG,
!                         (errmsg("recovery pausing; last completed transaction %u, time %s",
!                                 recoveryLastXid,
!                                 timestamptz_to_str(recoveryLastXTime))));
!
!             set_ps_display("recovery paused", false);
!
!             paused = true;
!         }

!         /*
!          * Pause for a while before rechecking mode at top of loop.
          */
!         if (paused)
!             pg_usleep(200000L);
!
!         /*
!          * We leave the loop at the bottom only if our recovery mode is
!          * set (or has been recently reset) to one of the stop options.
!          */
!     } while (paused);
!
!     /*
!      * Decide how to act if stop target mode set. We run this separately from
!      * pause to allow user to reset their stop target while paused.
!      */
!     switch (recoveryTargetMode)
!     {
!         case RECOVERY_TARGET_STOP_IMMEDIATE:
!             ereport(LOG,
!                     (errmsg("recovery stopping immediately")));
!             return true;
!
!         case RECOVERY_TARGET_STOP_XID:
!             /*
!              * there can be only one transaction end record with this exact
!              * transactionid
!              *
!              * when testing for an xid, we MUST test for equality only, since
!              * transactions are numbered in the order they start, not the
!              * order they complete. A higher numbered xid will complete
!              * before you about 50% of the time...
!              */
!             if (is_xact_completion_record)
!             {
!                 stopsHere = (record->xl_xid == recoveryTargetXid);
!                 if (stopsHere)
!                     *includeThis = recoveryTargetInclusive;
!             }
!             break;
!
!         case RECOVERY_TARGET_STOP_TIME:
!             /*
!              * there can be many transactions that share the same commit
!              * time, so we stop after the last one, if we are inclusive, or
!              * stop at the first one if we are exclusive
!              */
!             if (is_xact_completion_record)
!             {
!                 if (recoveryTargetInclusive)
!                     stopsHere = (recoveryLastXTime > recoveryTargetTime);
!                 else
!                     stopsHere = (recoveryLastXTime >= recoveryTargetTime);
!                 if (stopsHere)
!                     *includeThis = false;
!             }
!             break;
     }

     if (stopsHere)
     {
+         Assert(is_xact_completion_record);
         recoveryStopXid = record->xl_xid;
!
recoveryStopTime = recoveryLastXTime;
         recoveryStopAfter = *includeThis;

         if (record_info == XLOG_XACT_COMMIT)
***************
*** 4856,4869 **** recoveryStopsHere(XLogRecord *record, bool *includeThis)
                             recoveryStopXid,
                             timestamptz_to_str(recoveryStopTime))));
         }
!
!         if (recoveryStopAfter)
!             recoveryLastXTime = recordXtime;
     }
     else
!         recoveryLastXTime = recordXtime;
!
     return stopsHere;
 }

 /*
--- 5299,5496 ----
                             recoveryStopXid,
                             timestamptz_to_str(recoveryStopTime))));
         }
+     }

!     return stopsHere;
! }
!
! /*
!  * Utility function used by various user functions to set the recovery
!  * target mode. This allows user control over the progress of recovery.
!  */
! static void
! SetRecoveryTargetMode(int mode, TransactionId xid, TimestampTz ts, int advance)
! {
!     if (!superuser())
!         ereport(ERROR,
!                 (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
!                  errmsg("must be superuser to control recovery")));
!
!     if (!IsRecoveryProcessingMode())
!         ereport(ERROR,
!                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
!                  errmsg("recovery is not in progress"),
!                  errhint("WAL control functions can only be executed during recovery.")));
!
!     {
!         /* use volatile pointer to prevent code rearrangement */
!         volatile XLogCtlData *xlogctl = XLogCtl;
!
!         SpinLockAcquire(&xlogctl->info_lck);
!         xlogctl->recoveryTargetMode = mode;
!
!         if (mode == RECOVERY_TARGET_STOP_XID ||
!             mode == RECOVERY_TARGET_PAUSE_XID)
!             xlogctl->recoveryTargetXid = xid;
!         else if (mode == RECOVERY_TARGET_STOP_TIME ||
!                  mode == RECOVERY_TARGET_PAUSE_TIME)
!             xlogctl->recoveryTargetTime = ts;
!         else if (mode == RECOVERY_TARGET_ADVANCE)
!             xlogctl->recoveryTargetAdvance = advance;
!
!         SpinLockRelease(&xlogctl->info_lck);
     }
+
+     return;
+ }
+
+ /*
+  * Resets the recovery target mode so that recovery continues normally.
+  * Returns void.
+  */
+ Datum
+ pg_recovery_continue(PG_FUNCTION_ARGS)
+ {
+     SetRecoveryTargetMode(RECOVERY_TARGET_NONE, InvalidTransactionId, 0, 0);
+
+     PG_RETURN_VOID();
+ }
+
+ /*
+  * Pause recovery immediately. Stays paused until asked to play again.
+  * Returns void.
+  */
+ Datum
+ pg_recovery_pause(PG_FUNCTION_ARGS)
+ {
+     SetRecoveryTargetMode(RECOVERY_TARGET_PAUSE_ALL, InvalidTransactionId, 0, 0);
+
+     PG_RETURN_VOID();
+ }
+
+ /*
+  * Pause recovery at the next cleanup record. Stays paused until asked to
+  * play again.
+  */
+ Datum
+ pg_recovery_pause_cleanup(PG_FUNCTION_ARGS)
+ {
+     SetRecoveryTargetMode(RECOVERY_TARGET_PAUSE_CLEANUP, InvalidTransactionId, 0, 0);
+
+     PG_RETURN_VOID();
+ }
+
+ /*
+  * Pause recovery at stated xid, if ever seen. Once paused, stays paused
+  * until asked to play again.
+  */
+ Datum
+ pg_recovery_pause_xid(PG_FUNCTION_ARGS)
+ {
+     int         xidi = PG_GETARG_INT32(0);
+     TransactionId xid = (TransactionId) xidi;
+
+     if (xid < 3)
+         elog(ERROR, "cannot specify special values for transaction id");
+
+     SetRecoveryTargetMode(RECOVERY_TARGET_PAUSE_XID, xid, 0, 0);
+
+     PG_RETURN_VOID();
+ }
+
+ /*
+  * Pause recovery at stated timestamp, if ever reached. Once paused, stays
+  * paused until asked to play again.
+  */
+ Datum
+ pg_recovery_pause_time(PG_FUNCTION_ARGS)
+ {
+     TimestampTz ts = PG_GETARG_TIMESTAMPTZ(0);
+
+     SetRecoveryTargetMode(RECOVERY_TARGET_PAUSE_TIME, InvalidTransactionId, ts, 0);
+
+     PG_RETURN_VOID();
+ }
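The xid < 3 guard in pg_recovery_pause_xid() above spells out the three
reserved xids (InvalidTransactionId, BootstrapTransactionId and
FrozenTransactionId) numerically. The existing TransactionIdIsNormal() macro
from access/transam.h expresses the same test by name; an equivalent
formulation would be:

    if (!TransactionIdIsNormal(xid))
        elog(ERROR, "cannot specify special values for transaction id");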
+
+ /*
+  * If paused, advance N records.
+  */
+ Datum
+ pg_recovery_advance(PG_FUNCTION_ARGS)
+ {
+     int         adv = PG_GETARG_INT32(0);
+
+     if (adv < 1)
+         elog(ERROR, "recovery advance must be greater than or equal to 1");
+
+     SetRecoveryTargetMode(RECOVERY_TARGET_ADVANCE, InvalidTransactionId, 0, adv);
+
+     PG_RETURN_VOID();
+ }
+
+ /*
+  * Forces recovery to stop now if paused, or at end of next record if playing.
+  */
+ Datum
+ pg_recovery_stop(PG_FUNCTION_ARGS)
+ {
+     SetRecoveryTargetMode(RECOVERY_TARGET_STOP_IMMEDIATE, InvalidTransactionId, 0, 0);
+
+     PG_RETURN_VOID();
+ }
+
+ /*
+  * Returns true if recovery is still in progress.
+  */
+ Datum
+ pg_is_in_recovery(PG_FUNCTION_ARGS)
+ {
+     PG_RETURN_BOOL(IsRecoveryProcessingMode());
+ }
+
+ /*
+  * Returns timestamp of last completed transaction
+  */
+ Datum
+ pg_last_completed_xact_timestamp(PG_FUNCTION_ARGS)
+ {
+     PG_RETURN_TIMESTAMPTZ(recoveryLastXTime);
+ }
+
+ /*
+  * Returns delay in milliseconds, or -1 if delay too large
+  */
+ int
+ GetLatestReplicationDelay(void)
+ {
+     long        delay_secs;
+     int         delay_usecs;
+     int         delay;
+     TimestampTz currTz = GetCurrentTimestamp();
+
+     TimestampDifference(recoveryLastXTime, currTz,
+                         &delay_secs, &delay_usecs);
+
+     /*
+      * If delay is very large we probably aren't looking at
+      * a replication situation at all, just a recovery from backup.
+      * So return a special value instead.
+      */
+     if (delay_secs > (long) (INT_MAX / 1000))
+         delay = -1;
     else
!         delay = (int) (delay_secs * 1000) + (delay_usecs / 1000);

!     return delay;
! }
!
! /*
!  * Returns xid of last completed transaction
!  */
! Datum
! pg_last_completed_xid(PG_FUNCTION_ARGS)
! {
!     PG_RETURN_INT32(recoveryLastXid);
 }

 /*
***************
*** 4876,4881 **** StartupXLOG(void)
--- 5503,5509 ----
     CheckPoint  checkPoint;
     bool        wasShutdown;
     bool        reachedStopPoint = false;
+     bool        performedRecovery = false;
     bool        haveBackupLabel = false;
     XLogRecPtr  RecPtr,
                 LastRec,
***************
*** 4888,4893 **** StartupXLOG(void)
--- 5516,5523 ----
     uint32      freespace;
     TransactionId oldestActiveXID;

+     XLogCtl->SharedRecoveryProcessingMode = true;
+
     /*
      * Read control file and check XLOG status looks valid.
      *
***************
*** 5108,5116 **** StartupXLOG(void)
--- 5738,5752 ----
         if (minRecoveryLoc.xlogid != 0 || minRecoveryLoc.xrecoff != 0)
             ControlFile->minRecoveryPoint = minRecoveryLoc;
         ControlFile->time = (pg_time_t) time(NULL);
+         /* No need to hold ControlFileLock yet, we aren't up far enough */
         UpdateControlFile();

         /*
+          * Reset pgstat data, because it may be invalid after recovery.
+          */
+         pgstat_reset_all();
+
+         /*
          * If there was a backup label file, it's done its job and the info
          * has now been propagated into pg_control. We must get rid of the
          * label file so that if we crash during recovery, we'll pick up at
***************
*** 5167,5173 **** StartupXLOG(void)
         do
         {
 #ifdef WAL_DEBUG
!             if (XLOG_DEBUG)
             {
                 StringInfoData buf;
--- 5803,5814 ----
         do
         {
 #ifdef WAL_DEBUG
!             int         loglevel = DEBUG3;
!
!             if (rmid == RM_XACT_ID)
!                 loglevel = DEBUG2;
!
!             if (loglevel >= trace_recovery_messages)
             {
                 StringInfoData buf;
***************
*** 5210,5215 **** StartupXLOG(void)
--- 5851,5875 ----
                 TransactionIdAdvance(ShmemVariableCache->nextXid);
             }

+             if (InArchiveRecovery)
+             {
+                 /*
+                  * Make sure the incoming transaction is emulated as running
+                  * prior to allowing any changes made by it to touch data.
+                  */
+                 RecordKnownAssignedTransactionIds(EndRecPtr, record);
+
+                 /*
+                  * Wait, kill or otherwise resolve any conflicts between
+                  * incoming cleanup records and user queries.
This is the
+                  * main barrier that allows MVCC to work correctly when
+                  * running standby servers. Only need to do this if there
+                  * is a possibility that users may be active.
+                  */
+                 if (reachedSafeStartPoint && RecordIsCleanupRecord(record))
+                     ResolveRedoVisibilityConflicts(EndRecPtr, record);
+             }
+
             if (record->xl_info & XLR_BKP_BLOCK_MASK)
                 RestoreBkpBlocks(record, EndRecPtr);
***************
*** 5220,5225 **** StartupXLOG(void)
--- 5880,5920 ----

             LastRec = ReadRecPtr;

+             /*
+              * Can we signal Postmaster to enter consistent recovery mode?
+              *
+              * There are two points in the log that we must pass. The first
+              * is minRecoveryPoint, which is the LSN at the time the
+              * base backup was taken that we are about to roll forward from.
+              * If recovery has ever crashed or been stopped, there is also a
+              * second point: minSafeStartPoint, the latest LSN that recovery
+              * could have reached prior to the crash.
+              *
+              * We must also have assembled sufficient information about
+              * transaction state to allow valid snapshots to be taken.
+              *
+              * XXX: Shouldn't we call StartupSUBTRANS() and the other
+              * startup functions like we do below, before letting
+              * anyone in?
+              */
+             if (!reachedSafeStartPoint &&
+                 IsRunningXactDataValid() &&
+                 XLByteLE(ControlFile->minSafeStartPoint, EndRecPtr) &&
+                 XLByteLE(ControlFile->minRecoveryPoint, EndRecPtr))
+             {
+                 reachedSafeStartPoint = true;
+                 if (InArchiveRecovery)
+                 {
+                     ereport(LOG,
+                             (errmsg("database has now reached consistent state at %X/%X",
+                                     EndRecPtr.xlogid, EndRecPtr.xrecoff)));
+                     InitRecoveryTransactionEnvironment();
+                     StartCleanupDelayStats();
+                     if (IsUnderPostmaster)
+                         SendPostmasterSignal(PMSIGNAL_RECOVERY_START);
+                 }
+             }
+
             record = ReadRecord(NULL, LOG);
         } while (record != NULL && recoveryContinue);
***************
*** 5241,5246 **** StartupXLOG(void)
--- 5936,5942 ----
             /* there are no WAL records following the checkpoint */
             ereport(LOG,
                     (errmsg("redo is not required")));
+             reachedSafeStartPoint = true;
         }
     }
***************
*** 5254,5269 **** StartupXLOG(void)
     /*
      * Complain if we did not roll forward far enough to render the backup
!      * dump consistent.
      */
!     if (XLByteLT(EndOfLog, ControlFile->minRecoveryPoint))
     {
         if (reachedStopPoint)   /* stopped because of stop request */
             ereport(FATAL,
                     (errmsg("requested recovery stop point is before end time of backup dump")));
         else    /* ran off end of WAL */
             ereport(FATAL,
!                     (errmsg("WAL ends before end time of backup dump")));
     }

     /*
--- 5950,5965 ----
     /*
      * Complain if we did not roll forward far enough to render the backup
!      * dump consistent and start safely.
      */
!     if (InArchiveRecovery && !reachedSafeStartPoint)
     {
         if (reachedStopPoint)   /* stopped because of stop request */
             ereport(FATAL,
                     (errmsg("requested recovery stop point is before end time of backup dump")));
         else    /* ran off end of WAL */
             ereport(FATAL,
!                     (errmsg("end of WAL reached before end time of backup dump")));
     }

     /*
***************
*** 5378,5416 **** StartupXLOG(void)
         XLogCheckInvalidPages();

         /*
!          * Reset pgstat data, because it may be invalid after recovery.
          */
!         pgstat_reset_all();

!         /*
!          * Perform a checkpoint to update all our recovery activity to disk.
!          *
!          * Note that we write a shutdown checkpoint rather than an on-line
!          * one. This is not particularly critical, but since we may be
!          * assigning a new TLI, using a shutdown checkpoint allows us to have
!          * the rule that TLI only changes in shutdown checkpoints, which
!          * allows some extra error checking in xlog_redo.
!          */
!
CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
     }

-     /*
-      * Preallocate additional log files, if wanted.
-      */
-     PreallocXlogFiles(EndOfLog);
-
-     /*
-      * Okay, we're officially UP.
-      */
-     InRecovery = false;
-
-     ControlFile->state = DB_IN_PRODUCTION;
-     ControlFile->time = (pg_time_t) time(NULL);
-     UpdateControlFile();
-
-     /* start the archive_timeout timer running */
-     XLogCtl->Write.lastSegSwitchTime = ControlFile->time;
-
     /* initialize shared-memory copy of latest checkpoint XID/epoch */
     XLogCtl->ckptXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
     XLogCtl->ckptXid = ControlFile->checkPointCopy.nextXid;
--- 6074,6087 ----
         XLogCheckInvalidPages();

         /*
!          * Finally exit recovery and mark that in WAL. Pre-8.4 we wrote
!          * a shutdown checkpoint here, but we ask bgwriter to do that now.
          */
!         exitRecovery();
!         performedRecovery = true;
     }

     /* initialize shared-memory copy of latest checkpoint XID/epoch */
     XLogCtl->ckptXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
     XLogCtl->ckptXid = ControlFile->checkPointCopy.nextXid;
***************
*** 5419,5424 **** StartupXLOG(void)
--- 6090,6099 ----
     ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
     TransactionIdRetreat(ShmemVariableCache->latestCompletedXid);

+     /* Shut down the recovery environment. Must be done in this order. */
+     ProcArrayClearRecoveryTransactions();
+     RelationClearRecoveryLocks();
+
     /* Start up the commit log and related stuff, too */
     StartupCLOG();
     StartupSUBTRANS(oldestActiveXID);
***************
*** 5444,5449 **** StartupXLOG(void)
--- 6119,6219 ----
         readRecordBuf = NULL;
         readRecordBufSize = 0;
     }
+
+     /*
+      * Prior to 8.4 we wrote a Shutdown Checkpoint at the end of recovery.
+      * This could add minutes to the startup time, so we want bgwriter
+      * to perform it. This then frees the Startup process to complete so we
+      * can allow transactions and WAL inserts. We still write a checkpoint,
+      * but it will be an online checkpoint. Online checkpoints have a redo
+      * location that can be prior to the actual checkpoint record. So we want
+      * to derive that redo location *before* we let anybody else write WAL,
+      * otherwise we might miss some WAL records if we crash.
+      */
+     if (performedRecovery)
+     {
+         XLogRecPtr  redo;
+
+         /*
+          * We must grab the pointer before anybody writes WAL
+          */
+         redo = GetRedoLocationForCheckpoint();
+
+         /*
+          * Set up information for the bgwriter, but if it is not active
+          * for whatever reason, perform the checkpoint ourselves.
+          */
+         if (SetRedoLocationForArchiveCheckpoint(redo))
+         {
+             /*
+              * Okay, we can come up now. Allow others to write WAL.
+              */
+             XLogCtl->SharedRecoveryProcessingMode = false;
+
+             /*
+              * Now request checkpoint from bgwriter.
+              */
+             RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_IMMEDIATE);
+         }
+         else
+         {
+             /*
+              * Startup process performs the checkpoint, but defers
+              * the change in processing mode until afterwards.
+              */
+             CreateCheckPoint(CHECKPOINT_FORCE | CHECKPOINT_IMMEDIATE);
+         }
+     }
+     else
+     {
+         /*
+          * No recovery, so let's just get on with it.
+          */
+         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+         ControlFile->state = DB_IN_PRODUCTION;
+         ControlFile->time = (pg_time_t) time(NULL);
+         UpdateControlFile();
+         LWLockRelease(ControlFileLock);
+     }
+
+     /*
+      * Okay, we can come up now. Allow others to write WAL.
+ */ + XLogCtl->SharedRecoveryProcessingMode = false; + + /* start the archive_timeout timer running */ + XLogCtl->Write.lastSegSwitchTime = (pg_time_t) time(NULL); + } + + /* + * IsRecoveryProcessingMode() + * + * Fast test for whether we're still in recovery or not. We test the shared + * state each time only until we leave recovery mode. After that we never + * look again, relying upon the settings of our local state variables. This + * is designed to avoid the need for a separate initialisation step. + */ + bool + IsRecoveryProcessingMode(void) + { + if (knownProcessingMode && !LocalRecoveryProcessingMode) + return false; + + { + /* use volatile pointer to prevent code rearrangement */ + volatile XLogCtlData *xlogctl = XLogCtl; + + if (xlogctl == NULL) + return false; + + SpinLockAcquire(&xlogctl->mode_lck); + LocalRecoveryProcessingMode = XLogCtl->SharedRecoveryProcessingMode; + SpinLockRelease(&xlogctl->mode_lck); + } + + knownProcessingMode = true; + + return LocalRecoveryProcessingMode; } /* *************** *** 5701,5720 **** ShutdownXLOG(int code, Datum arg) static void LogCheckpointStart(int flags) { ! elog(LOG, "checkpoint starting:%s%s%s%s%s%s", ! (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "", ! (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "", ! (flags & CHECKPOINT_FORCE) ? " force" : "", ! (flags & CHECKPOINT_WAIT) ? " wait" : "", ! (flags & CHECKPOINT_CAUSE_XLOG) ? " xlog" : "", ! (flags & CHECKPOINT_CAUSE_TIME) ? " time" : ""); } /* * Log end of a checkpoint. */ static void ! LogCheckpointEnd(void) { long write_secs, sync_secs, --- 6471,6494 ---- static void LogCheckpointStart(int flags) { ! if (flags & CHECKPOINT_RESTARTPOINT) ! elog(LOG, "restartpoint starting:%s", ! (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : ""); ! else ! elog(LOG, "checkpoint starting:%s%s%s%s%s%s", ! (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "", ! (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "", ! (flags & CHECKPOINT_FORCE) ? " force" : "", ! (flags & CHECKPOINT_WAIT) ? " wait" : "", ! (flags & CHECKPOINT_CAUSE_XLOG) ? " xlog" : "", ! (flags & CHECKPOINT_CAUSE_TIME) ? " time" : ""); } /* * Log end of a checkpoint. */ static void ! LogCheckpointEnd(int flags) { long write_secs, sync_secs, *************** *** 5737,5753 **** LogCheckpointEnd(void) CheckpointStats.ckpt_sync_end_t, &sync_secs, &sync_usecs); ! elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); " ! "%d transaction log file(s) added, %d removed, %d recycled; " ! "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s", ! CheckpointStats.ckpt_bufs_written, ! (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers, ! CheckpointStats.ckpt_segs_added, ! CheckpointStats.ckpt_segs_removed, ! CheckpointStats.ckpt_segs_recycled, ! write_secs, write_usecs / 1000, ! sync_secs, sync_usecs / 1000, ! total_secs, total_usecs / 1000); } /* --- 6511,6536 ---- CheckpointStats.ckpt_sync_end_t, &sync_secs, &sync_usecs); ! if (flags & CHECKPOINT_RESTARTPOINT) ! elog(LOG, "restartpoint complete: wrote %d buffers (%.1f%%); " ! "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s", ! CheckpointStats.ckpt_bufs_written, ! (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers, ! write_secs, write_usecs / 1000, ! sync_secs, sync_usecs / 1000, ! total_secs, total_usecs / 1000); ! else ! elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); " ! "%d transaction log file(s) added, %d removed, %d recycled; " ! "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s", ! CheckpointStats.ckpt_bufs_written, ! 
(double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers, ! CheckpointStats.ckpt_segs_added, ! CheckpointStats.ckpt_segs_removed, ! CheckpointStats.ckpt_segs_recycled, ! write_secs, write_usecs / 1000, ! sync_secs, sync_usecs / 1000, ! total_secs, total_usecs / 1000); } /* *************** *** 5772,5788 **** CreateCheckPoint(int flags) XLogRecPtr recptr; XLogCtlInsert *Insert = &XLogCtl->Insert; XLogRecData rdata; - uint32 freespace; uint32 _logId; uint32 _logSeg; TransactionId *inCommitXids; int nInCommit; /* * Acquire CheckpointLock to ensure only one checkpoint happens at a time. ! * (This is just pro forma, since in the present system structure there is ! * only one process that is allowed to issue checkpoints at any given ! * time.) */ LWLockAcquire(CheckpointLock, LW_EXCLUSIVE); --- 6555,6570 ---- XLogRecPtr recptr; XLogCtlInsert *Insert = &XLogCtl->Insert; XLogRecData rdata; uint32 _logId; uint32 _logSeg; TransactionId *inCommitXids; int nInCommit; + bool leavingArchiveRecovery = false; /* * Acquire CheckpointLock to ensure only one checkpoint happens at a time. ! * That shouldn't be happening, but checkpoints are an important aspect ! * of our resilience, so we take no chances. */ LWLockAcquire(CheckpointLock, LW_EXCLUSIVE); *************** *** 5797,5811 **** CreateCheckPoint(int flags) --- 6579,6602 ---- CheckpointStats.ckpt_start_t = GetCurrentTimestamp(); /* + * Find out if this is the first checkpoint after archive recovery. + */ + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + leavingArchiveRecovery = (ControlFile->state == DB_IN_ARCHIVE_RECOVERY); + LWLockRelease(ControlFileLock); + + /* * Use a critical section to force system panic if we have trouble. */ START_CRIT_SECTION(); if (shutdown) { + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); ControlFile->state = DB_SHUTDOWNING; ControlFile->time = (pg_time_t) time(NULL); UpdateControlFile(); + LWLockRelease(ControlFileLock); } /* *************** *** 5861,5901 **** CreateCheckPoint(int flags) } } ! /* ! * Compute new REDO record ptr = location of next XLOG record. ! * ! * NB: this is NOT necessarily where the checkpoint record itself will be, ! * since other backends may insert more XLOG records while we're off doing ! * the buffer flush work. Those XLOG records are logically after the ! * checkpoint, even though physically before it. Got that? ! */ ! freespace = INSERT_FREESPACE(Insert); ! if (freespace < SizeOfXLogRecord) ! { ! (void) AdvanceXLInsertBuffer(false); ! /* OK to ignore update return flag, since we will do flush anyway */ ! freespace = INSERT_FREESPACE(Insert); ! } ! INSERT_RECPTR(checkPoint.redo, Insert, Insert->curridx); ! ! /* ! * Here we update the shared RedoRecPtr for future XLogInsert calls; this ! * must be done while holding the insert lock AND the info_lck. ! * ! * Note: if we fail to complete the checkpoint, RedoRecPtr will be left ! * pointing past where it really needs to point. This is okay; the only ! * consequence is that XLogInsert might back up whole buffers that it ! * didn't really need to. We can't postpone advancing RedoRecPtr because ! * XLogInserts that happen while we are dumping buffers must assume that ! * their buffer changes are not included in the checkpoint. ! */ { ! /* use volatile pointer to prevent code rearrangement */ ! volatile XLogCtlData *xlogctl = XLogCtl; ! ! SpinLockAcquire(&xlogctl->info_lck); ! RedoRecPtr = xlogctl->Insert.RedoRecPtr = checkPoint.redo; ! SpinLockRelease(&xlogctl->info_lck); } /* --- 6652,6670 ---- } } ! if (leavingArchiveRecovery) ! 
checkPoint.redo = GetRedoLocationForArchiveCheckpoint();
! else
 {
! /*
! * Compute new REDO record ptr = location of next XLOG record.
! *
! * NB: this is NOT necessarily where the checkpoint record itself will be,
! * since other backends may insert more XLOG records while we're off doing
! * the buffer flush work. Those XLOG records are logically after the
! * checkpoint, even though physically before it. Got that?
! */
! checkPoint.redo = GetRedoLocationForCheckpoint();
 }
 /*
*************** *** 6013,6023 **** CreateCheckPoint(int flags)
 XLByteToSeg(ControlFile->checkPointCopy.redo, _logId, _logSeg);
 /*
! * Update the control file.
 */
 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
 if (shutdown)
 ControlFile->state = DB_SHUTDOWNED;
 ControlFile->prevCheckPoint = ControlFile->checkPoint;
 ControlFile->checkPoint = ProcLastRecPtr;
 ControlFile->checkPointCopy = checkPoint;
--- 6782,6799 ----
 XLByteToSeg(ControlFile->checkPointCopy.redo, _logId, _logSeg);
 /*
! * Update the control file. In 8.4, this routine becomes the primary
! * point for recording changes of state in the control file at the
! * end of recovery. Postmaster state already shows us being in
! * normal running mode, but it is only after this point that we
! * are completely free of having to re-perform recovery if we crash.
! * Note that this is executed by bgwriter after the Startup process
! * has exited.
 */
 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
 if (shutdown)
 ControlFile->state = DB_SHUTDOWNED;
+ else
+ ControlFile->state = DB_IN_PRODUCTION;
 ControlFile->prevCheckPoint = ControlFile->checkPoint;
 ControlFile->checkPoint = ProcLastRecPtr;
 ControlFile->checkPointCopy = checkPoint;
*************** *** 6025,6030 **** CreateCheckPoint(int flags)
--- 6801,6821 ----
 UpdateControlFile();
 LWLockRelease(ControlFileLock);
+ if (leavingArchiveRecovery)
+ {
+ /*
+ * Rename the config file out of the way, so that we don't accidentally
+ * re-enter archive recovery mode in a subsequent crash. Prior to
+ * 8.4 this step was performed at end of exitArchiveRecovery().
+ */
+ unlink(RECOVERY_COMMAND_DONE);
+ if (rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE) != 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not rename file \"%s\" to \"%s\": %m",
+ RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE)));
+ }
+
 /* Update shared-memory copy of checkpoint XID/epoch */
 {
 /* use volatile pointer to prevent code rearrangement */
*************** *** 6068,6082 **** CreateCheckPoint(int flags)
 * Truncate pg_subtrans if possible. We can throw away all data before
 * the oldest XMIN of any running transaction. No future transaction will
 * attempt to reference any pg_subtrans entry older than that (see Asserts
! * in subtrans.c). During recovery, though, we mustn't do this because
! * StartupSUBTRANS hasn't been called yet.
 */
! if (!InRecovery)
 TruncateSUBTRANS(GetOldestXmin(true, false));
 /* All real work is done, but log before releasing lock. */
 if (log_checkpoints)
! LogCheckpointEnd();
 TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
 NBuffers,
 CheckpointStats.ckpt_segs_added,
--- 6859,6872 ----
 * Truncate pg_subtrans if possible. We can throw away all data before
 * the oldest XMIN of any running transaction. No future transaction will
 * attempt to reference any pg_subtrans entry older than that (see Asserts
! * in subtrans.c).
 */
! if (!shutdown)
 TruncateSUBTRANS(GetOldestXmin(true, false));
 /* All real work is done, but log before releasing lock. */
 if (log_checkpoints)
!
LogCheckpointEnd(flags); TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written, NBuffers, CheckpointStats.ckpt_segs_added, *************** *** 6084,6089 **** CreateCheckPoint(int flags) --- 6874,6935 ---- CheckpointStats.ckpt_segs_recycled); LWLockRelease(CheckpointLock); + + /* + * Take a snapshot of running transactions and write this to WAL. + * This allows us to reconstruct the state of running transactions + * during archive recovery, if required. + * + * If we are shutting down, or Startup process is completing crash + * recovery we don't need to write running xact data. + */ + if (!shutdown && !IsRecoveryProcessingMode()) + LogCurrentRunningXacts(); + } + + /* + * GetRedoLocationForCheckpoint() + * + * When !IsRecoveryProcessingMode() this must be called while holding + * WALInsertLock(). + */ + static XLogRecPtr + GetRedoLocationForCheckpoint() + { + XLogCtlInsert *Insert = &XLogCtl->Insert; + uint32 freespace; + XLogRecPtr redo; + + freespace = INSERT_FREESPACE(Insert); + if (freespace < SizeOfXLogRecord) + { + (void) AdvanceXLInsertBuffer(false); + /* OK to ignore update return flag, since we will do flush anyway */ + freespace = INSERT_FREESPACE(Insert); + } + INSERT_RECPTR(redo, Insert, Insert->curridx); + + /* + * Here we update the shared RedoRecPtr for future XLogInsert calls; this + * must be done while holding the insert lock AND the info_lck. + * + * Note: if we fail to complete the checkpoint, RedoRecPtr will be left + * pointing past where it really needs to point. This is okay; the only + * consequence is that XLogInsert might back up whole buffers that it + * didn't really need to. We can't postpone advancing RedoRecPtr because + * XLogInserts that happen while we are dumping buffers must assume that + * their buffer changes are not included in the checkpoint. + */ + { + /* use volatile pointer to prevent code rearrangement */ + volatile XLogCtlData *xlogctl = XLogCtl; + + SpinLockAcquire(&xlogctl->info_lck); + RedoRecPtr = xlogctl->Insert.RedoRecPtr = redo; + SpinLockRelease(&xlogctl->info_lck); + } + + return redo; } /* *************** *** 6142,6148 **** RecoveryRestartPoint(const CheckPoint *checkPoint) if (RmgrTable[rmid].rm_safe_restartpoint != NULL) if (!(RmgrTable[rmid].rm_safe_restartpoint())) { ! elog(DEBUG2, "RM %d not safe to record restart point at %X/%X", rmid, checkPoint->redo.xlogid, checkPoint->redo.xrecoff); --- 6988,6994 ---- if (RmgrTable[rmid].rm_safe_restartpoint != NULL) if (!(RmgrTable[rmid].rm_safe_restartpoint())) { ! elog(trace_recovery(DEBUG2), "RM %d not safe to record restart point at %X/%X", rmid, checkPoint->redo.xlogid, checkPoint->redo.xrecoff); *************** *** 6150,6180 **** RecoveryRestartPoint(const CheckPoint *checkPoint) } } /* ! * OK, force data out to disk */ ! CheckPointGuts(checkPoint->redo, CHECKPOINT_IMMEDIATE); /* ! * Update pg_control so that any subsequent crash will restart from this ! * checkpoint. Note: ReadRecPtr gives the XLOG address of the checkpoint ! * record itself. */ - ControlFile->prevCheckPoint = ControlFile->checkPoint; - ControlFile->checkPoint = ReadRecPtr; - ControlFile->checkPointCopy = *checkPoint; - ControlFile->time = (pg_time_t) time(NULL); - UpdateControlFile(); ereport((recoveryLogRestartpoints ? LOG : DEBUG2), ! (errmsg("recovery restart point at %X/%X", ! checkPoint->redo.xlogid, checkPoint->redo.xrecoff))); ! if (recoveryLastXTime) ! ereport((recoveryLogRestartpoints ? LOG : DEBUG2), ! (errmsg("last completed transaction was at log time %s", ! 
timestamptz_to_str(recoveryLastXTime))));
! }
 /*
 * Write a NEXTOID log record
 */
--- 6996,7068 ----
 }
 }
+ RequestRestartPoint(ReadRecPtr, checkPoint, reachedSafeStartPoint);
+ }
+
+ /*
+ * As of 8.4, RestartPoints are always created by the bgwriter
+ * once we have reachedSafeStartPoint. We use bgwriter's shared memory
+ * area wherever we call it from, to keep better code structure.
+ */
+ void
+ CreateRestartPoint(const XLogRecPtr ReadPtr, const CheckPoint *restartPoint, int flags)
+ {
+ if (recoveryLogRestartpoints || log_checkpoints)
+ {
+ /*
+ * Prepare to accumulate statistics.
+ */
+
+ MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
+ CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
+
+ LogCheckpointStart(CHECKPOINT_RESTARTPOINT | flags);
+ }
+
+ /*
+ * Acquire CheckpointLock to ensure only one restartpoint happens at a time.
+ * We rely on this lock to ensure that the startup process doesn't exit
+ * Recovery while we are halfway through a restartpoint.
+ */
+ LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
+
+ CheckPointGuts(restartPoint->redo, CHECKPOINT_RESTARTPOINT | flags);
+
 /*
! * Update pg_control, using current time
 */
! LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
! ControlFile->prevCheckPoint = ControlFile->checkPoint;
! ControlFile->checkPoint = ReadPtr;
! ControlFile->checkPointCopy = *restartPoint;
! ControlFile->time = (pg_time_t) time(NULL);
! UpdateControlFile();
! LWLockRelease(ControlFileLock);
 /*
! * Currently, there is no need to truncate pg_subtrans during recovery.
! * If we did do that, we would need to have called StartupSUBTRANS()
! * already and then TruncateSUBTRANS() would go here.
 */
+ /* All real work is done, but log before releasing lock. */
+ if (recoveryLogRestartpoints || log_checkpoints)
+ LogCheckpointEnd(CHECKPOINT_RESTARTPOINT);
+
 ereport((recoveryLogRestartpoints ? LOG : DEBUG2),
! (errmsg("recovery restart point at %X/%X",
! restartPoint->redo.xlogid, restartPoint->redo.xrecoff)));
!
! ReportCleanupDelayStats();
!
! if (recoveryLastXTime)
! ereport((recoveryLogRestartpoints ? LOG : DEBUG2),
! (errmsg("last completed transaction was at log time %s",
! timestamptz_to_str(recoveryLastXTime))));
+ LWLockRelease(CheckpointLock);
+ }
+
 /*
 * Write a NEXTOID log record
 */
*************** *** 6237,6243 **** RequestXLogSwitch(void)
 }
 /*
! * XLOG resource manager's routines
 */
 void
 xlog_redo(XLogRecPtr lsn, XLogRecord *record)
--- 7125,7187 ----
 }
 /*
! * exitRecovery()
! *
! * Exit recovery state and write an XLOG_RECOVERY_END record. This is the
! * only record type that can record a change of timelineID. We assume
! * caller has already set ThisTimeLineID, if appropriate.
! */
! static void
! exitRecovery(void)
! {
! XLogRecData rdata;
!
! rdata.buffer = InvalidBuffer;
! rdata.data = (char *) (&ThisTimeLineID);
! rdata.len = sizeof(TimeLineID);
! rdata.next = NULL;
!
! /*
! * If a restartpoint is in progress, we will not be able to successfully
! * acquire CheckpointLock. If bgwriter is still working on it, send
! * a second signal to nudge bgwriter to go faster so we can avoid delay.
! * Then wait for the lock, so we know the restartpoint has completed. We
! * do this because we don't want to interrupt the restartpoint halfway
! * through, which might leave us in a mess and we want to be robust. We're
! * going to checkpoint soon anyway, so it's not wasted effort.
! */
! if (LWLockConditionalAcquire(CheckpointLock, LW_EXCLUSIVE))
! LWLockRelease(CheckpointLock);
! else
! {
! RequestRestartPointCompletion();
!
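/*
! * The nudge works as follows (per this patch): RequestRestartPointCompletion()
! * sends bgwriter a second SIGINT, so CheckpointWriteDelay() stops
! * napping and the restartpoint completes at full speed; we then block
! * on CheckpointLock below until it has finished.
! */
!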
ereport(trace_recovery(DEBUG1), ! (errmsg("startup process waiting for restartpoint to complete"))); ! LWLockAcquire(CheckpointLock, LW_EXCLUSIVE); ! LWLockRelease(CheckpointLock); ! } ! ! /* ! * This is the only type of WAL message that can be inserted during ! * recovery. This ensures that we don't allow others to get access ! * until after we have changed state. ! */ ! (void) XLogInsert(RM_XLOG_ID, XLOG_RECOVERY_END, &rdata); ! ! /* ! * We don't XLogFlush() here otherwise we'll end up zeroing the WAL ! * file ourselves. So just let bgwriter's forthcoming checkpoint do ! * that for us. ! */ ! ! InRecovery = false; ! } ! ! /* ! * XLOG resource manager's routines. ! * ! * Definitions of message info are in include/catalog/pg_control.h, ! * though not all messages relate to control file processing. */ void xlog_redo(XLogRecPtr lsn, XLogRecord *record) *************** *** 6267,6293 **** xlog_redo(XLogRecPtr lsn, XLogRecord *record) MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset); /* ControlFile->checkPointCopy always tracks the latest ckpt XID */ ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch; ControlFile->checkPointCopy.nextXid = checkPoint.nextXid; ! /* ! * TLI may change in a shutdown checkpoint, but it shouldn't decrease */ - if (checkPoint.ThisTimeLineID != ThisTimeLineID) - { - if (checkPoint.ThisTimeLineID < ThisTimeLineID || - !list_member_int(expectedTLIs, - (int) checkPoint.ThisTimeLineID)) - ereport(PANIC, - (errmsg("unexpected timeline ID %u (after %u) in checkpoint record", - checkPoint.ThisTimeLineID, ThisTimeLineID))); - /* Following WAL records should be run with new TLI */ - ThisTimeLineID = checkPoint.ThisTimeLineID; - } RecoveryRestartPoint(&checkPoint); } else if (info == XLOG_CHECKPOINT_ONLINE) { CheckPoint checkPoint; --- 7211,7262 ---- MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset); + /* We know nothing was running on the master at this point */ + ProcArrayClearRecoveryTransactions(); + RelationClearRecoveryLocks(); + /* ControlFile->checkPointCopy always tracks the latest ckpt XID */ ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch; ControlFile->checkPointCopy.nextXid = checkPoint.nextXid; ! /* ! * TLI no longer changes at shutdown checkpoint, since as of 8.4, ! * shutdown checkpoints only occur at shutdown. Much less confusing. */ RecoveryRestartPoint(&checkPoint); } + else if (info == XLOG_RECOVERY_END) + { + TimeLineID tli; + + memcpy(&tli, XLogRecGetData(record), sizeof(TimeLineID)); + + /* We know nothing was running on the master at this point */ + ProcArrayClearRecoveryTransactions(); + RelationClearRecoveryLocks(); + + /* + * TLI may change when recovery ends, but it shouldn't decrease. + * + * This is the only WAL record that can tell us to change timelineID + * while we process WAL records. + * + * We can *choose* to stop recovery at any point, generating a + * new timelineID which is recorded using this record type. 
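+ * For example, a server that recovers along timeline 1 and stops at a
+ * recovery target generates a new timeline 2; any standby replaying
+ * this WAL learns of the switch from this record rather than from a
+ * shutdown checkpoint, as in previous releases.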
+ */ + if (tli != ThisTimeLineID) + { + if (tli < ThisTimeLineID || + !list_member_int(expectedTLIs, + (int) tli)) + ereport(PANIC, + (errmsg("unexpected timeline ID %u (after %u) at recovery end record", + tli, ThisTimeLineID))); + /* Following WAL records should be run with new TLI */ + ThisTimeLineID = tli; + } + } else if (info == XLOG_CHECKPOINT_ONLINE) { CheckPoint checkPoint; *************** *** 6309,6315 **** xlog_redo(XLogRecPtr lsn, XLogRecord *record) ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch; ControlFile->checkPointCopy.nextXid = checkPoint.nextXid; ! /* TLI should not change in an on-line checkpoint */ if (checkPoint.ThisTimeLineID != ThisTimeLineID) ereport(PANIC, (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record", --- 7278,7284 ---- ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch; ControlFile->checkPointCopy.nextXid = checkPoint.nextXid; ! /* TLI must not change at a checkpoint */ if (checkPoint.ThisTimeLineID != ThisTimeLineID) ereport(PANIC, (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record", *************** *** 6377,6382 **** xlog_outrec(StringInfo buf, XLogRecord *record) --- 7346,7355 ---- record->xl_prev.xlogid, record->xl_prev.xrecoff, record->xl_xid); + appendStringInfo(buf, "; pxid %u len %u", + record->xl_parentxid, + record->xl_len); + for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++) { if (record->xl_info & XLR_SET_BKP_BLOCK(i)) *************** *** 6545,6550 **** pg_start_backup(PG_FUNCTION_ARGS) --- 7518,7529 ---- errhint("archive_command must be defined before " "online backups can be made safely."))); + if (IsRecoveryProcessingMode()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is in progress"), + errhint("WAL control functions cannot be executed during recovery."))); + backupidstr = text_to_cstring(backupid); /* *************** *** 6710,6715 **** pg_stop_backup(PG_FUNCTION_ARGS) --- 7689,7700 ---- errmsg("WAL archiving is not active"), errhint("archive_mode must be enabled at server start."))); + if (IsRecoveryProcessingMode()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is in progress"), + errhint("WAL control functions cannot be executed during recovery."))); + /* * OK to clear forcePageWrites */ *************** *** 6865,6870 **** pg_switch_xlog(PG_FUNCTION_ARGS) --- 7850,7861 ---- (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), (errmsg("must be superuser to switch transaction log files")))); + if (IsRecoveryProcessingMode()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is in progress"), + errhint("WAL control functions cannot be executed during recovery."))); + switchpoint = RequestXLogSwitch(); /* *************** *** 6887,6892 **** pg_current_xlog_location(PG_FUNCTION_ARGS) --- 7878,7889 ---- { char location[MAXFNAMELEN]; + if (IsRecoveryProcessingMode()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is in progress"), + errhint("WAL control functions cannot be executed during recovery."))); + /* Make sure we have an up-to-date local LogwrtResult */ { /* use volatile pointer to prevent code rearrangement */ *************** *** 6914,6919 **** pg_current_xlog_insert_location(PG_FUNCTION_ARGS) --- 7911,7922 ---- XLogRecPtr current_recptr; char location[MAXFNAMELEN]; + if (IsRecoveryProcessingMode()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is in progress"), + 
errhint("WAL control functions cannot be executed during recovery."))); + /* * Get the current end-of-WAL position ... shared lock is sufficient */ *** src/backend/access/transam/xlogutils.c --- src/backend/access/transam/xlogutils.c *************** *** 227,233 **** Buffer XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init) { return XLogReadBufferExtended(rnode, MAIN_FORKNUM, blkno, ! init ? RBM_ZERO : RBM_NORMAL); } /* --- 227,240 ---- XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init) { return XLogReadBufferExtended(rnode, MAIN_FORKNUM, blkno, ! init ? RBM_ZERO : RBM_NORMAL, BUFFER_LOCK_EXCLUSIVE); ! } ! ! Buffer ! XLogReadBufferForCleanup(RelFileNode rnode, BlockNumber blkno, bool init) ! { ! return XLogReadBufferExtended(rnode, MAIN_FORKNUM, blkno, ! init ? RBM_ZERO : RBM_NORMAL, BUFFER_LOCK_CLEANUP); } /* *************** *** 254,260 **** XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init) */ Buffer XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum, ! BlockNumber blkno, ReadBufferMode mode) { BlockNumber lastblock; Buffer buffer; --- 261,267 ---- */ Buffer XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum, ! BlockNumber blkno, ReadBufferMode mode, int lockmode) { BlockNumber lastblock; Buffer buffer; *************** *** 306,312 **** XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum, Assert(BufferGetBlockNumber(buffer) == blkno); } ! LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); if (mode == RBM_NORMAL) { --- 313,324 ---- Assert(BufferGetBlockNumber(buffer) == blkno); } ! if (lockmode == BUFFER_LOCK_EXCLUSIVE) ! LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); ! else if (lockmode == BUFFER_LOCK_CLEANUP) ! LockBufferForCleanup(buffer); ! else ! elog(FATAL, "Invalid buffer lock mode %d", lockmode); if (mode == RBM_NORMAL) { *** src/backend/bootstrap/bootstrap.c --- src/backend/bootstrap/bootstrap.c *************** *** 35,40 **** --- 35,41 ---- #include "storage/bufmgr.h" #include "storage/ipc.h" #include "storage/proc.h" + #include "storage/sinvaladt.h" #include "tcop/tcopprot.h" #include "utils/builtins.h" #include "utils/flatfiles.h" *** src/backend/commands/dbcommands.c --- src/backend/commands/dbcommands.c *************** *** 1976,1981 **** dbase_redo(XLogRecPtr lsn, XLogRecord *record) --- 1976,1986 ---- * We don't need to copy subdirectories */ copydir(src_path, dst_path, false); + + /* + * Flat files are updated immediately following transaction commit. + * Nothing to do here. + */ } else if (info == XLOG_DBASE_DROP) { *************** *** 1998,2003 **** dbase_redo(XLogRecPtr lsn, XLogRecord *record) --- 2003,2012 ---- ereport(WARNING, (errmsg("some useless files may be left behind in old database directory \"%s\"", dst_path))); + /* + * Flat files are updated immediately following transaction commit. + * Nothing to do here. + */ } else elog(PANIC, "dbase_redo: unknown op code %u", info); *** src/backend/commands/discard.c --- src/backend/commands/discard.c *************** *** 65,71 **** DiscardAll(bool isTopLevel) ResetAllOptions(); DropAllPreparedStatements(); PortalHashTableDeleteAll(); ! Async_UnlistenAll(); LockReleaseAll(USER_LOCKMETHOD, true); ResetPlanCache(); ResetTempTableNamespace(); --- 65,72 ---- ResetAllOptions(); DropAllPreparedStatements(); PortalHashTableDeleteAll(); ! if (!IsRecoveryProcessingMode()) ! 
Async_UnlistenAll();
 LockReleaseAll(USER_LOCKMETHOD, true);
 ResetPlanCache();
 ResetTempTableNamespace();
*** src/backend/commands/indexcmds.c
--- src/backend/commands/indexcmds.c
*************** *** 648,654 **** DefineIndex(RangeVar *heapRelation,
 * Also, GetCurrentVirtualXIDs never reports our own vxid, so we need not
 * check for that.
 */
! old_snapshots = GetCurrentVirtualXIDs(snapshot->xmax, false,
 PROC_IS_AUTOVACUUM | PROC_IN_VACUUM);
 while (VirtualTransactionIdIsValid(*old_snapshots))
--- 648,654 ----
 * Also, GetCurrentVirtualXIDs never reports our own vxid, so we need not
 * check for that.
 */
! old_snapshots = GetCurrentVirtualXIDs(snapshot->xmax, MyDatabaseId,
 PROC_IS_AUTOVACUUM | PROC_IN_VACUUM);
 while (VirtualTransactionIdIsValid(*old_snapshots))
*** src/backend/commands/lockcmds.c
--- src/backend/commands/lockcmds.c
*************** *** 49,54 **** LockTableCommand(LockStmt *lockstmt)
--- 49,66 ----
 */
 reloid = RangeVarGetRelid(relation, false);
+ /*
+ * During recovery we only accept these variations:
+ *
+ * LOCK TABLE foo -- parser translates as AccessExclusiveLock request
+ * LOCK TABLE foo IN AccessShareLock MODE
+ * LOCK TABLE foo IN AccessExclusiveLock MODE
+ */
+ if (IsRecoveryProcessingMode() &&
+ !(lockstmt->mode == AccessShareLock ||
+ lockstmt->mode == AccessExclusiveLock))
+ PreventCommandDuringRecovery();
+
 if (lockstmt->mode == AccessShareLock)
 aclresult = pg_class_aclcheck(reloid, GetUserId(),
 ACL_SELECT);
*** src/backend/commands/sequence.c
--- src/backend/commands/sequence.c
*************** *** 457,462 **** nextval_internal(Oid relid)
--- 457,464 ----
 rescnt = 0;
 bool logit = false;
+ PreventCommandDuringRecovery();
+
 /* open and AccessShareLock sequence */
 init_sequence(relid, &elm, &seqrel);
*** src/backend/commands/vacuum.c
--- src/backend/commands/vacuum.c
*************** *** 140,145 **** typedef struct VRelStats
--- 140,146 ----
 /* vtlinks array for tuple chain following - sorted by new_tid */
 int num_vtlinks;
 VTupleLink vtlinks;
+ TransactionId latestRemovedXid;
 } VRelStats;
 /*----------------------------------------------------------------------
*************** *** 223,229 **** static void scan_heap(VRelStats *vacrelstats, Relation onerel,
 static void repair_frag(VRelStats *vacrelstats, Relation onerel,
 VacPageList vacuum_pages,
 VacPageList fraged_pages,
 int nindexes, Relation *Irel);
! static void move_chain_tuple(Relation rel,
 Buffer old_buf, Page old_page, HeapTuple old_tup,
 Buffer dst_buf, Page dst_page, VacPage dst_vacpage,
 ExecContext ec, ItemPointer ctid, bool cleanVpd);
--- 224,230 ----
 static void repair_frag(VRelStats *vacrelstats, Relation onerel,
 VacPageList vacuum_pages,
 VacPageList fraged_pages,
 int nindexes, Relation *Irel);
! static void move_chain_tuple(VRelStats *vacrelstats, Relation rel,
 Buffer old_buf, Page old_page, HeapTuple old_tup,
 Buffer dst_buf, Page dst_page, VacPage dst_vacpage,
 ExecContext ec, ItemPointer ctid, bool cleanVpd);
*************** *** 236,242 **** static void update_hint_bits(Relation rel, VacPageList fraged_pages,
 int num_moved);
 static void vacuum_heap(VRelStats *vacrelstats, Relation onerel,
 VacPageList vacpagelist);
! static void vacuum_page(Relation onerel, Buffer buffer, VacPage vacpage);
 static void vacuum_index(VacPageList vacpagelist, Relation indrel,
 double num_tuples, int keep_tuples);
 static void scan_index(Relation indrel, double num_tuples);
--- 237,243 ----
 int num_moved);
 static void vacuum_heap(VRelStats *vacrelstats, Relation onerel,
 VacPageList vacpagelist);
!
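/*
! * Note: vacuum_page() and move_chain_tuple() now take VRelStats so that
! * vacrelstats->latestRemovedXid can be passed down to log_heap_clean(),
! * allowing standby servers to resolve visibility conflicts correctly.
! */
!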
static void vacuum_page(VRelStats *vacrelstats, Relation onerel, Buffer buffer, VacPage vacpage); static void vacuum_index(VacPageList vacpagelist, Relation indrel, double num_tuples, int keep_tuples); static void scan_index(Relation indrel, double num_tuples); *************** *** 1238,1243 **** full_vacuum_rel(Relation onerel, VacuumStmt *vacstmt) --- 1239,1245 ---- vacrelstats->rel_tuples = 0; vacrelstats->rel_indexed_tuples = 0; vacrelstats->hasindex = false; + vacrelstats->latestRemovedXid = InvalidTransactionId; /* scan the heap */ vacuum_pages.num_pages = fraged_pages.num_pages = 0; *************** *** 1641,1646 **** scan_heap(VRelStats *vacrelstats, Relation onerel, --- 1643,1651 ---- { ItemId lpp; + HeapTupleHeaderAdvanceLatestRemovedXid(tuple.t_data, + &vacrelstats->latestRemovedXid); + /* * Here we are building a temporary copy of the page with dead * tuples removed. Below we will apply *************** *** 1954,1960 **** repair_frag(VRelStats *vacrelstats, Relation onerel, /* there are dead tuples on this page - clean them */ Assert(!isempty); LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); ! vacuum_page(onerel, buf, last_vacuum_page); LockBuffer(buf, BUFFER_LOCK_UNLOCK); } else --- 1959,1965 ---- /* there are dead tuples on this page - clean them */ Assert(!isempty); LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); ! vacuum_page(vacrelstats, onerel, buf, last_vacuum_page); LockBuffer(buf, BUFFER_LOCK_UNLOCK); } else *************** *** 2443,2449 **** repair_frag(VRelStats *vacrelstats, Relation onerel, tuple.t_data = (HeapTupleHeader) PageGetItem(Cpage, Citemid); tuple_len = tuple.t_len = ItemIdGetLength(Citemid); ! move_chain_tuple(onerel, Cbuf, Cpage, &tuple, dst_buffer, dst_page, destvacpage, &ec, &Ctid, vtmove[ti].cleanVpd); --- 2448,2454 ---- tuple.t_data = (HeapTupleHeader) PageGetItem(Cpage, Citemid); tuple_len = tuple.t_len = ItemIdGetLength(Citemid); ! move_chain_tuple(vacrelstats, onerel, Cbuf, Cpage, &tuple, dst_buffer, dst_page, destvacpage, &ec, &Ctid, vtmove[ti].cleanVpd); *************** *** 2529,2535 **** repair_frag(VRelStats *vacrelstats, Relation onerel, dst_page = BufferGetPage(dst_buffer); /* if this page was not used before - clean it */ if (!PageIsEmpty(dst_page) && dst_vacpage->offsets_used == 0) ! vacuum_page(onerel, dst_buffer, dst_vacpage); } else LockBuffer(dst_buffer, BUFFER_LOCK_EXCLUSIVE); --- 2534,2540 ---- dst_page = BufferGetPage(dst_buffer); /* if this page was not used before - clean it */ if (!PageIsEmpty(dst_page) && dst_vacpage->offsets_used == 0) ! vacuum_page(vacrelstats, onerel, dst_buffer, dst_vacpage); } else LockBuffer(dst_buffer, BUFFER_LOCK_EXCLUSIVE); *************** *** 2706,2712 **** repair_frag(VRelStats *vacrelstats, Relation onerel, LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); page = BufferGetPage(buf); if (!PageIsEmpty(page)) ! vacuum_page(onerel, buf, *curpage); UnlockReleaseBuffer(buf); } } --- 2711,2717 ---- LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); page = BufferGetPage(buf); if (!PageIsEmpty(page)) ! vacuum_page(vacrelstats, onerel, buf, *curpage); UnlockReleaseBuffer(buf); } } *************** *** 2842,2848 **** repair_frag(VRelStats *vacrelstats, Relation onerel, recptr = log_heap_clean(onerel, buf, NULL, 0, NULL, 0, unused, uncnt, ! false); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); } --- 2847,2853 ---- recptr = log_heap_clean(onerel, buf, NULL, 0, NULL, 0, unused, uncnt, ! 
vacrelstats->latestRemovedXid, false); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); } *************** *** 2892,2898 **** repair_frag(VRelStats *vacrelstats, Relation onerel, * already too long and almost unreadable. */ static void ! move_chain_tuple(Relation rel, Buffer old_buf, Page old_page, HeapTuple old_tup, Buffer dst_buf, Page dst_page, VacPage dst_vacpage, ExecContext ec, ItemPointer ctid, bool cleanVpd) --- 2897,2903 ---- * already too long and almost unreadable. */ static void ! move_chain_tuple(VRelStats *vacrelstats, Relation rel, Buffer old_buf, Page old_page, HeapTuple old_tup, Buffer dst_buf, Page dst_page, VacPage dst_vacpage, ExecContext ec, ItemPointer ctid, bool cleanVpd) *************** *** 2948,2954 **** move_chain_tuple(Relation rel, int sv_offsets_used = dst_vacpage->offsets_used; dst_vacpage->offsets_used = 0; ! vacuum_page(rel, dst_buf, dst_vacpage); dst_vacpage->offsets_used = sv_offsets_used; } --- 2953,2959 ---- int sv_offsets_used = dst_vacpage->offsets_used; dst_vacpage->offsets_used = 0; ! vacuum_page(vacrelstats, rel, dst_buf, dst_vacpage); dst_vacpage->offsets_used = sv_offsets_used; } *************** *** 3272,3278 **** vacuum_heap(VRelStats *vacrelstats, Relation onerel, VacPageList vacuum_pages) buf = ReadBufferExtended(onerel, MAIN_FORKNUM, (*vacpage)->blkno, RBM_NORMAL, vac_strategy); LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); ! vacuum_page(onerel, buf, *vacpage); UnlockReleaseBuffer(buf); } } --- 3277,3283 ---- buf = ReadBufferExtended(onerel, MAIN_FORKNUM, (*vacpage)->blkno, RBM_NORMAL, vac_strategy); LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); ! vacuum_page(vacrelstats, onerel, buf, *vacpage); UnlockReleaseBuffer(buf); } } *************** *** 3302,3308 **** vacuum_heap(VRelStats *vacrelstats, Relation onerel, VacPageList vacuum_pages) * Caller must hold pin and lock on buffer. */ static void ! vacuum_page(Relation onerel, Buffer buffer, VacPage vacpage) { Page page = BufferGetPage(buffer); int i; --- 3307,3313 ---- * Caller must hold pin and lock on buffer. */ static void ! vacuum_page(VRelStats *vacrelstats, Relation onerel, Buffer buffer, VacPage vacpage) { Page page = BufferGetPage(buffer); int i; *************** *** 3331,3337 **** vacuum_page(Relation onerel, Buffer buffer, VacPage vacpage) recptr = log_heap_clean(onerel, buffer, NULL, 0, NULL, 0, vacpage->offsets, vacpage->offsets_free, ! false); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); } --- 3336,3342 ---- recptr = log_heap_clean(onerel, buffer, NULL, 0, NULL, 0, vacpage->offsets, vacpage->offsets_free, ! vacrelstats->latestRemovedXid, false); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); } *** src/backend/commands/vacuumlazy.c --- src/backend/commands/vacuumlazy.c *************** *** 91,96 **** typedef struct LVRelStats --- 91,97 ---- ItemPointer dead_tuples; /* array of ItemPointerData */ int num_index_scans; bool scanned_all; /* have we scanned all pages (this far)? */ + TransactionId latestRemovedXid; } LVRelStats; *************** *** 235,240 **** lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt, --- 236,271 ---- *scanned_all = vacrelstats->scanned_all; } + /* + * For Hot Standby we need to know the highest transaction id that will + * be removed by any change. VACUUM proceeds in a number of passes so + * we need to consider how each pass operates. The first pass runs + * heap_page_prune(), which can issue XLOG_HEAP2_CLEAN records as it + * progresses - these will have a latestRemovedXid on each record. 
+ * In many cases this removes all of the tuples to be removed. + * Then we look at tuples to be removed, but do not actually remove them + * until phase three. However, index records for those rows are removed + * in phase two and index blocks do not have MVCC information attached. + * So before we can allow removal of *any* index tuples we need to issue + * a WAL record indicating what the latestRemovedXid will be at the end + * of phase three. This then allows Hot Standby queries to block at the + * correct place, i.e. before phase two, rather than during phase three + * as we issue more XLOG_HEAP2_CLEAN records. If we need to run multiple + * phase two/three because of memory constraints we need to issue multiple + * log records also. + */ + static void + vacuum_log_cleanup_info(Relation rel, LVRelStats *vacrelstats) + { + /* + * No need to log changes for temp tables, they do not contain + * data visible on the standby server. + */ + if (rel->rd_istemp) + return; + + (void) log_heap_cleanup_info(rel->rd_node, vacrelstats->latestRemovedXid); + } /* * lazy_scan_heap() -- scan an open heap relation *************** *** 284,289 **** lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, --- 315,321 ---- nblocks = RelationGetNumberOfBlocks(onerel); vacrelstats->rel_pages = nblocks; vacrelstats->nonempty_pages = 0; + vacrelstats->latestRemovedXid = InvalidTransactionId; lazy_space_alloc(vacrelstats, nblocks); *************** *** 328,333 **** lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, --- 360,368 ---- if ((vacrelstats->max_dead_tuples - vacrelstats->num_dead_tuples) < MaxHeapTuplesPerPage && vacrelstats->num_dead_tuples > 0) { + /* Log cleanup info before we touch indexes */ + vacuum_log_cleanup_info(onerel, vacrelstats); + /* Remove index entries */ for (i = 0; i < nindexes; i++) lazy_vacuum_index(Irel[i], *************** *** 567,572 **** lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, --- 602,609 ---- if (tupgone) { lazy_record_dead_tuple(vacrelstats, &(tuple.t_self)); + HeapTupleHeaderAdvanceLatestRemovedXid(tuple.t_data, + &vacrelstats->latestRemovedXid); tups_vacuumed += 1; } else *************** *** 677,682 **** lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, --- 714,722 ---- /* XXX put a threshold on min number of tuples here? */ if (vacrelstats->num_dead_tuples > 0) { + /* Log cleanup info before we touch indexes */ + vacuum_log_cleanup_info(onerel, vacrelstats); + /* Remove index entries */ for (i = 0; i < nindexes; i++) lazy_vacuum_index(Irel[i], *************** *** 821,827 **** lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer, recptr = log_heap_clean(onerel, buffer, NULL, 0, NULL, 0, unused, uncnt, ! false); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); } --- 861,867 ---- recptr = log_heap_clean(onerel, buffer, NULL, 0, NULL, 0, unused, uncnt, ! vacrelstats->latestRemovedXid, false); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); } *** src/backend/postmaster/bgwriter.c --- src/backend/postmaster/bgwriter.c *************** *** 49,54 **** --- 49,55 ---- #include #include "access/xlog_internal.h" + #include "catalog/pg_control.h" #include "libpq/pqsignal.h" #include "miscadmin.h" #include "pgstat.h" *************** *** 129,134 **** typedef struct --- 130,142 ---- int ckpt_flags; /* checkpoint flags, as defined in xlog.h */ + /* + * When the Startup process wants bgwriter to perform a restartpoint, it + * sets these fields so that we can update the control file afterwards. 
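+ * Both fields are protected by ckpt_lck: RequestRestartPoint() fills
+ * them in from the Startup process, and the recovery branch of the
+ * bgwriter main loop reads them back (see below).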
+ */ + XLogRecPtr ReadPtr; /* Requested log pointer */ + CheckPoint restartPoint; /* restartPoint data for ControlFile */ + uint32 num_backend_writes; /* counts non-bgwriter buffer writes */ int num_requests; /* current # of requests */ *************** *** 165,171 **** static bool ckpt_active = false; /* these values are valid when ckpt_active is true: */ static pg_time_t ckpt_start_time; ! static XLogRecPtr ckpt_start_recptr; static double ckpt_cached_elapsed; static pg_time_t last_checkpoint_time; --- 173,179 ---- /* these values are valid when ckpt_active is true: */ static pg_time_t ckpt_start_time; ! static XLogRecPtr ckpt_start_recptr; /* not used if IsRecoveryProcessingMode */ static double ckpt_cached_elapsed; static pg_time_t last_checkpoint_time; *************** *** 197,202 **** BackgroundWriterMain(void) --- 205,211 ---- { sigjmp_buf local_sigjmp_buf; MemoryContext bgwriter_context; + bool BgWriterRecoveryMode; BgWriterShmem->bgwriter_pid = MyProcPid; am_bg_writer = true; *************** *** 355,370 **** BackgroundWriterMain(void) */ PG_SETMASK(&UnBlockSig); /* * Loop forever */ for (;;) { - bool do_checkpoint = false; - int flags = 0; - pg_time_t now; - int elapsed_secs; - /* * Emergency bailout if postmaster has died. This is to avoid the * necessity for manual cleanup of all postmaster children. --- 364,380 ---- */ PG_SETMASK(&UnBlockSig); + BgWriterRecoveryMode = IsRecoveryProcessingMode(); + + if (BgWriterRecoveryMode) + elog(DEBUG1, "bgwriter starting during recovery, pid = %u", + BgWriterShmem->bgwriter_pid); + /* * Loop forever */ for (;;) { /* * Emergency bailout if postmaster has died. This is to avoid the * necessity for manual cleanup of all postmaster children. *************** *** 372,499 **** BackgroundWriterMain(void) if (!PostmasterIsAlive(true)) exit(1); - /* - * Process any requests or signals received recently. - */ - AbsorbFsyncRequests(); - if (got_SIGHUP) { got_SIGHUP = false; ProcessConfigFile(PGC_SIGHUP); } - if (checkpoint_requested) - { - checkpoint_requested = false; - do_checkpoint = true; - BgWriterStats.m_requested_checkpoints++; - } - if (shutdown_requested) - { - /* - * From here on, elog(ERROR) should end with exit(1), not send - * control back to the sigsetjmp block above - */ - ExitOnAnyError = true; - /* Close down the database */ - ShutdownXLOG(0, 0); - /* Normal exit from the bgwriter is here */ - proc_exit(0); /* done */ - } ! /* ! * Force a checkpoint if too much time has elapsed since the last one. ! * Note that we count a timed checkpoint in stats only when this ! * occurs without an external request, but we set the CAUSE_TIME flag ! * bit even if there is also an external request. ! */ ! now = (pg_time_t) time(NULL); ! elapsed_secs = now - last_checkpoint_time; ! if (elapsed_secs >= CheckPointTimeout) { ! if (!do_checkpoint) ! BgWriterStats.m_timed_checkpoints++; ! do_checkpoint = true; ! flags |= CHECKPOINT_CAUSE_TIME; ! } ! /* ! * Do a checkpoint if requested, otherwise do one cycle of ! * dirty-buffer writing. ! */ ! if (do_checkpoint) ! { ! /* use volatile pointer to prevent code rearrangement */ ! volatile BgWriterShmemStruct *bgs = BgWriterShmem; ! /* ! * Atomically fetch the request flags to figure out what kind of a ! * checkpoint we should perform, and increase the started-counter ! * to acknowledge that we've started a new checkpoint. ! */ ! SpinLockAcquire(&bgs->ckpt_lck); ! flags |= bgs->ckpt_flags; ! bgs->ckpt_flags = 0; ! bgs->ckpt_started++; ! SpinLockRelease(&bgs->ckpt_lck); ! /* ! 
* We will warn if (a) too soon since last checkpoint (whatever ! * caused it) and (b) somebody set the CHECKPOINT_CAUSE_XLOG flag ! * since the last checkpoint start. Note in particular that this ! * implementation will not generate warnings caused by ! * CheckPointTimeout < CheckPointWarning. ! */ ! if ((flags & CHECKPOINT_CAUSE_XLOG) && ! elapsed_secs < CheckPointWarning) ! ereport(LOG, ! (errmsg("checkpoints are occurring too frequently (%d seconds apart)", ! elapsed_secs), ! errhint("Consider increasing the configuration parameter \"checkpoint_segments\"."))); /* ! * Initialize bgwriter-private variables used during checkpoint. */ ! ckpt_active = true; ! ckpt_start_recptr = GetInsertRecPtr(); ! ckpt_start_time = now; ! ckpt_cached_elapsed = 0; ! /* ! * Do the checkpoint. ! */ ! CreateCheckPoint(flags); /* ! * After any checkpoint, close all smgr files. This is so we ! * won't hang onto smgr references to deleted files indefinitely. */ ! smgrcloseall(); /* ! * Indicate checkpoint completion to any waiting backends. */ ! SpinLockAcquire(&bgs->ckpt_lck); ! bgs->ckpt_done = bgs->ckpt_started; ! SpinLockRelease(&bgs->ckpt_lck); ! ckpt_active = false; ! /* ! * Note we record the checkpoint start time not end time as ! * last_checkpoint_time. This is so that time-driven checkpoints ! * happen at a predictable spacing. ! */ ! last_checkpoint_time = now; } - else - BgBufferSync(); - - /* Check for archive_timeout and switch xlog files if necessary. */ - CheckArchiveTimeout(); - - /* Nap for the configured time. */ - BgWriterNap(); } } --- 382,595 ---- if (!PostmasterIsAlive(true)) exit(1); if (got_SIGHUP) { got_SIGHUP = false; ProcessConfigFile(PGC_SIGHUP); } ! if (BgWriterRecoveryMode) { ! if (shutdown_requested) ! { ! /* ! * From here on, elog(ERROR) should end with exit(1), not send ! * control back to the sigsetjmp block above ! */ ! ExitOnAnyError = true; ! /* Normal exit from the bgwriter is here */ ! proc_exit(0); /* done */ ! } ! if (!IsRecoveryProcessingMode()) ! { ! elog(DEBUG2, "bgwriter changing from recovery to normal mode"); ! ! InitXLOGAccess(); ! BgWriterRecoveryMode = false; ! ! /* ! * Start time-driven events from now ! */ ! last_checkpoint_time = last_xlog_switch_time = (pg_time_t) time(NULL); ! ! /* ! * Notice that we do *not* act on a checkpoint_requested ! * state at this point. We have changed mode, so we wish to ! * perform a checkpoint not a restartpoint. ! */ ! continue; ! } ! if (checkpoint_requested) ! { ! XLogRecPtr ReadPtr; ! CheckPoint restartPoint; ! ! checkpoint_requested = false; ! ! /* ! * Initialize bgwriter-private variables used during checkpoint. ! */ ! ckpt_active = true; ! ckpt_start_time = (pg_time_t) time(NULL); ! ckpt_cached_elapsed = 0; ! ! /* ! * Get the requested values from shared memory that the ! * Startup process has put there for us. ! */ ! SpinLockAcquire(&BgWriterShmem->ckpt_lck); ! ReadPtr = BgWriterShmem->ReadPtr; ! memcpy(&restartPoint, &BgWriterShmem->restartPoint, sizeof(CheckPoint)); ! SpinLockRelease(&BgWriterShmem->ckpt_lck); ! ! /* Use smoothed writes, until interrupted if ever */ ! CreateRestartPoint(ReadPtr, &restartPoint, 0); ! ! /* ! * After any checkpoint, close all smgr files. This is so we ! * won't hang onto smgr references to deleted files indefinitely. ! */ ! smgrcloseall(); ! ! ckpt_active = false; ! checkpoint_requested = false; ! } ! else ! { ! /* Clean buffers dirtied by recovery */ ! BgBufferSync(); ! /* Nap for the configured time. */ ! BgWriterNap(); ! } ! } ! else /* Normal processing */ ! { ! 
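/*
! * What follows is the pre-8.4 bgwriter loop body, essentially
! * unchanged; it is only reached once recovery (if any) has ended.
! */
!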
bool do_checkpoint = false; ! int flags = 0; ! pg_time_t now; ! int elapsed_secs; /* ! * Process any requests or signals received recently. */ ! AbsorbFsyncRequests(); ! if (checkpoint_requested) ! { ! checkpoint_requested = false; ! do_checkpoint = true; ! BgWriterStats.m_requested_checkpoints++; ! } ! if (shutdown_requested) ! { ! /* ! * From here on, elog(ERROR) should end with exit(1), not send ! * control back to the sigsetjmp block above ! */ ! ExitOnAnyError = true; ! /* Close down the database */ ! ShutdownXLOG(0, 0); ! /* Normal exit from the bgwriter is here */ ! proc_exit(0); /* done */ ! } /* ! * Force a checkpoint if too much time has elapsed since the last one. ! * Note that we count a timed checkpoint in stats only when this ! * occurs without an external request, but we set the CAUSE_TIME flag ! * bit even if there is also an external request. */ ! now = (pg_time_t) time(NULL); ! elapsed_secs = now - last_checkpoint_time; ! if (elapsed_secs >= CheckPointTimeout) ! { ! if (!do_checkpoint) ! BgWriterStats.m_timed_checkpoints++; ! do_checkpoint = true; ! flags |= CHECKPOINT_CAUSE_TIME; ! } /* ! * Do a checkpoint if requested, otherwise do one cycle of ! * dirty-buffer writing. */ ! if (do_checkpoint) ! { ! /* use volatile pointer to prevent code rearrangement */ ! volatile BgWriterShmemStruct *bgs = BgWriterShmem; ! ! /* ! * Atomically fetch the request flags to figure out what kind of a ! * checkpoint we should perform, and increase the started-counter ! * to acknowledge that we've started a new checkpoint. ! */ ! SpinLockAcquire(&bgs->ckpt_lck); ! flags |= bgs->ckpt_flags; ! bgs->ckpt_flags = 0; ! bgs->ckpt_started++; ! SpinLockRelease(&bgs->ckpt_lck); ! ! /* ! * We will warn if (a) too soon since last checkpoint (whatever ! * caused it) and (b) somebody set the CHECKPOINT_CAUSE_XLOG flag ! * since the last checkpoint start. Note in particular that this ! * implementation will not generate warnings caused by ! * CheckPointTimeout < CheckPointWarning. ! */ ! if ((flags & CHECKPOINT_CAUSE_XLOG) && ! elapsed_secs < CheckPointWarning) ! ereport(LOG, ! (errmsg("checkpoints are occurring too frequently (%d seconds apart)", ! elapsed_secs), ! errhint("Consider increasing the configuration parameter \"checkpoint_segments\"."))); ! ! /* ! * Initialize bgwriter-private variables used during checkpoint. ! */ ! ckpt_active = true; ! ckpt_start_recptr = GetInsertRecPtr(); ! ckpt_start_time = now; ! ckpt_cached_elapsed = 0; ! ! /* ! * Do the checkpoint. ! */ ! CreateCheckPoint(flags); ! ! /* ! * After any checkpoint, close all smgr files. This is so we ! * won't hang onto smgr references to deleted files indefinitely. ! */ ! smgrcloseall(); ! ! /* ! * Indicate checkpoint completion to any waiting backends. ! */ ! SpinLockAcquire(&bgs->ckpt_lck); ! bgs->ckpt_done = bgs->ckpt_started; ! SpinLockRelease(&bgs->ckpt_lck); ! ! ckpt_active = false; ! ! /* ! * Note we record the checkpoint start time not end time as ! * last_checkpoint_time. This is so that time-driven checkpoints ! * happen at a predictable spacing. ! */ ! last_checkpoint_time = now; ! } ! else ! BgBufferSync(); ! /* Check for archive_timeout and switch xlog files if necessary. */ ! CheckArchiveTimeout(); ! /* Nap for the configured time. */ ! BgWriterNap(); } } } *************** *** 586,592 **** BgWriterNap(void) (ckpt_active ? ImmediateCheckpointRequested() : checkpoint_requested)) break; pg_usleep(1000000L); ! AbsorbFsyncRequests(); udelay -= 1000000L; } --- 682,689 ---- (ckpt_active ? 
ImmediateCheckpointRequested() : checkpoint_requested)) break; pg_usleep(1000000L); ! if (!IsRecoveryProcessingMode()) ! AbsorbFsyncRequests(); udelay -= 1000000L; } *************** *** 640,645 **** CheckpointWriteDelay(int flags, double progress) --- 737,755 ---- if (!am_bg_writer) return; + /* Perform minimal duties during recovery and skip wait if requested */ + if (IsRecoveryProcessingMode()) + { + BgBufferSync(); + + if (!shutdown_requested && + !checkpoint_requested && + IsCheckpointOnSchedule(progress)) + BgWriterNap(); + + return; + } + /* * Perform the usual bgwriter duties and take a nap, unless we're behind * schedule, in which case we just try to catch up as quickly as possible. *************** *** 714,729 **** IsCheckpointOnSchedule(double progress) * However, it's good enough for our purposes, we're only calculating an * estimate anyway. */ ! recptr = GetInsertRecPtr(); ! elapsed_xlogs = ! (((double) (int32) (recptr.xlogid - ckpt_start_recptr.xlogid)) * XLogSegsPerFile + ! ((double) recptr.xrecoff - (double) ckpt_start_recptr.xrecoff) / XLogSegSize) / ! CheckPointSegments; ! ! if (progress < elapsed_xlogs) { ! ckpt_cached_elapsed = elapsed_xlogs; ! return false; } /* --- 824,842 ---- * However, it's good enough for our purposes, we're only calculating an * estimate anyway. */ ! if (!IsRecoveryProcessingMode()) { ! recptr = GetInsertRecPtr(); ! elapsed_xlogs = ! (((double) (int32) (recptr.xlogid - ckpt_start_recptr.xlogid)) * XLogSegsPerFile + ! ((double) recptr.xrecoff - (double) ckpt_start_recptr.xrecoff) / XLogSegSize) / ! CheckPointSegments; ! ! if (progress < elapsed_xlogs) ! { ! ckpt_cached_elapsed = elapsed_xlogs; ! return false; ! } } /* *************** *** 989,994 **** RequestCheckpoint(int flags) --- 1102,1180 ---- } /* + * Always runs in Startup process (see xlog.c) + */ + void + RequestRestartPoint(const XLogRecPtr ReadPtr, const CheckPoint *restartPoint, bool sendToBGWriter) + { + /* + * Should we just do it ourselves? + */ + if (!IsPostmasterEnvironment || !sendToBGWriter) + { + CreateRestartPoint(ReadPtr, restartPoint, CHECKPOINT_IMMEDIATE); + return; + } + + /* + * Push requested values into shared memory, then signal to request restartpoint. + */ + if (BgWriterShmem->bgwriter_pid == 0) + elog(LOG, "could not request restartpoint because bgwriter not running"); + + SpinLockAcquire(&BgWriterShmem->ckpt_lck); + BgWriterShmem->ReadPtr = ReadPtr; + memcpy(&BgWriterShmem->restartPoint, restartPoint, sizeof(CheckPoint)); + SpinLockRelease(&BgWriterShmem->ckpt_lck); + + if (kill(BgWriterShmem->bgwriter_pid, SIGINT) != 0) + elog(LOG, "could not signal for restartpoint: %m"); + } + + /* + * Sends another checkpoint request signal to bgwriter, which causes it + * to avoid smoothed writes and continue processing as if it had been + * called with CHECKPOINT_IMMEDIATE. This is used at the end of recovery. + */ + void + RequestRestartPointCompletion(void) + { + if (BgWriterShmem->bgwriter_pid != 0 && + kill(BgWriterShmem->bgwriter_pid, SIGINT) != 0) + elog(LOG, "could not signal for restartpoint immediate: %m"); + } + + XLogRecPtr + GetRedoLocationForArchiveCheckpoint(void) + { + XLogRecPtr redo; + + SpinLockAcquire(&BgWriterShmem->ckpt_lck); + redo = BgWriterShmem->ReadPtr; + SpinLockRelease(&BgWriterShmem->ckpt_lck); + + return redo; + } + + /* + * Store the information needed for a checkpoint at the end of recovery. + * Returns true if bgwriter can perform checkpoint, or false if bgwriter + * not active or otherwise unable to comply. 
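+ * The intended calling sequence, as used at the end of StartupXLOG(), is:
+ *
+ * redo = GetRedoLocationForCheckpoint();
+ * if (SetRedoLocationForArchiveCheckpoint(redo))
+ * RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_IMMEDIATE);
+ * else
+ * CreateCheckPoint(CHECKPOINT_FORCE | CHECKPOINT_IMMEDIATE);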
+ */
+ bool
+ SetRedoLocationForArchiveCheckpoint(XLogRecPtr redo)
+ {
+ SpinLockAcquire(&BgWriterShmem->ckpt_lck);
+ BgWriterShmem->ReadPtr = redo;
+ SpinLockRelease(&BgWriterShmem->ckpt_lck);
+
+ if (BgWriterShmem->bgwriter_pid == 0 || !IsPostmasterEnvironment)
+ return false;
+
+ return true;
+ }
+
+ /*
 * ForwardFsyncRequest
 * Forward a file-fsync request from a backend to the bgwriter
 *
*** src/backend/postmaster/postmaster.c
--- src/backend/postmaster/postmaster.c
*************** *** 230,237 **** static bool FatalError = false; /* T if recovering from backend crash */
 * We use a simple state machine to control startup, shutdown, and
 * crash recovery (which is rather like shutdown followed by startup).
 *
! * Normal child backends can only be launched when we are in PM_RUN state.
! * (We also allow it in PM_WAIT_BACKUP state, but only for superusers.)
 * In other states we handle connection requests by launching "dead_end"
 * child processes, which will simply send the client an error message and
 * quit. (We track these in the BackendList so that we can know when they
--- 230,239 ----
 * We use a simple state machine to control startup, shutdown, and
 * crash recovery (which is rather like shutdown followed by startup).
 *
! * Normal child backends can only be launched when we are in PM_RUN or
! * PM_RECOVERY state. Any transaction started in PM_RECOVERY state will
! * be read-only for the whole of its life. (We also allow launch of normal
! * child backends in PM_WAIT_BACKUP state, but only for superusers.)
 * In other states we handle connection requests by launching "dead_end"
 * child processes, which will simply send the client an error message and
 * quit. (We track these in the BackendList so that we can know when they
*************** *** 254,259 **** typedef enum
--- 256,266 ----
 {
 PM_INIT, /* postmaster starting */
 PM_STARTUP, /* waiting for startup subprocess */
+ PM_RECOVERY, /* consistent recovery mode; state only
+ * entered for archive and streaming recovery,
+ * and only after the point where
+ * all data is in a consistent state.
+ */
 PM_RUN, /* normal "database is alive" state */
 PM_WAIT_BACKUP, /* waiting for online backup mode to end */
 PM_WAIT_BACKENDS, /* waiting for live backends to exit */
*************** *** 1302,1308 **** ServerLoop(void)
 * state that prevents it, start one. It doesn't matter if this
 * fails, we'll just try again later.
 */
! if (BgWriterPID == 0 && pmState == PM_RUN)
 BgWriterPID = StartBackgroundWriter();
 /*
--- 1309,1315 ----
 * state that prevents it, start one. It doesn't matter if this
 * fails, we'll just try again later.
 */
! if (BgWriterPID == 0 && (pmState == PM_RUN || pmState == PM_RECOVERY))
 BgWriterPID = StartBackgroundWriter();
 /*
*************** *** 1651,1661 **** retry1:
 (errcode(ERRCODE_CANNOT_CONNECT_NOW),
 errmsg("the database system is shutting down")));
 break;
- case CAC_RECOVERY:
- ereport(FATAL,
- (errcode(ERRCODE_CANNOT_CONNECT_NOW),
- errmsg("the database system is in recovery mode")));
- break;
 case CAC_TOOMANY:
 ereport(FATAL,
 (errcode(ERRCODE_TOO_MANY_CONNECTIONS),
--- 1658,1663 ----
*************** *** 1664,1669 **** retry1:
--- 1666,1672 ----
 case CAC_WAITBACKUP:
 /* OK for now, will check in InitPostgres */
 break;
+ case CAC_RECOVERY:
 case CAC_OK:
 break;
 }
*************** *** 1982,1991 **** pmdie(SIGNAL_ARGS)
 ereport(LOG,
 (errmsg("received smart shutdown request")));
! if (pmState == PM_RUN)
 {
 /* autovacuum workers are told to shut down immediately */
!
SignalAutovacWorkers(SIGTERM); /* and the autovac launcher too */ if (AutoVacPID != 0) signal_child(AutoVacPID, SIGTERM); --- 1985,1995 ---- ereport(LOG, (errmsg("received smart shutdown request"))); ! if (pmState == PM_RUN || pmState == PM_RECOVERY) { /* autovacuum workers are told to shut down immediately */ ! if (pmState == PM_RUN) ! SignalAutovacWorkers(SIGTERM); /* and the autovac launcher too */ if (AutoVacPID != 0) signal_child(AutoVacPID, SIGTERM); *************** *** 2019,2025 **** pmdie(SIGNAL_ARGS) if (StartupPID != 0) signal_child(StartupPID, SIGTERM); ! if (pmState == PM_RUN || pmState == PM_WAIT_BACKUP) { ereport(LOG, (errmsg("aborting any active transactions"))); --- 2023,2029 ---- if (StartupPID != 0) signal_child(StartupPID, SIGTERM); ! if (pmState == PM_RUN || pmState == PM_RECOVERY || pmState == PM_WAIT_BACKUP) { ereport(LOG, (errmsg("aborting any active transactions"))); *************** *** 2115,2122 **** reaper(SIGNAL_ARGS) */ if (pid == StartupPID) { StartupPID = 0; ! Assert(pmState == PM_STARTUP); /* FATAL exit of startup is treated as catastrophic */ if (!EXIT_STATUS_0(exitstatus)) --- 2119,2129 ---- */ if (pid == StartupPID) { + bool leavingRecovery = (pmState == PM_RECOVERY); + StartupPID = 0; ! Assert(pmState == PM_STARTUP || pmState == PM_RECOVERY || ! pmState == PM_WAIT_BACKUP || pmState == PM_WAIT_BACKENDS); /* FATAL exit of startup is treated as catastrophic */ if (!EXIT_STATUS_0(exitstatus)) *************** *** 2124,2130 **** reaper(SIGNAL_ARGS) LogChildExit(LOG, _("startup process"), pid, exitstatus); ereport(LOG, ! (errmsg("aborting startup due to startup process failure"))); ExitPostmaster(1); } --- 2131,2137 ---- LogChildExit(LOG, _("startup process"), pid, exitstatus); ereport(LOG, ! (errmsg("aborting startup due to startup process failure"))); ExitPostmaster(1); } *************** *** 2157,2166 **** reaper(SIGNAL_ARGS) load_role(); /* ! * Crank up the background writer. It doesn't matter if this ! * fails, we'll just try again later. */ ! Assert(BgWriterPID == 0); BgWriterPID = StartBackgroundWriter(); /* --- 2164,2173 ---- load_role(); /* ! * Check whether we need to start background writer, if not ! * already running. */ ! if (BgWriterPID == 0) BgWriterPID = StartBackgroundWriter(); /* *************** *** 2177,2184 **** reaper(SIGNAL_ARGS) PgStatPID = pgstat_start(); /* at this point we are really open for business */ ! ereport(LOG, ! (errmsg("database system is ready to accept connections"))); continue; } --- 2184,2195 ---- PgStatPID = pgstat_start(); /* at this point we are really open for business */ ! if (leavingRecovery) ! ereport(LOG, ! (errmsg("database can now be accessed with read and write transactions"))); ! else ! ereport(LOG, ! (errmsg("database system is ready to accept connections"))); continue; } *************** *** 2898,2904 **** BackendStartup(Port *port) bn->pid = pid; bn->cancel_key = MyCancelKey; bn->is_autovacuum = false; ! bn->dead_end = (port->canAcceptConnections != CAC_OK && port->canAcceptConnections != CAC_WAITBACKUP); DLAddHead(BackendList, DLNewElem(bn)); #ifdef EXEC_BACKEND --- 2909,2916 ---- bn->pid = pid; bn->cancel_key = MyCancelKey; bn->is_autovacuum = false; ! bn->dead_end = (!(port->canAcceptConnections == CAC_RECOVERY || ! 
port->canAcceptConnections == CAC_OK) &&
  					   port->canAcceptConnections != CAC_WAITBACKUP);
  	DLAddHead(BackendList, DLNewElem(bn));
  
  #ifdef EXEC_BACKEND
***************
*** 3847,3852 **** sigusr1_handler(SIGNAL_ARGS)
--- 3859,3911 ----
  
  	PG_SETMASK(&BlockSig);
  
+ 	if (CheckPostmasterSignal(PMSIGNAL_RECOVERY_START))
+ 	{
+ 		Assert(pmState == PM_STARTUP);
+ 
+ 		/*
+ 		 * Go to shutdown mode if a shutdown request was pending.
+ 		 */
+ 		if (Shutdown > NoShutdown)
+ 		{
+ 			pmState = PM_WAIT_BACKENDS;
+ 			/* PostmasterStateMachine logic does the rest */
+ 		}
+ 		else
+ 		{
+ 			/*
+ 			 * Startup process has entered recovery
+ 			 */
+ 			pmState = PM_RECOVERY;
+ 
+ 			/*
+ 			 * Load the flat authorization file into postmaster's cache. The
+ 			 * startup process won't have recomputed this from the database
+ 			 * yet, so it may change following recovery.
+ 			 */
+ 			load_role();
+ 
+ 			/*
+ 			 * Crank up the background writer. It doesn't matter if this
+ 			 * fails, we'll just try again later.
+ 			 */
+ 			Assert(BgWriterPID == 0);
+ 			BgWriterPID = StartBackgroundWriter();
+ 
+ 			/*
+ 			 * Likewise, start other special children as needed.
+ 			 */
+ 			Assert(PgStatPID == 0);
+ 			PgStatPID = pgstat_start();
+ 
+ 			/* We can now accept read-only connections */
+ 			ereport(LOG,
+ 				(errmsg("database system is ready to accept connections")));
+ 			ereport(LOG,
+ 				(errmsg("database can now be accessed with read only transactions")));
+ 		}
+ 	}
+ 
  	if (CheckPostmasterSignal(PMSIGNAL_PASSWORD_CHANGE))
  	{
  		/*
*** src/backend/storage/buffer/README
--- src/backend/storage/buffer/README
***************
*** 268,270 **** out (and anyone else who flushes buffer contents to disk must do so too).
--- 268,279 ----
  This ensures that the page image transferred to disk is reasonably consistent.
  We might miss a hint-bit update or two but that isn't a problem, for the same
  reasons mentioned under buffer access rules.
+ 
+ As of 8.4, the background writer starts during recovery when there is some
+ form of potentially extended recovery to perform. It performs the same
+ service as in normal processing, except that the checkpoints it writes are
+ technically restartpoints. Flushing outstanding WAL for dirty buffers is
+ also skipped, though there shouldn't ever be new WAL entries at that time
+ in any case. We could choose to start the background writer immediately,
+ but we hold off until we can prove the database is in a consistent state,
+ so that the postmaster has a single, clean state change.
*** src/backend/storage/buffer/bufmgr.c
--- src/backend/storage/buffer/bufmgr.c
***************
*** 71,77 ****
  static bool IsForInput;
  
  /* local state for LockBufferForCleanup */
  static volatile BufferDesc *PinCountWaitBuf = NULL;
  
! static Buffer ReadBuffer_common(SMgrRelation reln, bool isLocalBuf,
  				  ForkNumber forkNum, BlockNumber blockNum,
--- 71,79 ----
  
  /* local state for LockBufferForCleanup */
  static volatile BufferDesc *PinCountWaitBuf = NULL;
  
! static long CleanupWaitSecs = 0;
! static int	CleanupWaitUSecs = 0;
! static bool CleanupWaitStats = false;
  
  static Buffer ReadBuffer_common(SMgrRelation reln, bool isLocalBuf,
  				  ForkNumber forkNum, BlockNumber blockNum,
***************
*** 2308,2313 **** ConditionalLockBuffer(Buffer buffer)
--- 2310,2362 ----
  }
  
  /*
+  * On standby servers only the Startup process applies Cleanup. As a result
+  * a single buffer pin can be enough to effectively halt recovery for short
+  * periods. We need special instrumentation to monitor this so we can judge
+  * whether additional measures are required to control the negative effects.
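+  *
+  * A sketch of the intended usage (illustrative only; the surrounding
+  * replay loop is hypothetical):
+  *
+  *		StartCleanupDelayStats();
+  *		... apply a batch of WAL records, which may block in
+  *		    LockBufferForCleanup() ...
+  *		ReportCleanupDelayStats();	/* logs the total at DEBUG2 */
+  *		EndCleanupDelayStats();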
+  */
+ void
+ StartCleanupDelayStats(void)
+ {
+ 	CleanupWaitSecs = 0;
+ 	CleanupWaitUSecs = 0;
+ 	CleanupWaitStats = true;
+ }
+ 
+ void
+ EndCleanupDelayStats(void)
+ {
+ 	CleanupWaitStats = false;
+ }
+ 
+ /*
+  * Called by Startup process whenever we request restartpoint
+  */
+ void
+ ReportCleanupDelayStats(void)
+ {
+ 	elog(trace_recovery(DEBUG2), "cleanup wait total=%ld.%03d s",
+ 		 CleanupWaitSecs, CleanupWaitUSecs / 1000);
+ }
+ 
+ static void
+ CleanupDelayStats(TimestampTz start_ts, TimestampTz end_ts)
+ {
+ 	long		wait_secs;
+ 	int			wait_usecs;
+ 
+ 	TimestampDifference(start_ts, end_ts, &wait_secs, &wait_usecs);
+ 
+ 	CleanupWaitSecs += wait_secs;
+ 	CleanupWaitUSecs += wait_usecs;
+ 	if (CleanupWaitUSecs > 999999)
+ 	{
+ 		CleanupWaitSecs += 1;
+ 		CleanupWaitUSecs -= 1000000;
+ 	}
+ }
+ 
  /*
   * LockBufferForCleanup - lock a buffer in preparation for deleting items
   *
   * Items may be deleted from a disk page only when the caller (a) holds an
***************
*** 2350,2355 **** LockBufferForCleanup(Buffer buffer)
--- 2399,2406 ----
  
  	for (;;)
  	{
+ 		TimestampTz start_ts = 0;
+ 
  		/* Try to acquire lock */
  		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
  		LockBufHdr(bufHdr);
***************
*** 2372,2380 **** LockBufferForCleanup(Buffer buffer)
--- 2423,2436 ----
  		PinCountWaitBuf = bufHdr;
  		UnlockBufHdr(bufHdr);
  		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+ 		if (CleanupWaitStats)
+ 			start_ts = GetCurrentTimestamp();
  		/* Wait to be signaled by UnpinBuffer() */
  		ProcWaitForSignal();
  		PinCountWaitBuf = NULL;
+ 		if (CleanupWaitStats)
+ 			CleanupDelayStats(start_ts, GetCurrentTimestamp());
+ 
  		/* Loop back and try again */
  	}
  }
*** src/backend/storage/freespace/freespace.c
--- src/backend/storage/freespace/freespace.c
***************
*** 211,217 **** XLogRecordPageWithFreeSpace(RelFileNode rnode, BlockNumber heapBlk,
  	blkno = fsm_logical_to_physical(addr);
  
  	/* If the page doesn't exist already, extend */
! 	buf = XLogReadBufferExtended(rnode, FSM_FORKNUM, blkno, RBM_ZERO_ON_ERROR);
  	page = BufferGetPage(buf);
  	if (PageIsNew(page))
  		PageInit(page, BLCKSZ, 0);
--- 211,218 ----
  	blkno = fsm_logical_to_physical(addr);
  
  	/* If the page doesn't exist already, extend */
! 	buf = XLogReadBufferExtended(rnode, FSM_FORKNUM, blkno,
! 								 RBM_ZERO_ON_ERROR, BUFFER_LOCK_CLEANUP);
  	page = BufferGetPage(buf);
  	if (PageIsNew(page))
  		PageInit(page, BLCKSZ, 0);
*** src/backend/storage/ipc/procarray.c
--- src/backend/storage/ipc/procarray.c
***************
*** 17,22 ****
--- 17,37 ----
  * as are the myProcLocks lists. They can be distinguished from regular
  * backend PGPROCs at need by checking for pid == 0.
  *
+ * The process array now also includes PGPROC structures representing
+ * transactions being recovered. The xid and subxids fields of these are
+ * valid, though few other fields are. They can be distinguished from
+ * regular backend PGPROCs by checking for pid == 0. The proc array also
+ * has a secondary array of UnobservedXids representing transactions that
+ * are known to be running on the master but for which we do not yet have
+ * a recovery proc. We infer the existence of UnobservedXids by watching
+ * the sequence of arriving xids. This is very important because if we
+ * leave those xids out of the snapshot then they will appear to be
+ * already complete. Later, when they have actually completed, this could
+ * lead to confusion as to whether those xids are visible or not, blowing
+ * a huge hole in MVCC. We need 'em.
+ * We go to extreme lengths to ensure that the number of UnobservedXids is
+ * both bounded and realistically manageable.
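+ *
+ * Worked example (xid values invented for illustration): if WAL shows
+ * records for xids 100 and 101 and then one for 104, we infer that 102
+ * and 103 were assigned on the master but have not yet written any WAL,
+ * so they go into UnobservedXids; a snapshot taken now must treat them
+ * as still running.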
There are simpler designs, + * but they lead to unbounded worst case behaviour, so we sweat. * * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California *************** *** 33,56 **** #include "access/subtrans.h" #include "access/transam.h" ! #include "access/xact.h" #include "access/twophase.h" #include "miscadmin.h" #include "storage/procarray.h" #include "utils/snapmgr.h" /* Our shared memory area */ typedef struct ProcArrayStruct { int numProcs; /* number of valid procs entries */ ! int maxProcs; /* allocated size of procs array */ /* * We declare procs[] as 1 entry because C wants a fixed-size array, but * actually it is maxProcs entries long. */ PGPROC *procs[1]; /* VARIABLE LENGTH ARRAY */ } ProcArrayStruct; static ProcArrayStruct *procArray; --- 48,86 ---- #include "access/subtrans.h" #include "access/transam.h" ! #include "access/xlog.h" #include "access/twophase.h" #include "miscadmin.h" + #include "storage/proc.h" #include "storage/procarray.h" #include "utils/snapmgr.h" + static RunningXactsData CurrentRunningXactsData; + + /* Handy constant for an invalid xlog recptr */ + static const XLogRecPtr InvalidXLogRecPtr = {0, 0}; + + void ProcArrayDisplay(int trace_level); + /* Our shared memory area */ typedef struct ProcArrayStruct { int numProcs; /* number of valid procs entries */ ! int maxProcs; /* allocated size of total procs array */ ! ! int maxRecoveryProcs; /* number of allocated recovery procs */ ! ! int numUnobservedXids; /* number of valid unobserved xids */ ! int maxUnobservedXids; /* allocated size of unobserved array */ /* * We declare procs[] as 1 entry because C wants a fixed-size array, but * actually it is maxProcs entries long. */ PGPROC *procs[1]; /* VARIABLE LENGTH ARRAY */ + + /* ARRAY OF UNOBSERVED TRANSACTION XIDs FOLLOWS */ } ProcArrayStruct; static ProcArrayStruct *procArray; *************** *** 100,107 **** ProcArrayShmemSize(void) Size size; size = offsetof(ProcArrayStruct, procs); ! size = add_size(size, mul_size(sizeof(PGPROC *), ! add_size(MaxBackends, max_prepared_xacts))); return size; } --- 130,148 ---- Size size; size = offsetof(ProcArrayStruct, procs); ! ! /* Normal processing */ ! /* MyProc slots */ ! size = add_size(size, mul_size(sizeof(PGPROC *), MaxBackends)); ! size = add_size(size, mul_size(sizeof(PGPROC *), max_prepared_xacts)); ! ! /* Recovery processing */ ! ! /* Recovery Procs */ ! size = add_size(size, mul_size(sizeof(PGPROC *), MaxBackends)); ! /* UnobservedXids */ ! size = add_size(size, mul_size(sizeof(TransactionId), MaxBackends)); ! size = add_size(size, mul_size(sizeof(TransactionId), MaxBackends)); return size; } *************** *** 123,130 **** CreateSharedProcArray(void) --- 164,209 ---- /* * We're the first - initialize. */ + /* Normal processing */ procArray->numProcs = 0; procArray->maxProcs = MaxBackends + max_prepared_xacts; + + /* Recovery processing */ + procArray->maxRecoveryProcs = MaxBackends; + procArray->maxProcs += procArray->maxRecoveryProcs; + + procArray->maxUnobservedXids = 2 * MaxBackends; + procArray->numUnobservedXids = 0; + + if (!IsUnderPostmaster) + { + int i; + + /* XXX: We should probably have a separate pool for recovery + * procs, similar to how we handle prepared transactions. The + * fields only used for recovery procs (lsn), could then also be + * included only for the recovery procs, like the extra fields + * in GlobalTransactionData + */ + /* + * Create and add the Procs for recovery emulation. 
+ * + * We do this now, so that we can identify which Recovery Proc + * goes with each normal backend. Normal procs were allocated + * first so we can use the slotId of the *proc* to look up + * the Recovery Proc in the *procarray*. Recovery Procs never + * move around in the procarray, whereas normal procs do. + * e.g. Proc with slotId=7 is always associated with procarray[7] + * for recovery processing. see also + */ + for (i = 0; i < procArray->maxRecoveryProcs; i++) + { + PGPROC *RecoveryProc = InitRecoveryProcess(); + + ProcArrayAdd(RecoveryProc); + } + elog(DEBUG3, "Added %d Recovery Procs", i); + } } } *************** *** 213,218 **** ProcArrayRemove(PGPROC *proc, TransactionId latestXid) --- 292,306 ---- elog(LOG, "failed to find proc %p in ProcArray", proc); } + /* + * Initialisation when we switch into PM_RECOVERY mode. + * Expected caller is InitRecoveryTransactionEnvironment() + */ + void + ProcArrayInitRecoveryEnvironment(void) + { + PublishStartupProcessInformation(); + } /* * ProcArrayEndTransaction -- mark a transaction as no longer running *************** *** 220,226 **** ProcArrayRemove(PGPROC *proc, TransactionId latestXid) * This is used interchangeably for commit and abort cases. The transaction * commit/abort must already be reported to WAL and pg_clog. * ! * proc is currently always MyProc, but we pass it explicitly for flexibility. * latestXid is the latest Xid among the transaction's main XID and * subtransactions, or InvalidTransactionId if it has no XID. (We must ask * the caller to pass latestXid, instead of computing it from the PGPROC's --- 308,316 ---- * This is used interchangeably for commit and abort cases. The transaction * commit/abort must already be reported to WAL and pg_clog. * ! * In normal running proc is currently always MyProc, but in recovery we pass ! * one of the recovery procs. ! * * latestXid is the latest Xid among the transaction's main XID and * subtransactions, or InvalidTransactionId if it has no XID. (We must ask * the caller to pass latestXid, instead of computing it from the PGPROC's *************** *** 228,234 **** ProcArrayRemove(PGPROC *proc, TransactionId latestXid) * incomplete.) */ void ! ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid) { if (TransactionIdIsValid(latestXid)) { --- 318,325 ---- * incomplete.) */ void ! ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid, ! int nsubxids, TransactionId *subxids) { if (TransactionIdIsValid(latestXid)) { *************** *** 253,258 **** ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid) --- 344,370 ---- proc->subxids.nxids = 0; proc->subxids.overflowed = false; + /* + * Check that any subtransactions are removed from UnobservedXids. + * We include the subxids array so that they can be removed atomically + * from UnobservedXids at the same time as we zero the main xid on + * the Recovery proc. + */ + if (nsubxids > 0) + { + int i; + + Assert(subxids != NULL); + + /* + * Ignore any failure to find the xids - this avoids complex + * bookkeeping solely to account for rare strangeness that + * would add too much overhead to be worth the cost. 
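+ 		 *
+ 		 * For instance (xids invented for illustration): if a
+ 		 * running-xacts snapshot has already pruned subxid 102 from
+ 		 * UnobservedXids, the commit record for its parent can still
+ 		 * list 102 among its subxids; the removal below simply fails
+ 		 * to find it and we ignore that.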
+ */ + for (i = 0; i < nsubxids; i++) + UnobservedTransactionsRemoveXid(subxids[i], false); + } + /* Also advance global latestCompletedXid while holding the lock */ if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid, latestXid)) *************** *** 301,306 **** ProcArrayClearTransaction(PGPROC *proc) --- 413,419 ---- proc->xid = InvalidTransactionId; proc->lxid = InvalidLocalTransactionId; proc->xmin = InvalidTransactionId; + proc->lsn = InvalidXLogRecPtr; /* redundant, but just in case */ proc->vacuumFlags &= ~PROC_VACUUM_STATE_MASK; *************** *** 311,316 **** ProcArrayClearTransaction(PGPROC *proc) --- 424,606 ---- proc->subxids.overflowed = false; } + /* + * ProcArrayClearRecoveryTransactions + * + * Called during recovery when we see a Shutdown checkpoint or EndRecovery + * record, or at the end of recovery processing. + */ + void + ProcArrayClearRecoveryTransactions(void) + { + ProcArrayStruct *arrayP = procArray; + int index; + + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + + /* + * Reset Recovery Procs + */ + for (index = 0; index < arrayP->maxRecoveryProcs; index++) + { + PGPROC *RecoveryProc = arrayP->procs[index]; + + ProcArrayClearTransaction(RecoveryProc); + } + + /* + * Clear the UnobservedXids also + */ + UnobservedTransactionsClearXids(); + + LWLockRelease(ProcArrayLock); + } + + /* debug support functions for recovery processing */ + bool + XidInRecoveryProcs(TransactionId xid) + { + ProcArrayStruct *arrayP = procArray; + int index; + + for (index = 0; index < arrayP->maxRecoveryProcs; index++) + { + PGPROC *RecoveryProc = arrayP->procs[index]; + + if (RecoveryProc->xid == xid) + return true; + } + return false; + } + + void + ProcArrayDisplay(int trace_level) + { + ProcArrayStruct *arrayP = procArray; + int index; + + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + + for (index = 0; index < arrayP->maxRecoveryProcs; index++) + { + PGPROC *RecoveryProc = arrayP->procs[index]; + + if (TransactionIdIsValid(RecoveryProc->xid)) + elog(trace_level, + "proc %d proc->xid %d proc->lsn %X/%X", index, RecoveryProc->xid, + RecoveryProc->lsn.xlogid, RecoveryProc->lsn.xrecoff); + } + + UnobservedTransactionsDisplay(trace_level); + + LWLockRelease(ProcArrayLock); + } + + /* + * Use the data about running transactions on master to either create the + * initial state of the Recovery Procs, or maintain correctness of their + * state. This is almost the opposite of GetSnapshotData(). + * + * Only used during recovery. Notice the signature is very similar to a + * _redo function. + */ + void + ProcArrayUpdateRecoveryTransactions(XLogRecPtr lsn, xl_xact_running_xacts *xlrec) + { + ProcArrayStruct *arrayP = procArray; + int xid_index; + TransactionId *subxip = (TransactionId *) &(xlrec->xrun[xlrec->xcnt]); + int index; + + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + + for (xid_index = 0; xid_index < xlrec->xcnt; xid_index++) + { + RunningXact *rxact = (RunningXact *) xlrec->xrun; + PGPROC *proc = NULL; + TransactionId xid = rxact->xid; + + for (index = 0; index < arrayP->numProcs; index++) + { + PGPROC *p = arrayP->procs[index]; + + if (p->xid == xid) + { + proc = p; + break; + } + } + + if (proc == NULL) + { + /* TODO should add it to array here */ + continue; + } + + elog(trace_recovery(DEBUG5), + "running xact proc->lsn %X/%X lsn %X/%X proc->xid %d xid %d", + proc->lsn.xlogid, proc->lsn.xrecoff, + lsn.xlogid, lsn.xrecoff, proc->xid, rxact[xid_index].xid); + /* + * If our state information is later for this proc, then + * overwrite it. 
It's possible for a commit and possibly + * a new transaction record to have arrived in WAL in between + * us doing GetRunningTransactionData() and grabbing the + * WALInsertLock, so we musn't assume we know best always. + */ + if (XLByteLT(proc->lsn, lsn)) + { + proc->lsn = lsn; + proc->xid = rxact[xid_index].xid; + /* proc-> pid stays 0 for Recovery Procs */ + proc->databaseId = rxact[xid_index].databaseId; + proc->roleId = rxact[xid_index].roleId; + proc->vacuumFlags = rxact[xid_index].vacuumFlags; + + proc->subxids.nxids = rxact[xid_index].nsubxids; + proc->subxids.overflowed = rxact[xid_index].overflowed; + + memcpy(proc->subxids.xids, subxip, + rxact[xid_index].nsubxids * sizeof(TransactionId)); + } + } + + /* + * Scan the proc array for stale recovery PGPROC entries, and + * remove them. + */ + for (index = 0; index < arrayP->numProcs; index++) + { + PGPROC *p = arrayP->procs[index]; + + if (p->pid == 0 && !XLogRecPtrIsInvalid(p->lsn) && XLByteLT(p->lsn, lsn)) + { + arrayP->procs[index] = arrayP->procs[arrayP->numProcs - 1]; + arrayP->numProcs--; + FreeRecoveryProcess(p); + } + } + + /* Advance global latestCompletedXid while holding the lock */ + if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid, + xlrec->latestCompletedXid)) + ShmemVariableCache->latestCompletedXid = xlrec->latestCompletedXid; + + /* + * Left prune the UnobservedXids array up to latestRunningXid. + * This is correct because at the time we take this snapshot, all + * completed transactions prior to latestRunningXid will be marked in + * WAL. So we won't ever see a WAL record for them again. + * + * We can't clear the array completely because race conditions allow + * things to slip through sometimes. + */ + UnobservedTransactionsPruneXids(xlrec->latestRunningXid); + + LWLockRelease(ProcArrayLock); + + ProcArrayDisplay(trace_recovery(DEBUG5)); + } /* * TransactionIdIsInProgress -- is given transaction running in some backend *************** *** 655,661 **** GetOldestXmin(bool allDbs, bool ignoreVacuum) * but since PGPROC has only a limited cache area for subxact XIDs, full * information may not be available. If we find any overflowed subxid arrays, * we have to mark the snapshot's subxid data as overflowed, and extra work ! * will need to be done to determine what's running (see XidInMVCCSnapshot() * in tqual.c). * * We also update the following backend-global variables: --- 945,951 ---- * but since PGPROC has only a limited cache area for subxact XIDs, full * information may not be available. If we find any overflowed subxid arrays, * we have to mark the snapshot's subxid data as overflowed, and extra work ! * *may* need to be done to determine what's running (see XidInMVCCSnapshot() * in tqual.c). * * We also update the following backend-global variables: *************** *** 680,685 **** GetSnapshotData(Snapshot snapshot) --- 970,976 ---- int index; int count = 0; int subcount = 0; + bool suboverflowed = false; Assert(snapshot != NULL); *************** *** 706,713 **** GetSnapshotData(Snapshot snapshot) (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of memory"))); Assert(snapshot->subxip == NULL); snapshot->subxip = (TransactionId *) ! 
malloc(arrayP->maxProcs * PGPROC_MAX_CACHED_SUBXIDS * sizeof(TransactionId)); if (snapshot->subxip == NULL) ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), --- 997,1005 ---- (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of memory"))); Assert(snapshot->subxip == NULL); + #define maxNumSubXids (arrayP->maxProcs * PGPROC_MAX_CACHED_SUBXIDS) snapshot->subxip = (TransactionId *) ! malloc(maxNumSubXids * sizeof(TransactionId)); if (snapshot->subxip == NULL) ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), *************** *** 771,781 **** GetSnapshotData(Snapshot snapshot) } /* ! * Save subtransaction XIDs if possible (if we've already overflowed, ! * there's no point). Note that the subxact XIDs must be later than ! * their parent, so no need to check them against xmin. We could ! * filter against xmax, but it seems better not to do that much work ! * while holding the ProcArrayLock. * * The other backend can add more subxids concurrently, but cannot * remove any. Hence it's important to fetch nxids just once. Should --- 1063,1073 ---- } /* ! * Save subtransaction XIDs, whether or not we have overflowed. ! * Note that the subxact XIDs must be later than their parent, so no ! * need to check them against xmin. We could filter against xmax, ! * but it seems better not to do that much work while holding the ! * ProcArrayLock. * * The other backend can add more subxids concurrently, but cannot * remove any. Hence it's important to fetch nxids just once. Should *************** *** 784,806 **** GetSnapshotData(Snapshot snapshot) * * Again, our own XIDs are not included in the snapshot. */ ! if (subcount >= 0 && proc != MyProc) ! { ! if (proc->subxids.overflowed) ! subcount = -1; /* overflowed */ ! else { int nxids = proc->subxids.nxids; if (nxids > 0) { memcpy(snapshot->subxip + subcount, (void *) proc->subxids.xids, nxids * sizeof(TransactionId)); subcount += nxids; } } } } if (!TransactionIdIsValid(MyProc->xmin)) --- 1076,1144 ---- * * Again, our own XIDs are not included in the snapshot. */ ! if (proc != MyProc) { int nxids = proc->subxids.nxids; if (nxids > 0) { + if (proc->subxids.overflowed) + suboverflowed = true; + memcpy(snapshot->subxip + subcount, (void *) proc->subxids.xids, nxids * sizeof(TransactionId)); subcount += nxids; } + } } + + /* + * Also check for unobserved xids. There is no need for us to specify + * only if IsRecoveryProcessingMode(), since the list will always be + * empty when normal processing begins and the test will be optimised + * to nearly nothing very quickly. + */ + for (index = 0; index < arrayP->numUnobservedXids; index++) + { + volatile TransactionId *UnobservedXids; + TransactionId xid; + + UnobservedXids = (TransactionId *) &(arrayP->procs[arrayP->maxProcs]); + + /* Fetch xid just once - see GetNewTransactionId */ + xid = UnobservedXids[index]; + + /* + * If there are no more visible xids, we're done. This works + * because UnobservedXids is maintained in strict ascending order. + */ + if (!TransactionIdIsNormal(xid) || TransactionIdPrecedes(xid, xmax)) + break; + + /* + * Typically, there will be space in the snapshot. We know that the + * unobserved xids are being run by one of the procs marked with + * an xid of InvalidTransactionId, so we will have ignored that above, + * and the xidcache for that proc will have been empty also. + * + * We put the unobserved xids into the subxid cache. The xid might + * be a top-level or it might be a subtransaction, but it won't + * change the answer to XidInMVCCSnapshot() whichever it is. 
That's + * just as well, since we don't know which it is, by definition. + * The subxid cache gets searched first, so put it there. + */ + snapshot->subxip[subcount++] = xid; + + /* + * We don't really need xmin during recovery, but lets derive + * it anyway for consistency. It is possible that an unobserved + * xid could be xmin if there is contention between long-lived + * transactions. + */ + if (TransactionIdPrecedes(xid, xmin)) + xmin = xid; } if (!TransactionIdIsValid(MyProc->xmin)) *************** *** 824,829 **** GetSnapshotData(Snapshot snapshot) --- 1162,1168 ---- snapshot->xmax = xmax; snapshot->xcnt = count; snapshot->subxcnt = subcount; + snapshot->suboverflowed = suboverflowed; snapshot->curcid = GetCurrentCommandId(false); *************** *** 839,844 **** GetSnapshotData(Snapshot snapshot) --- 1178,1415 ---- } /* + * GetRunningTransactionData -- returns information about running transactions. + * + * Similar to GetSnapshotData but returning more information. We include + * all PGPROCs with an assigned TransactionId, even VACUUM processes. We + * include slotId and databaseId for each PGPROC. We also keep track + * of which subtransactions go with each PGPROC, information which is lost + * when we GetSnapshotData. + * + * This is never executed when IsRecoveryMode() so there is no need to look + * at UnobservedXids. + * + * We don't worry about updating other counters, we want to keep this as + * simple as possible and leave GetSnapshotData() as the primary code for + * that bookkeeping. + */ + RunningTransactions + GetRunningTransactionData(void) + { + ProcArrayStruct *arrayP = procArray; + RunningTransactions CurrentRunningXacts = (RunningTransactions) &CurrentRunningXactsData; + RunningXact *rxact; + TransactionId *subxip; + TransactionId latestRunningXid = InvalidTransactionId; + TransactionId prev_latestRunningXid = InvalidTransactionId; + TransactionId latestCompletedXid; + int numAttempts = 0; + int index; + int count = 0; + int subcount = 0; + bool suboverflowed = false; + + /* + * Allocating space for maxProcs xids is usually overkill; numProcs would + * be sufficient. But it seems better to do the malloc while not holding + * the lock, so we can't look at numProcs. Likewise, we allocate much + * more subxip storage than is probably needed. + * + * Should only be allocated for bgwriter, since only ever executed + * during checkpoints. + */ + if (CurrentRunningXacts->xrun == NULL) + { + /* + * First call + */ + CurrentRunningXacts->xrun = (RunningXact *) + malloc(arrayP->maxProcs * sizeof(RunningXact)); + if (CurrentRunningXacts->xrun == NULL) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + Assert(CurrentRunningXacts->subxip == NULL); + CurrentRunningXacts->subxip = (TransactionId *) + malloc(maxNumSubXids * sizeof(TransactionId)); + if (CurrentRunningXacts->subxip == NULL) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + + rxact = CurrentRunningXacts->xrun; + subxip = CurrentRunningXacts->subxip; + + /* + * Loop until we get a valid snapshot. See exit conditions below. + */ + for (;;) + { + count = 0; + subcount = 0; + suboverflowed = false; + + LWLockAcquire(ProcArrayLock, LW_SHARED); + + latestCompletedXid = ShmemVariableCache->latestCompletedXid; + + /* + * Spin over procArray checking xid, and subxids. Shared lock is enough + * because new transactions don't use locks at all, so LW_EXCLUSIVE + * wouldn't be enough to prevent them, so don't bother. 
+ 		 */
+ 		for (index = 0; index < arrayP->numProcs; index++)
+ 		{
+ 			volatile PGPROC *proc = arrayP->procs[index];
+ 			TransactionId xid;
+ 			int			nxids;
+ 
+ 			/* Fetch xid just once - see GetNewTransactionId */
+ 			xid = proc->xid;
+ 
+ 			/*
+ 			 * We store all xids, even XIDs >= xmax and our own XID, if any.
+ 			 * But we don't store transactions that don't have a TransactionId
+ 			 * yet because they will not show as running on a standby server.
+ 			 */
+ 			if (!TransactionIdIsValid(xid))
+ 				continue;
+ 
+ 			rxact[count].xid = xid;
+ 			rxact[count].databaseId = proc->databaseId;
+ 			rxact[count].roleId = proc->roleId;
+ 			rxact[count].vacuumFlags = proc->vacuumFlags;
+ 
+ 			if (TransactionIdPrecedes(latestRunningXid, xid))
+ 				latestRunningXid = xid;
+ 
+ 			/*
+ 			 * Save subtransaction XIDs.
+ 			 *
+ 			 * The other backend can add more subxids concurrently, but cannot
+ 			 * remove any. Hence it's important to fetch nxids just once. Should
+ 			 * be safe to use memcpy, though. (We needn't worry about missing any
+ 			 * xids added concurrently, because they must postdate xmax.)
+ 			 *
+ 			 * Again, our own XIDs *are* included in the snapshot.
+ 			 */
+ 			nxids = proc->subxids.nxids;
+ 
+ 			if (nxids > 0)
+ 			{
+ 				TransactionId *subxids = (TransactionId *) proc->subxids.xids;
+ 
+ 				rxact[count].subx_offset = subcount;
+ 
+ 				memcpy(subxip + subcount,
+ 					   (void *) proc->subxids.xids,
+ 					   nxids * sizeof(TransactionId));
+ 				subcount += nxids;
+ 
+ 				if (proc->subxids.overflowed)
+ 				{
+ 					rxact[count].overflowed = true;
+ 					suboverflowed = true;
+ 				}
+ 
+ 				if (TransactionIdPrecedes(latestRunningXid, subxids[nxids - 1]))
+ 					latestRunningXid = subxids[nxids - 1];
+ 			}
+ 			else
+ 			{
+ 				rxact[count].subx_offset = 0;
+ 				rxact[count].overflowed = false;
+ 			}
+ 
+ 			rxact[count].nsubxids = nxids;
+ 
+ 			count++;
+ 		}
+ 
+ 		LWLockRelease(ProcArrayLock);
+ 
+ 		/*
+ 		 * If there are no procs with TransactionIds allocated, we need to
+ 		 * find the last xid that was assigned. This takes and releases
+ 		 * XidGenLock, but that shouldn't cause contention in this case.
+ 		 * We could also do this if the snapshot overflowed, but in that
+ 		 * case we expect contention on XidGenLock might be high, so we punt.
+ 		 *
+ 		 * By the time we do this, another proc may have incremented the
+ 		 * nextxid, so we must rescan the procarray to check whether
+ 		 * there are either new running transactions or the counter is
+ 		 * the same as before. If transactions appear and disappear
+ 		 * faster than we can do this, we're in trouble. So spin for a
+ 		 * few attempts (MAX_SNAPSHOT_ATTEMPTS) before giving up.
+ 		 *
+ 		 * We do it this way to avoid needing to grab XidGenLock in all
+ 		 * cases, which is hardly ever actually required.
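+ 		 *
+ 		 * Worked example (values invented): with no xid-bearing procs
+ 		 * we read ReadNewTransactionId() - 1, say 1041. If a rescan
+ 		 * still finds no running xacts and a second read again yields
+ 		 * 1041, we accept 1041 as latestRunningXid and stop retrying.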
+ */ + if (count > 0) + break; + else + { + #define MAX_SNAPSHOT_ATTEMPTS 3 + if (numAttempts >= MAX_SNAPSHOT_ATTEMPTS) + { + latestRunningXid = InvalidTransactionId; + break; + } + + latestRunningXid = ReadNewTransactionId(); + TransactionIdRetreat(latestRunningXid); + + if (prev_latestRunningXid == latestRunningXid) + break; + + prev_latestRunningXid = latestRunningXid; + numAttempts++; + } + } + + CurrentRunningXacts->xcnt = count; + CurrentRunningXacts->subxcnt = subcount; + CurrentRunningXacts->latestCompletedXid = latestCompletedXid; + if (!suboverflowed) + CurrentRunningXacts->latestRunningXid = latestRunningXid; + else + CurrentRunningXacts->latestRunningXid = InvalidTransactionId; + + #ifdef RUNNING_XACT_DEBUG + elog(trace_recovery(DEBUG3), + "logging running xacts xcnt %d subxcnt %d latestCompletedXid %d latestRunningXid %d", + CurrentRunningXacts->xcnt, + CurrentRunningXacts->subxcnt, + CurrentRunningXacts->latestCompletedXid, + CurrentRunningXacts->latestRunningXid); + + for (index = 0; index < CurrentRunningXacts->xcnt; index++) + { + int j; + elog(trace_recovery(DEBUG3), + "xid %d pid %d backend %d db %d role %d nsubxids %d offset %d vf %u, overflow %s", + CurrentRunningXacts->xrun[index].xid, + CurrentRunningXacts->xrun[index].pid, + CurrentRunningXacts->xrun[index].slotId, + CurrentRunningXacts->xrun[index].databaseId, + CurrentRunningXacts->xrun[index].roleId, + CurrentRunningXacts->xrun[index].nsubxids, + CurrentRunningXacts->xrun[index].subx_offset, + CurrentRunningXacts->xrun[index].vacuumFlags, + CurrentRunningXacts->xrun[index].overflowed ? "t" : "f"); + for (j = 0; j < CurrentRunningXacts->xrun[index].nsubxids; j++) + elog(trace_recovery(DEBUG3), + "subxid offset %d j %d xid %d", + CurrentRunningXacts->xrun[index].subx_offset, j, + CurrentRunningXacts->subxip[j + CurrentRunningXacts->xrun[index].subx_offset]); + } + #endif + + return CurrentRunningXacts; + } + + /* * GetTransactionsInCommit -- Get the XIDs of transactions that are committing * * Constructs an array of XIDs of transactions that are currently in commit *************** *** 968,973 **** BackendPidGetProc(int pid) --- 1539,1579 ---- } /* + * BackendXidGetProc -- get a backend's PGPROC given its XID + * + * Returns NULL if not found. Note that it is up to the caller to be + * sure that the question remains meaningful for long enough for the + * answer to be used ... + */ + PGPROC * + BackendXidGetProc(TransactionId xid) + { + PGPROC *result = NULL; + ProcArrayStruct *arrayP = procArray; + int index; + + if (xid == InvalidTransactionId) /* never match invalid xid */ + return 0; + + LWLockAcquire(ProcArrayLock, LW_SHARED); + + for (index = 0; index < arrayP->numProcs; index++) + { + PGPROC *proc = arrayP->procs[index]; + + if (proc->xid == xid) + { + result = proc; + break; + } + } + + LWLockRelease(ProcArrayLock); + + return result; + } + + /* * BackendXidGetPid -- get a backend's pid given its XID * * Returns 0 if not found or it's a prepared transaction. Note that *************** *** 1024,1036 **** IsBackendPid(int pid) * The array is palloc'd and is terminated with an invalid VXID. * * If limitXmin is not InvalidTransactionId, we skip any backends ! * with xmin >= limitXmin. If allDbs is false, we skip backends attached * to other databases. If excludeVacuum isn't zero, we skip processes for * which (excludeVacuum & vacuumFlags) is not zero. Also, our own process * is always skipped. */ VirtualTransactionId * ! 
GetCurrentVirtualXIDs(TransactionId limitXmin, bool allDbs, int excludeVacuum) { VirtualTransactionId *vxids; ProcArrayStruct *arrayP = procArray; --- 1630,1642 ---- * The array is palloc'd and is terminated with an invalid VXID. * * If limitXmin is not InvalidTransactionId, we skip any backends ! * with xmin >= limitXmin. If dbOid is valid we skip backends attached * to other databases. If excludeVacuum isn't zero, we skip processes for * which (excludeVacuum & vacuumFlags) is not zero. Also, our own process * is always skipped. */ VirtualTransactionId * ! GetCurrentVirtualXIDs(TransactionId limitXmin, Oid dbOid, int excludeVacuum) { VirtualTransactionId *vxids; ProcArrayStruct *arrayP = procArray; *************** *** 1053,1059 **** GetCurrentVirtualXIDs(TransactionId limitXmin, bool allDbs, int excludeVacuum) if (excludeVacuum & proc->vacuumFlags) continue; ! if (allDbs || proc->databaseId == MyDatabaseId) { /* Fetch xmin just once - might change on us? */ TransactionId pxmin = proc->xmin; --- 1659,1665 ---- if (excludeVacuum & proc->vacuumFlags) continue; ! if (!OidIsValid(dbOid) || proc->databaseId == dbOid) { /* Fetch xmin just once - might change on us? */ TransactionId pxmin = proc->xmin; *************** *** 1083,1088 **** GetCurrentVirtualXIDs(TransactionId limitXmin, bool allDbs, int excludeVacuum) --- 1689,1725 ---- return vxids; } + int + VirtualTransactionIdGetPid(VirtualTransactionId vxid) + { + ProcArrayStruct *arrayP = procArray; + int result = 0; + int index; + + if (!VirtualTransactionIdIsValid(vxid)) + return 0; + + LWLockAcquire(ProcArrayLock, LW_SHARED); + + for (index = 0; index < arrayP->numProcs; index++) + { + VirtualTransactionId procvxid; + PGPROC *proc = arrayP->procs[index]; + + GET_VXID_FROM_PGPROC(procvxid, *proc); + + if (procvxid.backendId == vxid.backendId && + procvxid.localTransactionId == vxid.localTransactionId) + { + result = proc->pid; + break; + } + } + + LWLockRelease(ProcArrayLock); + + return result; + } /* * CountActiveBackends --- count backends (other than myself) that are in *************** *** 1367,1369 **** DisplayXidCache(void) --- 2004,2210 ---- } #endif /* XIDCACHE_DEBUG */ + + /* ---------------------------------------------- + * UnobservedTransactions sub-module + * ---------------------------------------------- + * + * All functions must be called holding ProcArrayLock. + */ + + /* + * Add unobserved xids to end of UnobservedXids array + */ + void + UnobservedTransactionsAddXids(TransactionId firstXid, TransactionId lastXid) + { + TransactionId ixid = firstXid; + int index = procArray->numUnobservedXids; + TransactionId *UnobservedXids; + + UnobservedXids = (TransactionId *) &(procArray->procs[procArray->maxProcs]); + + Assert(TransactionIdIsNormal(firstXid)); + Assert(TransactionIdIsNormal(lastXid)); + Assert(TransactionIdPrecedes(firstXid, lastXid)); + + /* + * UnobservedXids is maintained as a ascending list of xids, with no gaps. + * Incoming xids are always higher than previous entries, so we just add + * them directly to the end of the array. 
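+ *
+ * Worked example (xids invented for illustration): with
+ * UnobservedXids = {101, 102} and a call with firstXid = 103,
+ * lastXid = 106, the loop appends 103, 104 and 105, giving
+ * {101, 102, 103, 104, 105}. Note that lastXid itself is not added;
+ * the loop stops when ixid reaches it.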
+ */
+ 	while (ixid != lastXid)
+ 	{
+ 		/*
+ 		 * check to see if we have space to store more UnobservedXids
+ 		 */
+ 		if (index >= procArray->maxUnobservedXids)
+ 		{
+ 			UnobservedTransactionsDisplay(WARNING);
+ 			elog(FATAL, "No more room in UnobservedXids array");
+ 		}
+ 
+ 		/*
+ 		 * append ixid to UnobservedXids
+ 		 */
+ #ifdef USE_ASSERT_CHECKING
+ 		if (TransactionIdIsValid(UnobservedXids[index]) ||
+ 			(index > 0 && TransactionIdPrecedes(UnobservedXids[index - 1], ixid)))
+ 			UnobservedTransactionsDisplay(LOG);
+ #endif
+ 
+ 		elog(trace_recovery(DEBUG4), "Adding UnobservedXid %d", ixid);
+ 		UnobservedXids[index] = ixid;
+ 		index++;
+ 
+ 		TransactionIdAdvance(ixid);
+ 	}
+ 
+ 	procArray->numUnobservedXids = index;
+ }
+ 
+ /*
+ * Remove one unobserved xid from anywhere on UnobservedXids array.
+ * If xid has already been pruned away, no need to report as missing.
+ */
+ void
+ UnobservedTransactionsRemoveXid(TransactionId xid, bool missing_is_error)
+ {
+ 	int			index;
+ 	bool		found = false;
+ 	TransactionId *UnobservedXids;
+ 
+ 	UnobservedXids = (TransactionId *) &(procArray->procs[procArray->maxProcs]);
+ 
+ 	elog(trace_recovery(DEBUG4), "Remove UnobservedXid = %d", xid);
+ 
+ 	/*
+ 	 * If we haven't initialised the array yet, or if we've already cleared
+ 	 * it, ignore this and get on with it. Beyond this point it is an ERROR
+ 	 * if the value isn't present when removal is requested with
+ 	 * missing_is_error set.
+ 	 */
+ 	if (procArray->numUnobservedXids == 0 ||
+ 		(procArray->numUnobservedXids > 0 &&
+ 		 TransactionIdPrecedes(xid, UnobservedXids[0])))
+ 		return;
+ 
+ 	/*
+ 	 * XXX we could use bsearch, if this has significant overhead.
+ 	 */
+ 	for (index = 0; index < procArray->numUnobservedXids; index++)
+ 	{
+ 		if (!found)
+ 		{
+ 			if (UnobservedXids[index] == xid)
+ 				found = true;
+ 		}
+ 		else
+ 		{
+ 			UnobservedXids[index - 1] = UnobservedXids[index];
+ 		}
+ 	}
+ 
+ 	if (found)
+ 		UnobservedXids[--procArray->numUnobservedXids] = InvalidTransactionId;
+ 
+ 	if (!found && missing_is_error)
+ 	{
+ 		UnobservedTransactionsDisplay(LOG);
+ 		elog(ERROR, "could not remove unobserved xid = %d", xid);
+ 	}
+ }
+ 
+ /*
+ * Prune array up to a particular limit. This frequently means clearing the
+ * whole array, but we don't attempt to optimise for that at present.
+ */
+ void
+ UnobservedTransactionsPruneXids(TransactionId limitXid)
+ {
+ 	int			index;
+ 	int			pruneUpToThisIndex = 0;
+ 	TransactionId *UnobservedXids;
+ 
+ 	UnobservedXids = (TransactionId *) &(procArray->procs[procArray->maxProcs]);
+ 
+ 	elog(trace_recovery(DEBUG4), "prune UnobservedXids up to %d", limitXid);
+ 
+ 	for (index = 0; index < procArray->numUnobservedXids; index++)
+ 	{
+ 		if (TransactionIdFollowsOrEquals(limitXid, UnobservedXids[index]))
+ 			pruneUpToThisIndex = index + 1;
+ 		else
+ 		{
+ 			/*
+ 			 * Anything to delete?
+ 			 */
+ 			if (pruneUpToThisIndex == 0)
+ 				return;
+ 
+ 			/*
+ 			 * Move unpruned values to start of array
+ 			 */
+ 			UnobservedXids[index - pruneUpToThisIndex] = UnobservedXids[index];
+ 			UnobservedXids[index] = 0;
+ 		}
+ 	}
+ 
+ 	procArray->numUnobservedXids -= pruneUpToThisIndex;
+ }
+ 
+ /*
+ * Clear the whole array.
+ */
+ void
+ UnobservedTransactionsClearXids(void)
+ {
+ 	int			index;
+ 	TransactionId *UnobservedXids;
+ 
+ 	elog(trace_recovery(DEBUG4), "Clear UnobservedXids");
+ 
+ 	UnobservedXids = (TransactionId *) &(procArray->procs[procArray->maxProcs]);
+ 
+ 	/*
+ 	 * UnobservedTransactionsAddXids() asserts that the array slots are
+ 	 * empty when we add new values, so it must be zeroed here each time.
+ */ + for (index = 0; index < procArray->numUnobservedXids; index++) + { + UnobservedXids[index] = 0; + } + + procArray->numUnobservedXids = 0; + } + + void + UnobservedTransactionsDisplay(int trace_level) + { + int index; + TransactionId *UnobservedXids; + + UnobservedXids = (TransactionId *) &(procArray->procs[procArray->maxProcs]); + + for (index = 0; index < procArray->maxUnobservedXids; index++) + { + if (TransactionIdIsValid(UnobservedXids[index])) + elog(trace_level, "%d unobserved[%d] = %d ", + procArray->numUnobservedXids, index, UnobservedXids[index]); + } + } + + bool + XidInUnobservedTransactions(TransactionId xid) + { + int index; + TransactionId *UnobservedXids; + + UnobservedXids = (TransactionId *) &(procArray->procs[procArray->maxProcs]); + + for (index = 0; index < procArray->numUnobservedXids; index++) + { + if (UnobservedXids[index] == xid) + return true; + } + return false; + } *** src/backend/storage/ipc/sinvaladt.c --- src/backend/storage/ipc/sinvaladt.c *************** *** 142,147 **** typedef struct ProcState --- 142,148 ---- int nextMsgNum; /* next message number to read */ bool resetState; /* backend needs to reset its state */ bool signaled; /* backend has been sent catchup signal */ + bool sendOnly; /* backend only sends, never receives */ /* * Next LocalTransactionId to use for each idle backend slot. We keep *************** *** 248,254 **** CreateSharedInvalidationState(void) * Initialize a new backend to operate on the sinval buffer */ void ! SharedInvalBackendInit(void) { int index; ProcState *stateP = NULL; --- 249,255 ---- * Initialize a new backend to operate on the sinval buffer */ void ! SharedInvalBackendInit(bool sendOnly) { int index; ProcState *stateP = NULL; *************** *** 307,312 **** SharedInvalBackendInit(void) --- 308,314 ---- stateP->nextMsgNum = segP->maxMsgNum; stateP->resetState = false; stateP->signaled = false; + stateP->sendOnly = sendOnly; LWLockRelease(SInvalWriteLock); *************** *** 578,584 **** SICleanupQueue(bool callerHasWriteLock, int minFree) /* * Recompute minMsgNum = minimum of all backends' nextMsgNum, identify * the furthest-back backend that needs signaling (if any), and reset ! * any backends that are too far back. */ min = segP->maxMsgNum; minsig = min - SIG_THRESHOLD; --- 580,588 ---- /* * Recompute minMsgNum = minimum of all backends' nextMsgNum, identify * the furthest-back backend that needs signaling (if any), and reset ! * any backends that are too far back. Note that because we ignore ! * sendOnly backends here it is possible for them to keep sending ! * messages without a problem even when they are the only active backend. */ min = segP->maxMsgNum; minsig = min - SIG_THRESHOLD; *************** *** 590,596 **** SICleanupQueue(bool callerHasWriteLock, int minFree) int n = stateP->nextMsgNum; /* Ignore if inactive or already in reset state */ ! if (stateP->procPid == 0 || stateP->resetState) continue; /* --- 594,600 ---- int n = stateP->nextMsgNum; /* Ignore if inactive or already in reset state */ ! 
if (stateP->procPid == 0 || stateP->resetState || stateP->sendOnly)
  			continue;
  
  		/*
*** src/backend/storage/lmgr/lock.c
--- src/backend/storage/lmgr/lock.c
***************
*** 35,43 ****
--- 35,45 ----
  #include "access/transam.h"
  #include "access/twophase.h"
  #include "access/twophase_rmgr.h"
+ #include "access/xact.h"
  #include "miscadmin.h"
  #include "pg_trace.h"
  #include "pgstat.h"
+ #include "storage/sinval.h"
  #include "utils/memutils.h"
  #include "utils/ps_status.h"
  #include "utils/resowner.h"
***************
*** 490,495 **** LockAcquire(const LOCKTAG *locktag,
--- 492,506 ----
  	if (lockmode <= 0 || lockmode > lockMethodTable->numLockModes)
  		elog(ERROR, "unrecognized lock mode: %d", lockmode);
  
+ 	if (IsRecoveryProcessingMode() &&
+ 		locktag->locktag_type == LOCKTAG_OBJECT &&
+ 		lockmode > AccessShareLock)
+ 		ereport(ERROR,
+ 				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ 				 errmsg("cannot acquire lockmode %s on database objects while recovery is in progress",
+ 						lockMethodTable->lockModeNames[lockmode]),
+ 				 errhint("Only AccessShareLock can be acquired on database objects during recovery.")));
+ 
  #ifdef LOCK_DEBUG
  	if (LOCK_DEBUG_ENABLED(locktag))
  		elog(LOG, "LockAcquire: lock [%u,%u] %s",
***************
*** 817,822 **** LockAcquire(const LOCKTAG *locktag,
--- 828,881 ----
  
  	LWLockRelease(partitionLock);
  
+ 	/*
+ 	 * We made it all the way here. We've got the lock and we've got
+ 	 * it for the first time in this transaction. So now it's time
+ 	 * to send a WAL message so that standby servers can see this event,
+ 	 * if it's an AccessExclusiveLock on a relation.
+ 	 */
+ 	if (!InRecovery && lockmode >= AccessExclusiveLock &&
+ 		locktag->locktag_type == LOCKTAG_RELATION)
+ 	{
+ 		XLogRecData rdata;
+ 		xl_rel_lock xlrec;
+ 		TransactionId xid;
+ 
+ 		/*
+ 		 * First thing we do is ensure that a TransactionId has been
+ 		 * assigned to this transaction. We don't actually need the xid
+ 		 * but if we don't do this then RecordTransactionCommit() and
+ 		 * RecordTransactionAbort() will optimise away the transaction
+ 		 * completion record which recovery relies upon to release locks.
+ 		 * It's a hack, but this is a corner case not worth complicating
+ 		 * the main commit path for.
+ 		 */
+ 		xid = GetTopTransactionId();
+ 		Assert(TransactionIdIsValid(xid));
+ 
+ 		Assert(OidIsValid(locktag->locktag_field2));
+ 
+ 		START_CRIT_SECTION();
+ 
+ 		/*
+ 		 * Decode the locktag back to the original values, to avoid
+ 		 * sending lots of empty bytes with every message. See
See + * lock.h to check how a locktag is defined for LOCKTAG_RELATION + */ + xlrec.xid = xid; + xlrec.dbOid = locktag->locktag_field1; + xlrec.relOid = locktag->locktag_field2; + + rdata.data = (char *) (&xlrec); + rdata.len = sizeof(xl_rel_lock); + rdata.buffer = InvalidBuffer; + rdata.next = NULL; + + (void) XLogInsert(RM_RELATION_ID, XLOG_RELATION_LOCK, &rdata); + + END_CRIT_SECTION(); + } + return LOCKACQUIRE_OK; } *** src/backend/storage/lmgr/proc.c --- src/backend/storage/lmgr/proc.c *************** *** 103,108 **** ProcGlobalShmemSize(void) --- 103,110 ---- size = add_size(size, mul_size(NUM_AUXILIARY_PROCS, sizeof(PGPROC))); /* MyProcs, including autovacuum */ size = add_size(size, mul_size(MaxBackends, sizeof(PGPROC))); + /* RecoveryProcs, including recovery actions by autovacuum */ + size = add_size(size, mul_size(MaxBackends, sizeof(PGPROC))); /* ProcStructLock */ size = add_size(size, sizeof(slock_t)); *************** *** 204,209 **** InitProcGlobal(void) --- 206,230 ---- ProcGlobal->autovacFreeProcs = &procs[i]; } + /* + * Create enough Recovery Procs so there is a shadow proc for every + * normal proc. Recovery procs don't need semaphores because they + * aren't actually performing any work, they are just ghosts with + * enough substance to store enough information to make them look + * real to anyone requesting a snapshot from the procarray. + */ + procs = (PGPROC *) ShmemAlloc((MaxBackends) * sizeof(PGPROC)); + if (!procs) + ereport(FATAL, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of shared memory"))); + MemSet(procs, 0, MaxBackends * sizeof(PGPROC)); + for (i = 0; i < MaxBackends; i++) + { + procs[i].links.next = (SHM_QUEUE *) ProcGlobal->freeProcs; + ProcGlobal->freeProcs = &procs[i]; + } + MemSet(AuxiliaryProcs, 0, NUM_AUXILIARY_PROCS * sizeof(PGPROC)); for (i = 0; i < NUM_AUXILIARY_PROCS; i++) { *************** *** 277,282 **** InitProcess(void) --- 298,304 ---- /* * Initialize all fields of MyProc, except for the semaphore which was * prepared for us by InitProcGlobal. + * Recovery snapshot processing relies completely on this never changing. */ SHMQueueElemInit(&(MyProc->links)); MyProc->waitStatus = STATUS_OK; *************** *** 319,324 **** InitProcess(void) --- 341,440 ---- InitDeadLockChecking(); } + void + FreeRecoveryProcess(PGPROC *proc) + { + volatile PROC_HDR *procglobal = ProcGlobal; + + SpinLockAcquire(ProcStructLock); + + /* Return struct to freelist */ + proc->links.next = (SHM_QUEUE *) procglobal->freeProcs; + procglobal->freeProcs = proc; + + SpinLockRelease(ProcStructLock); + } + + /* + * InitRecoveryProcess -- initialize a per-master process data structure + * for use when emulating transactions in recovery + */ + PGPROC * + InitRecoveryProcess(void) + { + /* use volatile pointer to prevent code rearrangement */ + volatile PROC_HDR *procglobal = ProcGlobal; + PGPROC *ThisProc = NULL; + + /* + * ProcGlobal should be set up already (if we are a backend, we inherit + * this by fork() or EXEC_BACKEND mechanism from the postmaster). + */ + if (procglobal == NULL) + elog(PANIC, "proc header uninitialized"); + + /* + * Try to get a proc struct from the free list. If this fails, we must be + * out of PGPROC structures (not to mention semaphores). + */ + SpinLockAcquire(ProcStructLock); + + ThisProc = procglobal->freeProcs; + + if (ThisProc != NULL) + { + procglobal->freeProcs = (PGPROC *) ThisProc->links.next; + SpinLockRelease(ProcStructLock); + } + else + { + /* + * Should never reach here if shared memory is allocated correctly. 
+ */ + SpinLockRelease(ProcStructLock); + elog(FATAL, "too many procs - could not create recovery proc"); + } + + /* + * xid will be set later as WAL records arrive for this recovery proc + */ + ThisProc->xid = InvalidTransactionId; + + /* + * The backendid of the recovery proc stays at InvalidBackendId. There + * is a direct 1:1 correspondence between a master backendid and this + * proc, but that same backendid may also be in use during recovery, + * so if we set this field we would have duplicate backendids. + */ + ThisProc->backendId = InvalidBackendId; + + /* + * The following are not used in recovery + */ + ThisProc->pid = 0; + + SHMQueueElemInit(&(ThisProc->links)); + ThisProc->waitStatus = STATUS_OK; + ThisProc->lxid = InvalidLocalTransactionId; + ThisProc->xmin = InvalidTransactionId; + ThisProc->databaseId = InvalidOid; + ThisProc->roleId = InvalidOid; + ThisProc->inCommit = false; + ThisProc->vacuumFlags = 0; + ThisProc->lwWaiting = false; + ThisProc->lwExclusive = false; + ThisProc->lwWaitLink = NULL; + ThisProc->waitLock = NULL; + ThisProc->waitProcLock = NULL; + + /* + * There is little else to do. The recovery proc is never used to + * acquire buffers, nor will we ever acquire LWlocks using the proc. + * Deadlock checker is not active during recovery. + */ + return ThisProc; + } + /* * InitProcessPhase2 -- make MyProc visible in the shared ProcArray. * *************** *** 363,368 **** InitProcessPhase2(void) --- 479,489 ---- * to the ProcArray or the sinval messaging mechanism, either. They also * don't get a VXID assigned, since this is only useful when we actually * hold lockmgr locks. + * + * Startup process however uses locks but never waits for them in the + * normal backend sense. Startup process also takes part in sinval messaging + * as a sendOnly process, so never reads messages from sinval queue. So + * Startup process does have a VXID and does show up in pg_locks. */ void InitAuxiliaryProcess(void) *************** *** 452,457 **** InitAuxiliaryProcess(void) --- 573,595 ---- } /* + * Additional initialisation for Startup process + */ + void + PublishStartupProcessInformation(void) + { + /* use volatile pointer to prevent code rearrangement */ + volatile PROC_HDR *procglobal = ProcGlobal; + + SpinLockAcquire(ProcStructLock); + + procglobal->startupProc = MyProc; + procglobal->startupProcPid = MyProcPid; + + SpinLockRelease(ProcStructLock); + } + + /* * Check whether there are at least N free PGPROC objects. * * Note: this is designed on the assumption that N will generally be small. *************** *** 1271,1277 **** ProcWaitForSignal(void) void ProcSendSignal(int pid) { ! PGPROC *proc = BackendPidGetProc(pid); if (proc != NULL) PGSemaphoreUnlock(&proc->sem); --- 1409,1438 ---- void ProcSendSignal(int pid) { ! PGPROC *proc = NULL; ! ! /* ! * Check to see whether it is the Startup process we wish to signal. ! * We could initialise this elsewhere, but then have a function in ! * proc.c calling a function in procarray.c calling a function in ! * proc.c which is more confusing and error prone than just putting ! * this code where it's needed. ! */ ! if (IsRecoveryProcessingMode()) ! { ! /* use volatile pointer to prevent code rearrangement */ ! volatile PROC_HDR *procglobal = ProcGlobal; ! ! SpinLockAcquire(ProcStructLock); ! ! if (pid == procglobal->startupProcPid) ! proc = procglobal->startupProc; ! ! SpinLockRelease(ProcStructLock); ! } ! ! if (proc == NULL) ! 
proc = BackendPidGetProc(pid); if (proc != NULL) PGSemaphoreUnlock(&proc->sem); *** src/backend/tcop/utility.c --- src/backend/tcop/utility.c *************** *** 287,296 **** ProcessUtility(Node *parsetree, --- 287,308 ---- SetPGVariable("transaction_isolation", list_make1(item->arg), true); + else if (strcmp(item->defname, "transaction_read_only") == 0) + { + A_Const *con; + + Assert(IsA(item->arg, A_Const)); + con = (A_Const *) item->arg; + Assert(nodeTag(&con->val) == T_Integer); + + if (!intVal(&con->val)) + PreventCommandDuringRecovery(); + SetPGVariable("transaction_read_only", list_make1(item->arg), true); + } } } break; *************** *** 305,310 **** ProcessUtility(Node *parsetree, --- 317,323 ---- break; case TRANS_STMT_PREPARE: + PreventCommandDuringRecovery(); if (!PrepareTransactionBlock(stmt->gid)) { /* report unsuccessful commit in completionTag */ *************** *** 314,324 **** ProcessUtility(Node *parsetree, --- 327,339 ---- break; case TRANS_STMT_COMMIT_PREPARED: + PreventCommandDuringRecovery(); PreventTransactionChain(isTopLevel, "COMMIT PREPARED"); FinishPreparedTransaction(stmt->gid, true); break; case TRANS_STMT_ROLLBACK_PREPARED: + PreventCommandDuringRecovery(); PreventTransactionChain(isTopLevel, "ROLLBACK PREPARED"); FinishPreparedTransaction(stmt->gid, false); break; *************** *** 676,681 **** ProcessUtility(Node *parsetree, --- 691,697 ---- break; case T_GrantStmt: + PreventCommandDuringRecovery(); ExecuteGrantStmt((GrantStmt *) parsetree); break; *************** *** 846,851 **** ProcessUtility(Node *parsetree, --- 862,868 ---- case T_NotifyStmt: { NotifyStmt *stmt = (NotifyStmt *) parsetree; + PreventCommandDuringRecovery(); Async_Notify(stmt->conditionname); } *************** *** 854,859 **** ProcessUtility(Node *parsetree, --- 871,877 ---- case T_ListenStmt: { ListenStmt *stmt = (ListenStmt *) parsetree; + PreventCommandDuringRecovery(); Async_Listen(stmt->conditionname); } *************** *** 862,867 **** ProcessUtility(Node *parsetree, --- 880,886 ---- case T_UnlistenStmt: { UnlistenStmt *stmt = (UnlistenStmt *) parsetree; + PreventCommandDuringRecovery(); if (stmt->conditionname) Async_Unlisten(stmt->conditionname); *************** *** 881,890 **** ProcessUtility(Node *parsetree, --- 900,911 ---- break; case T_ClusterStmt: + PreventCommandDuringRecovery(); cluster((ClusterStmt *) parsetree, isTopLevel); break; case T_VacuumStmt: + PreventCommandDuringRecovery(); vacuum((VacuumStmt *) parsetree, InvalidOid, true, NULL, false, isTopLevel); break; *************** *** 1000,1011 **** ProcessUtility(Node *parsetree, --- 1021,1034 ---- ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("must be superuser to do CHECKPOINT"))); + PreventCommandDuringRecovery(); RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_FORCE | CHECKPOINT_WAIT); break; case T_ReindexStmt: { ReindexStmt *stmt = (ReindexStmt *) parsetree; + PreventCommandDuringRecovery(); switch (stmt->kind) { *************** *** 2490,2492 **** GetCommandLogLevel(Node *parsetree) --- 2513,2524 ---- return lev; } + + void + PreventCommandDuringRecovery(void) + { + if (IsRecoveryProcessingMode()) + ereport(ERROR, + (errcode(ERRCODE_READ_ONLY_SQL_TRANSACTION), + errmsg("cannot be run until recovery completes"))); + } *** src/backend/utils/adt/txid.c --- src/backend/utils/adt/txid.c *************** *** 338,343 **** txid_current(PG_FUNCTION_ARGS) --- 338,349 ---- txid val; TxidEpoch state; + if (IsRecoveryProcessingMode()) + ereport(ERROR, + 
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("cannot assign txid while recovery is in progress"),
+ errhint("Only read-only queries can execute during recovery.")));
+
load_xid_epoch(&state);
val = convert_xid(GetTopTransactionId(), &state);
*** src/backend/utils/cache/inval.c
--- src/backend/utils/cache/inval.c
***************
*** 86,95 ****
--- 86,100 ----
*/
#include "postgres.h"
+ #include <signal.h>
+
+ #include "access/transam.h"
#include "access/twophase_rmgr.h"
#include "access/xact.h"
#include "catalog/catalog.h"
#include "miscadmin.h"
+ #include "storage/lmgr.h"
+ #include "storage/procarray.h"
#include "storage/sinval.h"
#include "storage/smgr.h"
#include "utils/inval.h"
***************
*** 155,160 **** typedef struct TransInvalidationInfo
--- 160,173 ----
static TransInvalidationInfo *transInvalInfo = NULL;
+ static SharedInvalidationMessage *SharedInvalidMessagesArray;
+ static int numSharedInvalidMessagesArray;
+ static int maxSharedInvalidMessagesArray;
+
+ static List *RecoveryLockList;
+ static MemoryContext RelationLockContext;
+
+
/*
* Dynamically-registered callback functions. Current implementation
* assumes there won't be very many of these at once; could improve if needed.
***************
*** 741,746 **** AtStart_Inval(void)
--- 754,761 ----
MemoryContextAllocZero(TopTransactionContext,
sizeof(TransInvalidationInfo));
transInvalInfo->my_level = GetCurrentTransactionNestLevel();
+ SharedInvalidMessagesArray = NULL;
+ numSharedInvalidMessagesArray = 0;
}
/*
***************
*** 851,856 **** inval_twophase_postcommit(TransactionId xid, uint16 info,
--- 866,991 ----
}
}
+ static void
+ MakeSharedInvalidMessagesArray(const SharedInvalidationMessage *msgs, int n)
+ {
+ /*
+ * Initialise array first time through in each commit
+ */
+ if (SharedInvalidMessagesArray == NULL)
+ {
+ maxSharedInvalidMessagesArray = FIRSTCHUNKSIZE;
+ numSharedInvalidMessagesArray = 0;
+
+ /*
+ * Although this is being palloc'd we don't actually free it directly.
+ * We're so close to EOXact that we know we're going to lose it anyhow.
+ */
+ SharedInvalidMessagesArray = palloc(maxSharedInvalidMessagesArray
+ * sizeof(SharedInvalidationMessage));
+ }
+
+ if ((numSharedInvalidMessagesArray + n) > maxSharedInvalidMessagesArray)
+ {
+ while ((numSharedInvalidMessagesArray + n) > maxSharedInvalidMessagesArray)
+ maxSharedInvalidMessagesArray *= 2;
+
+ SharedInvalidMessagesArray = repalloc(SharedInvalidMessagesArray,
+ maxSharedInvalidMessagesArray
+ * sizeof(SharedInvalidationMessage));
+ }
+
+ /*
+ * Append the next chunk onto the array
+ */
+ memcpy(SharedInvalidMessagesArray + numSharedInvalidMessagesArray,
+ msgs, n * sizeof(SharedInvalidationMessage));
+ numSharedInvalidMessagesArray += n;
+ }
+
+ /*
+ * xactGetCommittedInvalidationMessages() is executed by
+ * RecordTransactionCommit() to add invalidation messages onto the
+ * commit record. This applies only to commit message types, never to
+ * abort records. Must always run before AtEOXact_Inval(), since that
+ * removes the data we need to see.
+ *
+ * Remember that this runs before we have officially committed, so we
+ * must not do anything here to change what might occur *if* we should
+ * fail between here and the actual commit.
+ *
+ * Note that transactional invalidation does *not* write an invalidation
+ * WAL message using XLOG_RELATION_INVAL messages. Those are only used
+ * by non-transactional invalidation. See comments in
+ * EndNonTransactionalInvalidation().
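The growth strategy in MakeSharedInvalidMessagesArray() above is the usual amortized-doubling append into one contiguous buffer, so the result can later be copied straight into a WAL record. A minimal standalone sketch of the same strategy (toy message type, hypothetical names; the initial chunk size is an assumption, not the patch's FIRSTCHUNKSIZE value):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef struct Msg { int id; } Msg;

#define FIRSTCHUNKSIZE 4            /* assumed initial allocation */

static Msg *array = NULL;
static int  nused = 0;
static int  nalloc = 0;

/* Append n messages, doubling the allocation as needed, so the
 * accumulated result stays one contiguous block. */
static void append_msgs(const Msg *msgs, int n)
{
    if (array == NULL)
    {
        nalloc = FIRSTCHUNKSIZE;
        nused = 0;
        array = malloc(nalloc * sizeof(Msg));
    }
    if (nused + n > nalloc)
    {
        while (nused + n > nalloc)
            nalloc *= 2;
        array = realloc(array, nalloc * sizeof(Msg));
    }
    memcpy(array + nused, msgs, n * sizeof(Msg));
    nused += n;
}

int main(void)
{
    Msg batch[3] = {{1}, {2}, {3}};

    for (int i = 0; i < 5; i++)
        append_msgs(batch, 3);
    printf("%d messages in %d slots\n", nused, nalloc);   /* 15 in 16 */
    free(array);
    return 0;
}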
+ * + * see also xact_redo_commit() and xact_desc_commit() + */ + int + xactGetCommittedInvalidationMessages(SharedInvalidationMessage **msgs, + bool *RelcacheInitFileInval) + { + MemoryContext oldcontext; + + /* Must be at top of stack */ + Assert(transInvalInfo != NULL && transInvalInfo->parent == NULL); + + /* + * Relcache init file invalidation requires processing both before and + * after we send the SI messages. However, we need not do anything + * unless we committed. + */ + if (transInvalInfo->RelcacheInitFileInval) + *RelcacheInitFileInval = true; + else + *RelcacheInitFileInval = false; + + /* + * Walk through TransInvalidationInfo to collect all the messages + * into a single contiguous array of invalidation messages. It must + * be contiguous so we can copy directly into WAL message. Maintain the + * order that they would be processed in by AtEOXact_Inval(), to ensure + * emulated behaviour in redo is as similar as possible to original. + * We want the same bugs, if any, not new ones. + */ + oldcontext = MemoryContextSwitchTo(CurTransactionContext); + + ProcessInvalidationMessagesMulti(&transInvalInfo->CurrentCmdInvalidMsgs, + MakeSharedInvalidMessagesArray); + ProcessInvalidationMessagesMulti(&transInvalInfo->PriorCmdInvalidMsgs, + MakeSharedInvalidMessagesArray); + MemoryContextSwitchTo(oldcontext); + + #ifdef STANDBY_INVAL_DEBUG + if (numSharedInvalidMessagesArray > 0) + { + int i; + + elog(LOG, "numSharedInvalidMessagesArray = %d", numSharedInvalidMessagesArray); + + Assert(SharedInvalidMessagesArray != NULL); + + for (i = 0; i < numSharedInvalidMessagesArray; i++) + { + SharedInvalidationMessage *msg = SharedInvalidMessagesArray + i; + + if (msg->id >= 0) + elog(LOG, "catcache id %d", msg->id); + else if (msg->id == SHAREDINVALRELCACHE_ID) + elog(LOG, "relcache id %d", msg->id); + else if (msg->id == SHAREDINVALSMGR_ID) + elog(LOG, "smgr cache id %d", msg->id); + } + } + #endif + + if (numSharedInvalidMessagesArray > 0) + Assert(SharedInvalidMessagesArray != NULL); + + *msgs = SharedInvalidMessagesArray; + + return numSharedInvalidMessagesArray; + } /* * AtEOXact_Inval *************** *** 1041,1046 **** BeginNonTransactionalInvalidation(void) --- 1176,1217 ---- Assert(transInvalInfo->CurrentCmdInvalidMsgs.cclist == NULL); Assert(transInvalInfo->CurrentCmdInvalidMsgs.rclist == NULL); Assert(transInvalInfo->RelcacheInitFileInval == false); + + SharedInvalidMessagesArray = NULL; + numSharedInvalidMessagesArray = 0; + } + + /* + * General function to log the SharedInvalidMessagesArray. Only current + * caller is EndNonTransactionalInvalidation(), but that may change. + */ + static void + LogSharedInvalidMessagesArray(void) + { + XLogRecData rdata[2]; + xl_rel_inval xlrec; + + if (numSharedInvalidMessagesArray == 0) + return; + + START_CRIT_SECTION(); + + xlrec.nmsgs = numSharedInvalidMessagesArray; + + rdata[0].data = (char *) (&xlrec); + rdata[0].len = MinSizeOfRelationInval; + rdata[0].buffer = InvalidBuffer; + + rdata[0].next = &(rdata[1]); + rdata[1].data = (char *) SharedInvalidMessagesArray; + rdata[1].len = numSharedInvalidMessagesArray * + sizeof(SharedInvalidationMessage); + rdata[1].buffer = InvalidBuffer; + rdata[1].next = NULL; + + (void) XLogInsert(RM_RELATION_ID, XLOG_RELATION_INVAL, rdata); + + END_CRIT_SECTION(); } /* *************** *** 1081,1087 **** EndNonTransactionalInvalidation(void) --- 1252,1278 ---- ProcessInvalidationMessagesMulti(&transInvalInfo->CurrentCmdInvalidMsgs, SendSharedInvalidMessages); + /* + * Write invalidation messages to WAL. 
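LogSharedInvalidMessagesArray() above hands XLogInsert() a two-element rdata chain: a fixed-size header fragment followed by the contiguous message array. Conceptually the chain is concatenated into one record body; a standalone model of just that assembly step (toy types, hypothetical names, not the actual XLogInsert() logic):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Minimal stand-in for XLogRecData: a chain of (data, len) fragments. */
typedef struct RecData
{
    const void     *data;
    size_t          len;
    struct RecData *next;
} RecData;

/* Concatenate the fragments into one record body, as the WAL insert
 * machinery conceptually does. */
static void *assemble_record(const RecData *rd, size_t *total)
{
    size_t len = 0;
    for (const RecData *r = rd; r; r = r->next)
        len += r->len;

    char *buf = malloc(len), *p = buf;
    for (const RecData *r = rd; r; r = r->next)
    {
        memcpy(p, r->data, r->len);
        p += r->len;
    }
    *total = len;
    return buf;
}

int main(void)
{
    int     nmsgs = 2;
    int     msgs[2] = {7, 8};
    RecData r1 = { msgs, sizeof(msgs), NULL };
    RecData r0 = { &nmsgs, sizeof(nmsgs), &r1 };  /* header, then payload */

    size_t  total;
    void   *rec = assemble_record(&r0, &total);

    printf("record body: %zu bytes\n", total);
    free(rec);
    return 0;
}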
This is not required for
+ * recovery; it is only required for standby servers. It's fairly
+ * low overhead, so this is not a concern. It allows us to trigger inval
+ * messages on the standby as soon as we see these records.
+ * See relation_redo_inval().
+ *
+ * Note that transactional invalidation uses an array attached to
+ * a WAL commit record, so these messages are rare.
+ */
+ ProcessInvalidationMessagesMulti(&transInvalInfo->CurrentCmdInvalidMsgs,
+ MakeSharedInvalidMessagesArray);
+ LogSharedInvalidMessagesArray();
+
/* Clean up and release memory */
+
+ /* XXX: some questions and thoughts here:
+ * not sure where/how to allocate memory correctly in this case
+ * and how to free it afterwards. Think some more on this.
+ */
+
for (chunk = transInvalInfo->CurrentCmdInvalidMsgs.cclist;
chunk != NULL;
chunk = next)
***************
*** 1235,1237 **** CacheRegisterRelcacheCallback(RelcacheCallbackFunction func,
--- 1426,1808 ----
++relcache_callback_count;
}
+
+ /*
+ * -----------------------------------------------------
+ * Standby wait timers and backend cancel logic
+ * -----------------------------------------------------
+ */
+
+ static void
+ InitStandbyDelayTimers(int *currentDelay_ms, int *standbyWait_ms)
+ {
+ *currentDelay_ms = GetLatestReplicationDelay();
+
+ /*
+ * If replication delay is enormous, just treat that as
+ * zero and work up from there. This prevents us from acting
+ * foolishly when replaying old log files.
+ */
+ if (*currentDelay_ms < 0)
+ *currentDelay_ms = 0;
+
+ #define STANDBY_INITIAL_WAIT_MS 1
+ *standbyWait_ms = STANDBY_INITIAL_WAIT_MS;
+ }
+
+ /*
+ * Standby wait logic for ResolveRecoveryConflictWithVirtualXIDs().
+ * We wait here for a while, then return. If we decide we can't wait any
+ * more, we return true; if we can wait some more, we return false.
+ */
+ static bool
+ WaitExceedsMaxStandbyDelay(int *currentDelay_ms, int *standbyWait_ms)
+ {
+ int maxStandbyDelay_ms = maxStandbyDelay * 1000;
+
+ /*
+ * If the server is already further behind than we would
+ * like then there is no need to wait or do more complex logic.
+ * max_standby_delay < 0 means wait forever, if necessary.
+ */
+ if (maxStandbyDelay >= 0 &&
+ *currentDelay_ms > maxStandbyDelay_ms)
+ return true;
+
+ /*
+ * Sleep, then do bookkeeping.
+ */
+ pg_usleep(*standbyWait_ms * 1000L);
+ *currentDelay_ms += *standbyWait_ms;
+
+ /*
+ * Progressively increase the sleep times.
+ */
+ *standbyWait_ms *= 2;
+ if (*standbyWait_ms > 1000)
+ *standbyWait_ms = 1000;
+
+ /*
+ * Re-test our exit criteria
+ */
+ if (maxStandbyDelay >= 0 &&
+ *currentDelay_ms > maxStandbyDelay_ms)
+ return true;
+
+ return false;
+ }
+
+ void
+ ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist,
+ char *reason)
+ {
+ int standbyWait_ms;
+ int currentDelay_ms;
+ bool logged;
+ int wontDieWait = 1;
+
+ InitStandbyDelayTimers(&currentDelay_ms, &standbyWait_ms);
+ logged = false;
+
+ while (VirtualTransactionIdIsValid(*waitlist))
+ {
+ /*
+ * Log that we have been waiting for a while now...
+ */
+ if (!logged && standbyWait_ms > 500)
+ {
+ elog(trace_recovery(DEBUG5),
+ "virtual transaction %u/%u is blocking %s",
+ waitlist->backendId,
+ waitlist->localTransactionId,
+ reason);
+ logged = true;
+ }
+
+ if (ConditionalVirtualXactLockTableWait(*waitlist))
+ {
+ waitlist++;
+ InitStandbyDelayTimers(&currentDelay_ms, &standbyWait_ms);
+ logged = false;
+ }
+ else if (WaitExceedsMaxStandbyDelay(&currentDelay_ms,
+ &standbyWait_ms))
+ {
+ /*
+ * Now find out who to throw out of the balloon.
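The wait loop above sleeps in exponentially growing steps, capped at one second, and gives up once the accumulated delay passes max_standby_delay. A standalone model of just that arithmetic (the sleep itself is elided; names are hypothetical, not the patch's API):

#include <stdbool.h>
#include <stdio.h>

/* Sleep in doubling steps (capped at 1s) and report when the accumulated
 * delay exceeds max_ms. A negative max means "wait forever", matching
 * the maxStandbyDelay >= 0 guard above. */
static bool wait_exceeds_max(int *cur_ms, int *step_ms, int max_ms)
{
    if (max_ms >= 0 && *cur_ms > max_ms)
        return true;

    /* pg_usleep(*step_ms * 1000L) would go here */
    *cur_ms += *step_ms;

    *step_ms *= 2;
    if (*step_ms > 1000)
        *step_ms = 1000;

    return (max_ms >= 0 && *cur_ms > max_ms);
}

int main(void)
{
    int cur = 0, step = 1, rounds = 0;

    while (!wait_exceeds_max(&cur, &step, 5000))
        rounds++;
    printf("gave up after %d rounds, %d ms waited\n", rounds, cur);
    return 0;
}

The doubling keeps the loop cheap when the lock holder finishes quickly, while the 1s cap bounds how far past the deadline a single sleep can overshoot.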
+ */
+ int pid;
+
+ Assert(VirtualTransactionIdIsValid(*waitlist));
+ pid = VirtualTransactionIdGetPid(*waitlist);
+
+ /*
+ * Kill the pid if it's still here. If not, that's what we wanted,
+ * so ignore any errors.
+ */
+ if (pid != 0)
+ {
+ elog(LOG,
+ "recovery cancels activity of virtual transaction %u/%u pid %d "
+ "because it blocks %s (current delay now %d secs)",
+ waitlist->backendId,
+ waitlist->localTransactionId,
+ pid, reason,
+ currentDelay_ms / 1000);
+ kill(pid, SIGINT);
+
+ /* wait awhile for it to die */
+ pg_usleep(wontDieWait * 5000L);
+ wontDieWait *= 2;
+ }
+ }
+ }
+ }
+
+ /*
+ * Locking in Recovery Mode
+ *
+ * All locks are held by the Startup process using a single virtual
+ * transaction. This implementation is both simpler and, in some senses,
+ * more correct. The locks held mean "some original transaction held
+ * this lock, so query access is not allowed at this time". So the Startup
+ * process is the proxy by which the original locks are implemented.
+ *
+ * We only keep track of AccessExclusiveLocks, which are only ever held by
+ * one transaction on one relation. So we don't worry too much about keeping
+ * track of which xid holds which lock; we just track which slot holds the
+ * lock. This makes this scheme self-cleaning in case lock holders die
+ * without leaving a trace in the WAL.
+ *
+ * We keep a single dynamically expandable lock list in local memory.
+ * List elements use type xl_rel_lock, since the WAL record type exactly
+ * matches the information that we need to keep track of.
+ *
+ * We use session locks rather than normal locks so we don't need owners.
+ */
+
+ /* called by relation_redo_lock() */
+ static void
+ RelationAddRecoveryLock(xl_rel_lock *lockRequest)
+ {
+ xl_rel_lock *newlock;
+ LOCKTAG locktag;
+ MemoryContext old_context;
+
+ elog(trace_recovery(DEBUG4),
+ "adding recovery lock: db %d rel %d",
+ lockRequest->dbOid, lockRequest->relOid);
+
+ /*
+ * dbOid is InvalidOid when we are locking a shared relation.
+ */
+ Assert(OidIsValid(lockRequest->relOid));
+
+ if (RelationLockContext == NULL)
+ RelationLockContext = AllocSetContextCreate(TopMemoryContext,
+ "RelationLocks",
+ ALLOCSET_DEFAULT_MINSIZE,
+ ALLOCSET_DEFAULT_INITSIZE,
+ ALLOCSET_DEFAULT_MAXSIZE);
+
+ old_context = MemoryContextSwitchTo(RelationLockContext);
+ newlock = palloc(sizeof(xl_rel_lock));
+ MemoryContextSwitchTo(old_context);
+
+ newlock->xid = lockRequest->xid;
+ newlock->dbOid = lockRequest->dbOid;
+ newlock->relOid = lockRequest->relOid;
+ RecoveryLockList = lappend(RecoveryLockList, newlock);
+
+ /*
+ * Attempt to acquire the lock as requested.
+ */
+ SET_LOCKTAG_RELATION(locktag, newlock->dbOid, newlock->relOid);
+
+ /*
+ * Wait for the lock to clear, or kill anyone in our way. Not a
+ * completely foolproof way of getting the lock, but we cannot
+ * afford to sit and wait for the lock indefinitely. This is
+ * one reason to reduce the strength of various locks in 8.4.
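RelationAddRecoveryLock() and RelationRemoveRecoveryLocks() maintain a simple list keyed by xid, where removal with InvalidTransactionId clears everything. A standalone model of that bookkeeping (toy types, hypothetical names) using pointer-to-pointer unlinking, which sidesteps the separate deletion list the patch needs when walking a List during modification:

#include <stdio.h>
#include <stdlib.h>

typedef unsigned int TransactionId;
#define InvalidTransactionId 0

typedef struct RecoveryLock
{
    TransactionId        xid;
    unsigned int         dbOid;
    unsigned int         relOid;
    struct RecoveryLock *next;
} RecoveryLock;

static RecoveryLock *recovery_locks = NULL;

/* Record one AccessExclusiveLock seen in the WAL stream. */
static void add_lock(TransactionId xid, unsigned int db, unsigned int rel)
{
    RecoveryLock *l = malloc(sizeof(RecoveryLock));

    l->xid = xid;
    l->dbOid = db;
    l->relOid = rel;
    l->next = recovery_locks;
    recovery_locks = l;
}

/* Drop locks for one xid; InvalidTransactionId drops everything. */
static void remove_locks(TransactionId xid)
{
    RecoveryLock **p = &recovery_locks;

    while (*p)
    {
        if (xid == InvalidTransactionId || (*p)->xid == xid)
        {
            RecoveryLock *dead = *p;

            *p = dead->next;        /* LockRelease() would go here */
            free(dead);
        }
        else
            p = &(*p)->next;
    }
}

int main(void)
{
    add_lock(100, 1, 16384);
    add_lock(101, 1, 16385);
    remove_locks(100);                    /* commit/abort of xid 100 */
    remove_locks(InvalidTransactionId);   /* end of recovery */
    printf("list empty: %d\n", recovery_locks == NULL);
    return 0;
}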
+ */ + while (LockAcquire(&locktag, AccessExclusiveLock, true, true) + == LOCKACQUIRE_NOT_AVAIL) + { + VirtualTransactionId *old_lockholders; + + old_lockholders = GetLockConflicts(&locktag, AccessExclusiveLock); + ResolveRecoveryConflictWithVirtualXIDs(old_lockholders, + "exclusive locks"); + } + } + + static void + RelationRemoveRecoveryLocks(TransactionId xid) + { + ListCell *l; + LOCKTAG locktag; + List *deletionList = NIL; + + /* + * Release all matching locks and identify list elements to remove + */ + foreach(l, RecoveryLockList) + { + xl_rel_lock *lock = (xl_rel_lock *) lfirst(l); + + elog(trace_recovery(DEBUG4), + "releasing recovery lock: xid %u db %d rel %d", + lock->xid, lock->dbOid, lock->relOid); + + if (!TransactionIdIsValid(xid) || lock->xid == xid) + { + SET_LOCKTAG_RELATION(locktag, lock->dbOid, lock->relOid); + if (!LockRelease(&locktag, AccessExclusiveLock, true)) + elog(trace_recovery(LOG), + "RecoveryLockList contains entry for lock " + "no longer recorded by lock manager " + "xid %u database %d relation %d", + lock->xid, lock->dbOid, lock->relOid); + deletionList = lappend(deletionList, lock); + } + } + + /* + * Now remove the elements from RecoveryLockList. We can't navigate + * the list at the same time as deleting multiple elements from it. + */ + foreach(l, deletionList) + { + xl_rel_lock *lock = (xl_rel_lock *) lfirst(l); + + RecoveryLockList = list_delete_ptr(RecoveryLockList, lock); + pfree(lock); + } + } + + + /* + * Called during xact_commit_redo() and xact_commit_abort when InArchiveRecovery + * to remove any AccessExclusiveLocks requested by a transaction. + * + * Remove all locks for this xid from the RecoveryLockList. + */ + void + RelationReleaseRecoveryLocks(TransactionId xid) + { + RelationRemoveRecoveryLocks(xid); + } + + /* + * Called at end of recovery and when we see a shutdown checkpoint. + */ + void + RelationClearRecoveryLocks(void) + { + elog(trace_recovery(DEBUG1), "clearing recovery locks"); + RelationRemoveRecoveryLocks(InvalidTransactionId); + } + + /* + * -------------------------------------------------- + * Recovery handling for Rmgr RM_RELATION_ID + * -------------------------------------------------- + */ + + /* + * Redo for relation lock messages + */ + static void + relation_redo_lock(xl_rel_lock *xlrec) + { + RelationAddRecoveryLock(xlrec); + } + + /* + * Redo for relation invalidation messages + */ + static void + relation_redo_inval(xl_rel_inval *xlrec) + { + SharedInvalidationMessage *msgs = &(xlrec->msgs[0]); + int nmsgs = xlrec->nmsgs; + + Assert(nmsgs > 0); /* else we should not have written a record */ + + /* + * Smack them straight onto the queue and we're done. This is safe + * because the only writer of these messages is non-transactional + * invalidation. 
+ */ + SendSharedInvalidMessages(msgs, nmsgs); + } + + void + relation_redo(XLogRecPtr lsn, XLogRecord *record) + { + uint8 info = record->xl_info & ~XLR_INFO_MASK; + + if (info == XLOG_RELATION_INVAL) + { + xl_rel_inval *xlrec = (xl_rel_inval *) XLogRecGetData(record); + + relation_redo_inval(xlrec); + } + else if (info == XLOG_RELATION_LOCK) + { + xl_rel_lock *xlrec = (xl_rel_lock *) XLogRecGetData(record); + + relation_redo_lock(xlrec); + } + else + elog(PANIC, "relation_redo: unknown op code %u", info); + } + + static void + relation_desc_inval(StringInfo buf, xl_rel_inval *xlrec) + { + SharedInvalidationMessage *msgs = &(xlrec->msgs[0]); + int nmsgs = xlrec->nmsgs; + + appendStringInfo(buf, "nmsgs %d;", nmsgs); + + if (nmsgs > 0) + { + int i; + + for (i = 0; i < nmsgs; i++) + { + SharedInvalidationMessage *msg = msgs + i; + + if (msg->id >= 0) + appendStringInfo(buf, "catcache id %d", msg->id); + else if (msg->id == SHAREDINVALRELCACHE_ID) + appendStringInfo(buf, "relcache "); + else if (msg->id == SHAREDINVALSMGR_ID) + appendStringInfo(buf, "smgr "); + } + } + } + + void + relation_desc(StringInfo buf, uint8 xl_info, char *rec) + { + uint8 info = xl_info & ~XLR_INFO_MASK; + + if (info == XLOG_RELATION_INVAL) + { + xl_rel_inval *xlrec = (xl_rel_inval *) rec; + + appendStringInfo(buf, "inval: "); + relation_desc_inval(buf, xlrec); + } + else if (info == XLOG_RELATION_LOCK) + { + xl_rel_lock *xlrec = (xl_rel_lock *) rec; + + appendStringInfo(buf, "exclusive relation lock: xid %u db %d rel %d", + xlrec->xid, xlrec->dbOid, xlrec->relOid); + } + else + appendStringInfo(buf, "UNKNOWN"); + } *** src/backend/utils/error/elog.c --- src/backend/utils/error/elog.c *************** *** 2579,2581 **** is_log_level_output(int elevel, int log_min_level) --- 2579,2598 ---- return false; } + + /* + * If trace_recovery_messages is set to make this visible, then show as LOG, + * else display as whatever level is set. It may still be shown, but only + * if log_min_messages is set lower than trace_recovery_messages. + * + * Intention is to keep this for at least the whole of the 8.4 production + * release, so we can more easily diagnose production problems in the field. + */ + int + trace_recovery(int trace_level) + { + if (trace_level >= trace_recovery_messages) + return LOG; + + return trace_level; + } *** src/backend/utils/init/flatfiles.c --- src/backend/utils/init/flatfiles.c *************** *** 678,686 **** write_auth_file(Relation rel_authid, Relation rel_authmem) /* * This routine is called once during database startup, after completing * WAL replay if needed. Its purpose is to sync the flat files with the ! * current state of the database tables. This is particularly important ! * during PITR operation, since the flat files will come from the ! * base backup which may be far out of sync with the current state. * * In theory we could skip rebuilding the flat files if no WAL replay * occurred, but it seems best to just do it always. We have to --- 678,687 ---- /* * This routine is called once during database startup, after completing * WAL replay if needed. Its purpose is to sync the flat files with the ! * current state of the database tables. ! * ! * In 8.4 we also run this during xact_redo_commit() if the transaction ! * wrote a new database or auth flat file. * * In theory we could skip rebuilding the flat files if no WAL replay * occurred, but it seems best to just do it always. 
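trace_recovery() above is a pure mapping from a requested elog level to an effective one. A standalone demonstration (the numeric level values here are assumptions in elog.h's ascending order; names hypothetical):

#include <stdio.h>

/* Message levels in ascending severity order (values assumed). */
enum { DEBUG5 = 10, DEBUG4, DEBUG3, DEBUG2, DEBUG1, LOG = 15 };

static int trace_recovery_messages = DEBUG2;   /* the GUC, hypothetically set */

/* Promote messages at or above the configured level to LOG so they
 * reach the server log; leave quieter messages alone. */
static int trace_recovery(int trace_level)
{
    if (trace_level >= trace_recovery_messages)
        return LOG;
    return trace_level;
}

int main(void)
{
    printf("DEBUG4 -> %d (unchanged)\n", trace_recovery(DEBUG4));
    printf("DEBUG2 -> %d (promoted to LOG)\n", trace_recovery(DEBUG2));
    printf("DEBUG1 -> %d (promoted to LOG)\n", trace_recovery(DEBUG1));
    return 0;
}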
We have to *************** *** 716,723 **** BuildFlatFiles(bool database_only) /* * We don't have any hope of running a real relcache, but we can use the * same fake-relcache facility that WAL replay uses. - * - * No locking is needed because no one else is alive yet. */ rel_db = CreateFakeRelcacheEntry(rnode); write_database_file(rel_db, true); --- 717,722 ---- *************** *** 832,845 **** AtEOXact_UpdateFlatFiles(bool isCommit) /* Okay to write the files */ if (database_file_update_subid != InvalidSubTransactionId) { ! database_file_update_subid = InvalidSubTransactionId; write_database_file(drel, false); heap_close(drel, NoLock); } if (auth_file_update_subid != InvalidSubTransactionId) { ! auth_file_update_subid = InvalidSubTransactionId; write_auth_file(arel, mrel); heap_close(arel, NoLock); heap_close(mrel, NoLock); --- 831,844 ---- /* Okay to write the files */ if (database_file_update_subid != InvalidSubTransactionId) { ! /* reset database_file_update_subid later during commit */ write_database_file(drel, false); heap_close(drel, NoLock); } if (auth_file_update_subid != InvalidSubTransactionId) { ! /* reset auth_file_update_subid later during commit */ write_auth_file(arel, mrel); heap_close(arel, NoLock); heap_close(mrel, NoLock); *************** *** 859,864 **** AtEOXact_UpdateFlatFiles(bool isCommit) --- 858,887 ---- ForceSyncCommit(); } + /* + * Exported to allow transaction commit to set flags to perform flat file + * update in redo. Reset per-transaction flags. For abort case they were + * already set during AtEOXact_UpdateFlatFiles(). + */ + bool + AtEOXact_Database_FlatFile_Update_Needed(void) + { + bool result = TransactionIdIsValid(database_file_update_subid); + + database_file_update_subid = InvalidSubTransactionId; + + return result; + } + + bool + AtEOXact_Auth_FlatFile_Update_Needed(void) + { + bool result = TransactionIdIsValid(auth_file_update_subid); + + auth_file_update_subid = InvalidSubTransactionId; + + return result; + } /* * This routine is called during transaction prepare. *** src/backend/utils/init/postinit.c --- src/backend/utils/init/postinit.c *************** *** 440,446 **** InitPostgres(const char *in_dbname, Oid dboid, const char *username, */ MyBackendId = InvalidBackendId; ! SharedInvalBackendInit(); if (MyBackendId > MaxBackends || MyBackendId <= 0) elog(FATAL, "bad backend id: %d", MyBackendId); --- 440,446 ---- */ MyBackendId = InvalidBackendId; ! SharedInvalBackendInit(false); if (MyBackendId > MaxBackends || MyBackendId <= 0) elog(FATAL, "bad backend id: %d", MyBackendId); *************** *** 489,497 **** InitPostgres(const char *in_dbname, Oid dboid, const char *username, --- 489,503 ---- * Start a new transaction here before first access to db, and get a * snapshot. We don't have a use for the snapshot itself, but we're * interested in the secondary effect that it sets RecentGlobalXmin. + * If we are connecting during recovery, make sure the initial + * transaction is read only and force all subsequent transactions + * that way also. */ if (!bootstrap) { + if (IsRecoveryProcessingMode()) + SetConfigOption("default_transaction_read_only", "true", + PGC_POSTMASTER, PGC_S_OVERRIDE); StartTransactionCommand(); (void) GetTransactionSnapshot(); } *************** *** 515,521 **** InitPostgres(const char *in_dbname, Oid dboid, const char *username, */ if (!bootstrap) LockSharedObject(DatabaseRelationId, MyDatabaseId, 0, ! 
RowExclusiveLock); /* * Recheck the flat file copy of pg_database to make sure the target --- 521,527 ---- */ if (!bootstrap) LockSharedObject(DatabaseRelationId, MyDatabaseId, 0, ! (IsRecoveryProcessingMode() ? AccessShareLock : RowExclusiveLock)); /* * Recheck the flat file copy of pg_database to make sure the target *** src/backend/utils/misc/guc.c --- src/backend/utils/misc/guc.c *************** *** 114,119 **** extern char *temp_tablespaces; --- 114,121 ---- extern bool synchronize_seqscans; extern bool fullPageWrites; + int trace_recovery_messages = DEBUG1; + #ifdef TRACE_SORT extern bool trace_sort; #endif *************** *** 2609,2614 **** static struct config_enum ConfigureNamesEnum[] = --- 2611,2626 ---- }, { + {"trace_recovery_messages", PGC_SUSET, LOGGING_WHEN, + gettext_noop("Sets the message levels that are logged during recovery."), + gettext_noop("Each level includes all the levels that follow it. The later" + " the level, the fewer messages are sent.") + }, + &trace_recovery_messages, + DEBUG1, server_message_level_options, NULL, NULL + }, + + { {"track_functions", PGC_SUSET, STATS_COLLECTOR, gettext_noop("Collects function-level statistics on database activity."), NULL *************** *** 5475,5482 **** ExecSetVariableStmt(VariableSetStmt *stmt) --- 5487,5505 ---- SetPGVariable("transaction_isolation", list_make1(item->arg), stmt->is_local); else if (strcmp(item->defname, "transaction_read_only") == 0) + { + A_Const *con; + + Assert(IsA(item->arg, A_Const)); + con = (A_Const *) item->arg; + Assert(nodeTag(&con->val) == T_Integer); + + if (!intVal(&con->val)) + PreventCommandDuringRecovery(); + SetPGVariable("transaction_read_only", list_make1(item->arg), stmt->is_local); + } else elog(ERROR, "unexpected SET TRANSACTION element: %s", item->defname); *************** *** 5494,5501 **** ExecSetVariableStmt(VariableSetStmt *stmt) --- 5517,5535 ---- SetPGVariable("default_transaction_isolation", list_make1(item->arg), stmt->is_local); else if (strcmp(item->defname, "transaction_read_only") == 0) + { + A_Const *con; + + Assert(IsA(item->arg, A_Const)); + con = (A_Const *) item->arg; + Assert(nodeTag(&con->val) == T_Integer); + + if (!intVal(&con->val)) + PreventCommandDuringRecovery(); + SetPGVariable("default_transaction_read_only", list_make1(item->arg), stmt->is_local); + } else elog(ERROR, "unexpected SET SESSION element: %s", item->defname); *** src/backend/utils/time/tqual.c --- src/backend/utils/time/tqual.c *************** *** 86,92 **** static inline void SetHintBits(HeapTupleHeader tuple, Buffer buffer, uint16 infomask, TransactionId xid) { ! if (TransactionIdIsValid(xid)) { /* NB: xid must be known committed here! */ XLogRecPtr commitLSN = TransactionIdGetCommitLSN(xid); --- 86,92 ---- SetHintBits(HeapTupleHeader tuple, Buffer buffer, uint16 infomask, TransactionId xid) { ! if (!IsRecoveryProcessingMode() && TransactionIdIsValid(xid)) { /* NB: xid must be known committed here! */ XLogRecPtr commitLSN = TransactionIdGetCommitLSN(xid); *************** *** 1238,1263 **** XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot) return true; /* ! * If the snapshot contains full subxact data, the fastest way to check ! * things is just to compare the given XID against both subxact XIDs and ! * top-level XIDs. If the snapshot overflowed, we have to use pg_subtrans ! * to convert a subxact XID to its parent XID, but then we need only look ! * at top-level XIDs not subxacts. */ - if (snapshot->subxcnt >= 0) - { - /* full data, so search subxip */ - int32 j; ! 
for (j = 0; j < snapshot->subxcnt; j++)
! {
! if (TransactionIdEquals(xid, snapshot->subxip[j]))
return true;
}
! /* not there, fall through to search xip[] */
! }
! else
{
/* overflowed, so convert xid to top-level */
xid = SubTransGetTopmostTransaction(xid);
--- 1238,1289 ----
return true;
/*
! * Our strategy for checking xids changed in 8.4. Prior to 8.4
! * we either checked the subxid cache on the snapshot or we
! * checked subtrans. That was much more efficient than just using
! * subtrans, but it had some problems. First, as soon as *any*
! * transaction had more than 64 subtransactions we forced *all*
! * snapshots to check against subtrans, giving a sharp modal
! * change in behaviour. Second, because we either checked subtrans
! * or the snapshot, we were forced to place entries in subtrans
! * in case the snapshot later overflowed, even if we never
! * actually checked subtrans.
! *
! * In 8.4 we improve on that scheme in a number of ways. As before,
! * we check subtrans if the snapshot has overflowed. We *also*
! * check the subxid cache. This has two benefits: first, the
! * behaviour degrades gracefully when the cache overflows, so we
! * retain much of its benefit if it has only just overflowed.
! * Second, a transaction doesn't need to insert entries into
! * subtrans until its own personal subxid cache overflows. This
! * means entries into subtrans become significantly rarer,
! * perhaps less than 1% of the previous insert rate, giving
! * considerable benefit for transactions using only a few
! * subtransactions.
! *
! * This behaviour is also necessary for allowing snapshots to work
! * correctly on a standby server. By this subtle change of behaviour
! * we can now utilise the subxid cache to store "unobserved xids",
! * whose existence we can infer by watching the arrival sequence
! * of newly observed transaction ids in the WAL.
! */
! /*
! * First, compare the given XID against cached subxact XIDs.
! */
! for (i = 0; i < snapshot->subxcnt; i++)
! {
! if (TransactionIdEquals(xid, snapshot->subxip[i]))
return true;
}
! /*
! * If the snapshot overflowed and we haven't already located the xid,
! * we also have to consult pg_subtrans. We use subtrans to convert a
! * subxact XID to its topmost XID, so that we can then check the status
! * of the top-level TransactionId.
! */
! if (snapshot->suboverflowed)
{
/* overflowed, so convert xid to top-level */
xid = SubTransGetTopmostTransaction(xid);
***************
*** 1270,1275 **** XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot)
--- 1296,1305 ----
return false;
}
+ /*
+ * By now xid is either not present, or a top-level xid. So now
+ * we just need to check the main transaction ids.
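The new XidInMVCCSnapshot() logic can be condensed to three steps: scan the subxid cache, consult subtrans only when the cache overflowed, then scan the main xid array. A standalone sketch with a toy subtrans mapping (hypothetical names and data layout, not the server's structures):

#include <stdbool.h>
#include <stdio.h>

typedef unsigned int TransactionId;

/* Cut-down snapshot: main xids plus a possibly-overflowed subxid cache. */
typedef struct Snapshot
{
    TransactionId *xip;     int xcnt;
    TransactionId *subxip;  int subxcnt;
    bool           suboverflowed;
} Snapshot;

/* Stand-in for SubTransGetTopmostTransaction(): toy mapping for the demo. */
static TransactionId subtrans_topmost(TransactionId xid)
{
    return (xid >= 100) ? xid - 100 : xid;
}

static bool xid_in_snapshot(TransactionId xid, const Snapshot *snap)
{
    /* 1. Always scan the subxid cache first. */
    for (int i = 0; i < snap->subxcnt; i++)
        if (snap->subxip[i] == xid)
            return true;

    /* 2. Only if the cache overflowed, map the xid to its topmost parent. */
    if (snap->suboverflowed)
        xid = subtrans_topmost(xid);

    /* 3. Finally check the main xid array. */
    for (int i = 0; i < snap->xcnt; i++)
        if (snap->xip[i] == xid)
            return true;
    return false;
}

int main(void)
{
    TransactionId xip[] = {5};
    TransactionId subxip[] = {7};
    Snapshot snap = { xip, 1, subxip, 1, true };

    printf("%d\n", xid_in_snapshot(7, &snap));    /* 1: cached subxact */
    printf("%d\n", xid_in_snapshot(105, &snap));  /* 1: maps to top xid 5 */
    printf("%d\n", xid_in_snapshot(6, &snap));    /* 0 */
    return 0;
}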
+ */ for (i = 0; i < snapshot->xcnt; i++) { if (TransactionIdEquals(xid, snapshot->xip[i])) *** src/bin/pg_controldata/pg_controldata.c --- src/bin/pg_controldata/pg_controldata.c *************** *** 197,202 **** main(int argc, char *argv[]) --- 197,205 ---- printf(_("Minimum recovery ending location: %X/%X\n"), ControlFile.minRecoveryPoint.xlogid, ControlFile.minRecoveryPoint.xrecoff); + printf(_("Minimum safe starting location: %X/%X\n"), + ControlFile.minSafeStartPoint.xlogid, + ControlFile.minSafeStartPoint.xrecoff); printf(_("Maximum data alignment: %u\n"), ControlFile.maxAlign); /* we don't print floatFormat since can't say much useful about it */ *** src/bin/pg_resetxlog/pg_resetxlog.c --- src/bin/pg_resetxlog/pg_resetxlog.c *************** *** 603,608 **** RewriteControlFile(void) --- 603,610 ---- ControlFile.prevCheckPoint.xrecoff = 0; ControlFile.minRecoveryPoint.xlogid = 0; ControlFile.minRecoveryPoint.xrecoff = 0; + ControlFile.minSafeStartPoint.xlogid = 0; + ControlFile.minSafeStartPoint.xrecoff = 0; /* Now we can force the recorded xlog seg size to the right thing. */ ControlFile.xlog_seg_size = XLogSegSize; *** src/include/access/heapam.h --- src/include/access/heapam.h *************** *** 130,140 **** extern void heap2_desc(StringInfo buf, uint8 xl_info, char *rec); extern XLogRecPtr log_heap_move(Relation reln, Buffer oldbuf, ItemPointerData from, Buffer newbuf, HeapTuple newtup); extern XLogRecPtr log_heap_clean(Relation reln, Buffer buffer, OffsetNumber *redirected, int nredirected, OffsetNumber *nowdead, int ndead, OffsetNumber *nowunused, int nunused, ! bool redirect_move); extern XLogRecPtr log_heap_freeze(Relation reln, Buffer buffer, TransactionId cutoff_xid, OffsetNumber *offsets, int offcnt); --- 130,142 ---- extern XLogRecPtr log_heap_move(Relation reln, Buffer oldbuf, ItemPointerData from, Buffer newbuf, HeapTuple newtup); + extern XLogRecPtr log_heap_cleanup_info(RelFileNode rnode, + TransactionId latestRemovedXid); extern XLogRecPtr log_heap_clean(Relation reln, Buffer buffer, OffsetNumber *redirected, int nredirected, OffsetNumber *nowdead, int ndead, OffsetNumber *nowunused, int nunused, ! TransactionId latestRemovedXid, bool redirect_move); extern XLogRecPtr log_heap_freeze(Relation reln, Buffer buffer, TransactionId cutoff_xid, OffsetNumber *offsets, int offcnt); *** src/include/access/htup.h --- src/include/access/htup.h *************** *** 580,585 **** typedef HeapTupleData *HeapTuple; --- 580,586 ---- #define XLOG_HEAP2_FREEZE 0x00 #define XLOG_HEAP2_CLEAN 0x10 #define XLOG_HEAP2_CLEAN_MOVE 0x20 + #define XLOG_HEAP2_CLEANUP_INFO 0x30 /* * All what we need to find changed tuple *************** *** 668,673 **** typedef struct xl_heap_clean --- 669,675 ---- { RelFileNode node; BlockNumber block; + TransactionId latestRemovedXid; uint16 nredirected; uint16 ndead; /* OFFSET NUMBERS FOLLOW */ *************** *** 675,680 **** typedef struct xl_heap_clean --- 677,695 ---- #define SizeOfHeapClean (offsetof(xl_heap_clean, ndead) + sizeof(uint16)) + /* + * Cleanup_info is required in some cases during a lazy VACUUM. 
+ * Used for reporting the results of HeapTupleHeaderAdvanceLatestRemovedXid() + * see vacuumlazy.c for full explanation + */ + typedef struct xl_heap_cleanup_info + { + RelFileNode node; + TransactionId latestRemovedXid; + } xl_heap_cleanup_info; + + #define SizeOfHeapCleanupInfo (sizeof(xl_heap_cleanup_info)) + /* This is for replacing a page's contents in toto */ /* NB: this is used for indexes as well as heaps */ typedef struct xl_heap_newpage *************** *** 718,723 **** typedef struct xl_heap_freeze --- 733,741 ---- #define SizeOfHeapFreeze (offsetof(xl_heap_freeze, cutoff_xid) + sizeof(TransactionId)) + extern void HeapTupleHeaderAdvanceLatestRemovedXid(HeapTupleHeader tuple, + TransactionId *latestRemovedXid); + /* HeapTupleHeader functions implemented in utils/time/combocid.c */ extern CommandId HeapTupleHeaderGetCmin(HeapTupleHeader tup); extern CommandId HeapTupleHeaderGetCmax(HeapTupleHeader tup); *** src/include/access/nbtree.h --- src/include/access/nbtree.h *************** *** 214,225 **** typedef struct BTMetaPageData #define XLOG_BTREE_SPLIT_R 0x40 /* as above, new item on right */ #define XLOG_BTREE_SPLIT_L_ROOT 0x50 /* add tuple with split of root */ #define XLOG_BTREE_SPLIT_R_ROOT 0x60 /* as above, new item on right */ ! #define XLOG_BTREE_DELETE 0x70 /* delete leaf index tuple */ #define XLOG_BTREE_DELETE_PAGE 0x80 /* delete an entire page */ #define XLOG_BTREE_DELETE_PAGE_META 0x90 /* same, and update metapage */ #define XLOG_BTREE_NEWROOT 0xA0 /* new root page */ #define XLOG_BTREE_DELETE_PAGE_HALF 0xB0 /* page deletion that makes * parent half-dead */ /* * All that we need to find changed index tuple --- 214,226 ---- #define XLOG_BTREE_SPLIT_R 0x40 /* as above, new item on right */ #define XLOG_BTREE_SPLIT_L_ROOT 0x50 /* add tuple with split of root */ #define XLOG_BTREE_SPLIT_R_ROOT 0x60 /* as above, new item on right */ ! #define XLOG_BTREE_DELETE 0x70 /* delete leaf index tuples for a page */ #define XLOG_BTREE_DELETE_PAGE 0x80 /* delete an entire page */ #define XLOG_BTREE_DELETE_PAGE_META 0x90 /* same, and update metapage */ #define XLOG_BTREE_NEWROOT 0xA0 /* new root page */ #define XLOG_BTREE_DELETE_PAGE_HALF 0xB0 /* page deletion that makes * parent half-dead */ + #define XLOG_BTREE_VACUUM 0xC0 /* delete entries on a page during vacuum */ /* * All that we need to find changed index tuple *************** *** 306,321 **** typedef struct xl_btree_split /* * This is what we need to know about delete of individual leaf index tuples. * The WAL record can represent deletion of any number of index tuples on a ! * single index page. */ typedef struct xl_btree_delete { RelFileNode node; BlockNumber block; /* TARGET OFFSET NUMBERS FOLLOW AT THE END */ } xl_btree_delete; ! #define SizeOfBtreeDelete (offsetof(xl_btree_delete, block) + sizeof(BlockNumber)) /* * This is what we need to know about deletion of a btree page. The target --- 307,359 ---- /* * This is what we need to know about delete of individual leaf index tuples. * The WAL record can represent deletion of any number of index tuples on a ! * single index page when *not* executed by VACUUM. */ typedef struct xl_btree_delete { RelFileNode node; BlockNumber block; + TransactionId latestRemovedXid; + int numItems; /* number of items in the offset array */ + /* TARGET OFFSET NUMBERS FOLLOW AT THE END */ } xl_btree_delete; ! #define SizeOfBtreeDelete (offsetof(xl_btree_delete, latestRemovedXid) + sizeof(TransactionId)) ! ! /* ! * This is what we need to know about vacuum of individual leaf index tuples. ! 
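The xl_btree_vacuum record described in the notes that follow compresses the set of blocks needing EnsureBlockUnpinned() into block ranges rather than an explicit array of block ids. A standalone sketch of that compression over a sorted block list (hypothetical names, not the patch's encoding):

#include <stdio.h>

typedef unsigned int BlockNumber;

typedef struct BlockRange
{
    BlockNumber first;
    BlockNumber last;
} BlockRange;

/* Collapse a sorted list of block numbers into contiguous ranges;
 * returns the number of ranges written to out[]. */
static int compress_blocks(const BlockNumber *blocks, int n, BlockRange *out)
{
    int nranges = 0;

    for (int i = 0; i < n; i++)
    {
        if (nranges > 0 && blocks[i] == out[nranges - 1].last + 1)
            out[nranges - 1].last = blocks[i];   /* extend current range */
        else
        {
            out[nranges].first = out[nranges].last = blocks[i];
            nranges++;
        }
    }
    return nranges;
}

int main(void)
{
    BlockNumber blocks[] = {1, 2, 3, 7, 8, 20};
    BlockRange  ranges[6];
    int         n = compress_blocks(blocks, 6, ranges);

    for (int i = 0; i < n; i++)
        printf("%u..%u\n", ranges[i].first, ranges[i].last); /* 1..3 7..8 20..20 */
    return 0;
}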
* The WAL record can represent deletion of any number of index tuples on a ! * single index page when executed by VACUUM. ! * ! * The correctness requirement for applying these changes during recovery is ! * that we must do one of these two things for every block in the index: ! * * lock the block for cleanup and apply any required changes ! * * EnsureBlockUnpinned() ! * The purpose of this is to ensure that no index scans started before we ! * finish scanning the index are still running by the time we begin to remove ! * heap tuples. ! * ! * Any changes to any one block are registered on just one WAL record. All ! * blocks that we need to run EnsureBlockUnpinned() before we touch the changed ! * block are also given on this record as a variable length array. The array ! * is compressed by way of storing an array of block ranges, rather than an ! * actual array of blockids. ! * ! * Note that the *last* WAL record in any vacuum of an index is allowed to ! * have numItems == 0. All other WAL records must have numItems > 0. ! */ ! typedef struct xl_btree_vacuum ! { ! RelFileNode node; ! BlockNumber block; ! BlockNumber lastBlockVacuumed; ! int numItems; /* number of items in the offset array */ ! ! /* TARGET OFFSET NUMBERS FOLLOW */ ! } xl_btree_vacuum; ! ! #define SizeOfBtreeVacuum (offsetof(xl_btree_vacuum, lastBlockVacuumed) + sizeof(BlockNumber)) /* * This is what we need to know about deletion of a btree page. The target *************** *** 498,503 **** typedef BTScanOpaqueData *BTScanOpaque; --- 536,545 ---- #define SK_BT_DESC (INDOPTION_DESC << SK_BT_INDOPTION_SHIFT) #define SK_BT_NULLS_FIRST (INDOPTION_NULLS_FIRST << SK_BT_INDOPTION_SHIFT) + /* XXX probably needs new RMgr call to do this cleanly */ + extern bool btree_is_cleanup_record(uint8 info); + extern bool btree_needs_cleanup_lock(uint8 info); + /* * prototypes for functions in nbtree.c (external entry points for btree) */ *************** *** 537,543 **** extern void _bt_relbuf(Relation rel, Buffer buf); extern void _bt_pageinit(Page page, Size size); extern bool _bt_page_recyclable(Page page); extern void _bt_delitems(Relation rel, Buffer buf, ! OffsetNumber *itemnos, int nitems); extern int _bt_pagedel(Relation rel, Buffer buf, BTStack stack, bool vacuum_full); --- 579,586 ---- extern void _bt_pageinit(Page page, Size size); extern bool _bt_page_recyclable(Page page); extern void _bt_delitems(Relation rel, Buffer buf, ! OffsetNumber *itemnos, int nitems, bool isVacuum, ! BlockNumber lastBlockVacuumed); extern int _bt_pagedel(Relation rel, Buffer buf, BTStack stack, bool vacuum_full); *** src/include/access/rmgr.h --- src/include/access/rmgr.h *************** *** 23,28 **** typedef uint8 RmgrId; --- 23,29 ---- #define RM_DBASE_ID 4 #define RM_TBLSPC_ID 5 #define RM_MULTIXACT_ID 6 + #define RM_RELATION_ID 8 #define RM_HEAP2_ID 9 #define RM_HEAP_ID 10 #define RM_BTREE_ID 11 *** src/include/access/xact.h --- src/include/access/xact.h *************** *** 17,22 **** --- 17,23 ---- #include "access/xlog.h" #include "nodes/pg_list.h" #include "storage/relfilenode.h" + #include "utils/snapshot.h" #include "utils/timestamp.h" *************** *** 84,111 **** typedef void (*SubXactCallback) (SubXactEvent event, SubTransactionId mySubid, #define XLOG_XACT_ABORT 0x20 #define XLOG_XACT_COMMIT_PREPARED 0x30 #define XLOG_XACT_ABORT_PREPARED 0x40 typedef struct xl_xact_commit { ! TimestampTz xact_time; /* time of commit */ ! int nrels; /* number of RelFileNodes */ ! int nsubxacts; /* number of subtransaction XIDs */ ! 
/* Array of RelFileNode(s) to drop at commit */
! RelFileNode xnodes[1]; /* VARIABLE LENGTH ARRAY */
! /* ARRAY OF COMMITTED SUBTRANSACTION XIDs FOLLOWS */
} xl_xact_commit;
#define MinSizeOfXactCommit offsetof(xl_xact_commit, xnodes)
typedef struct xl_xact_abort
{
TimestampTz xact_time; /* time of abort */
int nrels; /* number of RelFileNodes */
int nsubxacts; /* number of subtransaction XIDs */
/* Array of RelFileNode(s) to drop at abort */
RelFileNode xnodes[1]; /* VARIABLE LENGTH ARRAY */
/* ARRAY OF ABORTED SUBTRANSACTION XIDs FOLLOWS */
} xl_xact_abort;
#define MinSizeOfXactAbort offsetof(xl_xact_abort, xnodes)
--- 85,159 ----
#define XLOG_XACT_ABORT 0x20
#define XLOG_XACT_COMMIT_PREPARED 0x30
#define XLOG_XACT_ABORT_PREPARED 0x40
+ #define XLOG_XACT_ASSIGNMENT 0x50
+ #define XLOG_XACT_RUNNING_XACTS 0x60
+ /* 0x70 can also be used, if required */
+
+ typedef struct xl_xact_assignment
+ {
+ TransactionId xassign; /* assigned xid */
+ TransactionId xparent; /* assigned xid's parent, if any */
+ bool isSubXact; /* is a subtransaction */
+ } xl_xact_assignment;
+
+ /*
+ * xl_xact_running_xacts is in utils/snapshot.h so it can be passed
+ * around to the same places as snapshots; not snapmgr.h.
+ */
typedef struct xl_xact_commit
{
! TimestampTz xact_time; /* time of commit */
! uint32 xinfo; /* info flags */
! int nrels; /* number of RelFileNodes */
! int nsubxacts; /* number of subtransaction XIDs */
! int nmsgs; /* number of shared inval msgs */
! /* Array of RelFileNode(s) to drop at commit */
! RelFileNode xnodes[1]; /* VARIABLE LENGTH ARRAY */
! /* ARRAY OF COMMITTED SUBTRANSACTION XIDs FOLLOWS */
! /* ARRAY OF SHARED INVALIDATION MESSAGES FOLLOWS */
} xl_xact_commit;
#define MinSizeOfXactCommit offsetof(xl_xact_commit, xnodes)
+ #define OffsetSharedInvalInXactCommit() \
+ ( \
+ MinSizeOfXactCommit + \
+ (xlrec->nsubxacts * sizeof(TransactionId)) + \
+ (xlrec->nrels * sizeof(RelFileNode)) \
+ )
+
+ /*
+ * These flags are set in the xinfo fields of transaction
+ * completion WAL records. They indicate a number of actions
+ * that need to occur when emulating transaction completion.
+ * They are named XactCompletion... to differentiate them from
+ * the EOXact... routines, which run at the end of the original
+ * transaction completion.
+ */
+ #define XACT_COMPLETION_UNMARKED_SUBXIDS 0x01
+
+ /* These next states only occur on commit record types */
+ #define XACT_COMPLETION_UPDATE_DB_FILE 0x02
+ #define XACT_COMPLETION_UPDATE_AUTH_FILE 0x04
+ #define XACT_COMPLETION_UPDATE_RELCACHE_FILE 0x08
+
+ /* Access macros for above flags */
+ #define XactCompletionHasUnMarkedSubxids(xlrec) ((xlrec)->xinfo & XACT_COMPLETION_UNMARKED_SUBXIDS)
+ #define XactCompletionUpdateDBFile(xlrec) ((xlrec)->xinfo & XACT_COMPLETION_UPDATE_DB_FILE)
+ #define XactCompletionUpdateAuthFile(xlrec) ((xlrec)->xinfo & XACT_COMPLETION_UPDATE_AUTH_FILE)
+ #define XactCompletionRelcacheInitFileInval(xlrec) ((xlrec)->xinfo & XACT_COMPLETION_UPDATE_RELCACHE_FILE)
typedef struct xl_xact_abort
{
TimestampTz xact_time; /* time of abort */
+ uint32 xinfo; /* info flags */
int nrels; /* number of RelFileNodes */
int nsubxacts; /* number of subtransaction XIDs */
/* Array of RelFileNode(s) to drop at abort */
RelFileNode xnodes[1]; /* VARIABLE LENGTH ARRAY */
/* ARRAY OF ABORTED SUBTRANSACTION XIDs FOLLOWS */
} xl_xact_abort;
+ /* Note the intentional lack of an invalidation message array; cf.
commit */
#define MinSizeOfXactAbort offsetof(xl_xact_abort, xnodes)
***************
*** 185,190 **** extern TransactionId RecordTransactionCommit(void);
--- 233,245 ----
extern int xactGetCommittedChildren(TransactionId **ptr);
+ extern void LogCurrentRunningXacts(void);
+ extern bool IsRunningXactDataValid(void);
+
+ extern void InitRecoveryTransactionEnvironment(void);
+ extern void XactResolveRecoveryConflicts(TransactionId latestRemovedXid, Oid recDatabaseOid);
+ extern void RecordKnownAssignedTransactionIds(XLogRecPtr lsn, XLogRecord *record);
+
extern void xact_redo(XLogRecPtr lsn, XLogRecord *record);
extern void xact_desc(StringInfo buf, uint8 xl_info, char *rec);
*** src/include/access/xlog.h
--- src/include/access/xlog.h
***************
*** 46,55 **** typedef struct XLogRecord
TransactionId xl_xid; /* xact id */
uint32 xl_tot_len; /* total len of entire record */
uint32 xl_len; /* total len of rmgr data */
! uint8 xl_info; /* flag bits, see below */
RmgrId xl_rmid; /* resource manager for this record */
! /* Depending on MAXALIGN, there are either 2 or 6 wasted bytes here */
/* ACTUAL LOG DATA FOLLOWS AT END OF STRUCT */
--- 46,56 ----
TransactionId xl_xid; /* xact id */
uint32 xl_tot_len; /* total len of entire record */
uint32 xl_len; /* total len of rmgr data */
! uint8 xl_info; /* flag bits, see below (XLR_ entries) */
RmgrId xl_rmid; /* resource manager for this record */
+ TransactionId xl_parentxid; /* parent_xid if XLR2_FIRST_SUBXID_RECORD is set */
! /* XXX Above structure has 8-byte alignment */
/* ACTUAL LOG DATA FOLLOWS AT END OF STRUCT */
***************
*** 133,139 **** typedef struct XLogRecData
} XLogRecData;
extern TimeLineID ThisTimeLineID; /* current TLI */
! extern bool InRecovery;
extern XLogRecPtr XactLastRecEnd;
/* these variables are GUC parameters related to XLOG */
--- 134,148 ----
} XLogRecData;
extern TimeLineID ThisTimeLineID; /* current TLI */
! /*
! * Prior to 8.4, all activity during recovery was carried out by the Startup
! * process. This local variable continues to be used in many parts of the
! * code to indicate actions taken by RecoveryManagers. Other processes that
! * potentially perform work during recovery should check
! * IsRecoveryProcessingMode(); see the XLogCtl notes in xlog.c.
! */
! extern bool InRecovery;
!
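The XactCompletion flags above are plain bit tests on the commit record's xinfo word. A standalone illustration of the intended set/test usage (the surrounding calls named in the comments are assumptions about where this happens, not code from the patch):

#include <stdio.h>

typedef unsigned int uint32;

/* Flag bits as defined for the commit record's xinfo field above. */
#define XACT_COMPLETION_UNMARKED_SUBXIDS     0x01
#define XACT_COMPLETION_UPDATE_DB_FILE       0x02
#define XACT_COMPLETION_UPDATE_AUTH_FILE     0x04
#define XACT_COMPLETION_UPDATE_RELCACHE_FILE 0x08

int main(void)
{
    uint32 xinfo = 0;

    /* What the committing backend would record if the transaction touched
     * the pg_database flat file and invalidated the relcache init file. */
    xinfo |= XACT_COMPLETION_UPDATE_DB_FILE;
    xinfo |= XACT_COMPLETION_UPDATE_RELCACHE_FILE;

    /* What redo on the standby would test before emulating the actions. */
    printf("db file: %d\n", (xinfo & XACT_COMPLETION_UPDATE_DB_FILE) != 0);
    printf("auth file: %d\n", (xinfo & XACT_COMPLETION_UPDATE_AUTH_FILE) != 0);
    return 0;
}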
extern bool InArchiveRecovery; extern XLogRecPtr XactLastRecEnd; /* these variables are GUC parameters related to XLOG */ *************** *** 143,148 **** extern bool XLogArchiveMode; --- 152,158 ---- extern char *XLogArchiveCommand; extern int XLogArchiveTimeout; extern bool log_checkpoints; + extern int maxStandbyDelay; #define XLogArchivingActive() (XLogArchiveMode) #define XLogArchiveCommandSet() (XLogArchiveCommand[0] != '\0') *************** *** 166,171 **** extern bool XLOG_DEBUG; --- 176,182 ---- /* These indicate the cause of a checkpoint request */ #define CHECKPOINT_CAUSE_XLOG 0x0010 /* XLOG consumption */ #define CHECKPOINT_CAUSE_TIME 0x0020 /* Elapsed time */ + #define CHECKPOINT_RESTARTPOINT 0x0040 /* Restartpoint during recovery */ /* Checkpoint statistics */ typedef struct CheckpointStatsData *************** *** 197,202 **** extern void XLogSetAsyncCommitLSN(XLogRecPtr record); --- 208,216 ---- extern void xlog_redo(XLogRecPtr lsn, XLogRecord *record); extern void xlog_desc(StringInfo buf, uint8 xl_info, char *rec); + extern bool IsRecoveryProcessingMode(void); + extern int GetLatestReplicationDelay(void); + extern void UpdateControlFile(void); extern Size XLOGShmemSize(void); extern void XLOGShmemInit(void); *** src/include/access/xlog_internal.h --- src/include/access/xlog_internal.h *************** *** 17,22 **** --- 17,23 ---- #define XLOG_INTERNAL_H #include "access/xlog.h" + #include "catalog/pg_control.h" #include "fmgr.h" #include "pgtime.h" #include "storage/block.h" *************** *** 71,77 **** typedef struct XLogContRecord /* * Each page of XLOG file has a header like this: */ ! #define XLOG_PAGE_MAGIC 0xD063 /* can be used as WAL version indicator */ typedef struct XLogPageHeaderData { --- 72,78 ---- /* * Each page of XLOG file has a header like this: */ ! #define XLOG_PAGE_MAGIC 0x5352 /* can be used as WAL version indicator */ typedef struct XLogPageHeaderData { *************** *** 245,250 **** extern const RmgrData RmgrTable[]; --- 246,254 ---- extern pg_time_t GetLastSegSwitchTime(void); extern XLogRecPtr RequestXLogSwitch(void); + extern void CreateRestartPoint(const XLogRecPtr ReadPtr, + const CheckPoint *restartPoint, int flags); + /* * These aren't in xlog.h because I'd rather not include fmgr.h there. */ *************** *** 255,259 **** extern Datum pg_current_xlog_location(PG_FUNCTION_ARGS); --- 259,273 ---- extern Datum pg_current_xlog_insert_location(PG_FUNCTION_ARGS); extern Datum pg_xlogfile_name_offset(PG_FUNCTION_ARGS); extern Datum pg_xlogfile_name(PG_FUNCTION_ARGS); + extern Datum pg_recovery_continue(PG_FUNCTION_ARGS); + extern Datum pg_recovery_pause(PG_FUNCTION_ARGS); + extern Datum pg_recovery_pause_cleanup(PG_FUNCTION_ARGS); + extern Datum pg_recovery_pause_xid(PG_FUNCTION_ARGS); + extern Datum pg_recovery_pause_time(PG_FUNCTION_ARGS); + extern Datum pg_recovery_advance(PG_FUNCTION_ARGS); + extern Datum pg_recovery_stop(PG_FUNCTION_ARGS); + extern Datum pg_is_in_recovery(PG_FUNCTION_ARGS); + extern Datum pg_last_completed_xact_timestamp(PG_FUNCTION_ARGS); + extern Datum pg_last_completed_xid(PG_FUNCTION_ARGS); #endif /* XLOG_INTERNAL_H */ *** src/include/access/xlogutils.h --- src/include/access/xlogutils.h *************** *** 26,33 **** extern void XLogTruncateRelation(RelFileNode rnode, ForkNumber forkNum, BlockNumber nblocks); extern Buffer XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init); extern Buffer XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum, ! 
BlockNumber blkno, ReadBufferMode mode); extern Relation CreateFakeRelcacheEntry(RelFileNode rnode); extern void FreeFakeRelcacheEntry(Relation fakerel); --- 26,34 ---- BlockNumber nblocks); extern Buffer XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init); + extern Buffer XLogReadBufferForCleanup(RelFileNode rnode, BlockNumber blkno, bool init); extern Buffer XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum, ! BlockNumber blkno, ReadBufferMode mode, int lockmode); extern Relation CreateFakeRelcacheEntry(RelFileNode rnode); extern void FreeFakeRelcacheEntry(Relation fakerel); *** src/include/catalog/pg_control.h --- src/include/catalog/pg_control.h *************** *** 21,27 **** /* Version identifier for this pg_control format */ ! #define PG_CONTROL_VERSION 843 /* * Body of CheckPoint XLOG records. This is declared here because we keep --- 21,28 ---- /* Version identifier for this pg_control format */ ! #define PG_CONTROL_VERSION 847 ! // xxx change me /* * Body of CheckPoint XLOG records. This is declared here because we keep *************** *** 46,52 **** typedef struct CheckPoint #define XLOG_NOOP 0x20 #define XLOG_NEXTOID 0x30 #define XLOG_SWITCH 0x40 ! /* System status indicator */ typedef enum DBState --- 47,58 ---- #define XLOG_NOOP 0x20 #define XLOG_NEXTOID 0x30 #define XLOG_SWITCH 0x40 ! /* ! * Prior to 8.4 we wrote a shutdown checkpoint when recovery completed. ! * Now we write an XLOG_RECOVERY_END record, which helps differentiate ! * between a checkpoint-at-shutdown and the startup case. ! */ ! #define XLOG_RECOVERY_END 0x50 /* System status indicator */ typedef enum DBState *************** *** 101,107 **** typedef struct ControlFileData --- 107,118 ---- CheckPoint checkPointCopy; /* copy of last check point record */ + /* + * Next two sound very similar, yet are distinct and necessary. + * Check comments in xlog.c for a full explanation not easily repeated. 
+ */ XLogRecPtr minRecoveryPoint; /* must replay xlog to here */ + XLogRecPtr minSafeStartPoint; /* safe point after recovery crashes */ /* * This data is used to check for hardware-architecture compatibility of *** src/include/catalog/pg_proc.h --- src/include/catalog/pg_proc.h *************** *** 3230,3235 **** DESCR("xlog filename and byte offset, given an xlog location"); --- 3230,3257 ---- DATA(insert OID = 2851 ( pg_xlogfile_name PGNSP PGUID 12 1 0 0 f f f t f i 1 0 25 "25" _null_ _null_ _null_ _null_ pg_xlogfile_name _null_ _null_ _null_ )); DESCR("xlog filename, given an xlog location"); + DATA(insert OID = 3801 ( pg_recovery_continue PGNSP PGUID 12 1 0 0 f f f t f v 0 0 2278 "" _null_ _null_ _null_ _null_ pg_recovery_continue _null_ _null_ _null_ )); + DESCR("if recovery is paused, continue with recovery"); + DATA(insert OID = 3802 ( pg_recovery_pause PGNSP PGUID 12 1 0 0 f f f t f v 0 0 2278 "" _null_ _null_ _null_ _null_ pg_recovery_pause _null_ _null_ _null_ )); + DESCR("pause recovery until recovery target reset"); + DATA(insert OID = 3803 ( pg_recovery_pause_cleanup PGNSP PGUID 12 1 0 0 f f f t f v 0 0 2278 "" _null_ _null_ _null_ _null_ pg_recovery_pause_cleanup _null_ _null_ _null_ )); + DESCR("continue recovery until cleanup record arrives, then pause recovery"); + DATA(insert OID = 3804 ( pg_recovery_pause_xid PGNSP PGUID 12 1 0 0 f f f t f v 1 0 2278 "23" _null_ _null_ _null_ _null_ pg_recovery_pause_xid _null_ _null_ _null_ )); + DESCR("continue recovery until specified xid completes, if ever seen, then pause recovery"); + DATA(insert OID = 3805 ( pg_recovery_pause_time PGNSP PGUID 12 1 0 0 f f f t f v 1 0 2278 "1184" _null_ _null_ _null_ _null_ pg_recovery_pause_time _null_ _null_ _null_ )); + DESCR("continue recovery until a transaction with specified timestamp completes, if ever seen, then pause recovery"); + DATA(insert OID = 3806 ( pg_recovery_advance PGNSP PGUID 12 1 0 0 f f f t f v 1 0 2278 "23" _null_ _null_ _null_ _null_ pg_recovery_advance _null_ _null_ _null_ )); + DESCR("continue recovery exactly specified number of records, then pause recovery"); + DATA(insert OID = 3807 ( pg_recovery_stop PGNSP PGUID 12 1 0 0 f f f t f v 0 0 2278 "" _null_ _null_ _null_ _null_ pg_recovery_stop _null_ _null_ _null_ )); + DESCR("stop recovery immediately"); + + DATA(insert OID = 3810 ( pg_is_in_recovery PGNSP PGUID 12 1 0 0 f f f t f v 0 0 16 "" _null_ _null_ _null_ _null_ pg_is_in_recovery _null_ _null_ _null_ )); + DESCR("true if server is in recovery"); + DATA(insert OID = 3811 ( pg_last_completed_xact_timestamp PGNSP PGUID 12 1 0 0 f f f t f v 0 0 1184 "" _null_ _null_ _null_ _null_ pg_last_completed_xact_timestamp _null_ _null_ _null_ )); + DESCR("timestamp of last commit or abort record that arrived during recovery, if any"); + DATA(insert OID = 3812 ( pg_last_completed_xid PGNSP PGUID 12 1 0 0 f f f t f v 0 0 28 "" _null_ _null_ _null_ _null_ pg_last_completed_xid _null_ _null_ _null_ )); + DESCR("xid of last commit or abort record that arrived during recovery, if any"); + DATA(insert OID = 2621 ( pg_reload_conf PGNSP PGUID 12 1 0 0 f f f t f v 0 0 16 "" _null_ _null_ _null_ _null_ pg_reload_conf _null_ _null_ _null_ )); DESCR("reload configuration files"); DATA(insert OID = 2622 ( pg_rotate_logfile PGNSP PGUID 12 1 0 0 f f f t f v 0 0 16 "" _null_ _null_ _null_ _null_ pg_rotate_logfile _null_ _null_ _null_ )); *** src/include/miscadmin.h --- src/include/miscadmin.h *************** *** 235,240 **** extern bool VacuumCostActive; --- 235,246 ---- /* in tcop/postgres.c 
*/ extern void check_stack_depth(void); + /* in tcop/utility.c */ + extern void PreventCommandDuringRecovery(void); + + /* in utils/misc/guc.c */ + extern int trace_recovery_messages; + int trace_recovery(int trace_level); /***************************************************************************** * pdir.h -- * *** src/include/postmaster/bgwriter.h --- src/include/postmaster/bgwriter.h *************** *** 12,17 **** --- 12,18 ---- #ifndef _BGWRITER_H #define _BGWRITER_H + #include "catalog/pg_control.h" #include "storage/block.h" #include "storage/relfilenode.h" *************** *** 25,30 **** extern double CheckPointCompletionTarget; --- 26,36 ---- extern void BackgroundWriterMain(void); extern void RequestCheckpoint(int flags); + extern void RequestRestartPoint(const XLogRecPtr ReadPtr, const CheckPoint *restartPoint, bool sendToBGWriter); + extern void RequestRestartPointCompletion(void); + extern XLogRecPtr GetRedoLocationForArchiveCheckpoint(void); + extern bool SetRedoLocationForArchiveCheckpoint(XLogRecPtr redo); + extern void CheckpointWriteDelay(int flags, double progress); extern bool ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum, *** src/include/storage/bufmgr.h --- src/include/storage/bufmgr.h *************** *** 67,72 **** extern PGDLLIMPORT int32 *LocalRefCount; --- 67,75 ---- #define BUFFER_LOCK_SHARE 1 #define BUFFER_LOCK_EXCLUSIVE 2 + /* Not used by LockBuffer, but is used by XLogReadBuffer... */ + #define BUFFER_LOCK_CLEANUP 3 + /* * These routines are beaten on quite heavily, hence the macroization. */ *************** *** 197,202 **** extern bool ConditionalLockBuffer(Buffer buffer); --- 200,209 ---- extern void LockBufferForCleanup(Buffer buffer); extern bool ConditionalLockBufferForCleanup(Buffer buffer); + extern void StartCleanupDelayStats(void); + extern void EndCleanupDelayStats(void); + extern void ReportCleanupDelayStats(void); + extern void AbortBufferIO(void); extern void BufmgrCommit(void); *** src/include/storage/pmsignal.h --- src/include/storage/pmsignal.h *************** *** 22,27 **** --- 22,28 ---- */ typedef enum { + PMSIGNAL_RECOVERY_START, /* move to PM_RECOVERY state */ PMSIGNAL_PASSWORD_CHANGE, /* pg_auth file has changed */ PMSIGNAL_WAKEN_ARCHIVER, /* send a NOTIFY signal to xlog archiver */ PMSIGNAL_ROTATE_LOGFILE, /* send SIGUSR1 to syslogger to rotate logfile */ *** src/include/storage/proc.h --- src/include/storage/proc.h *************** *** 14,19 **** --- 14,20 ---- #ifndef _PROC_H_ #define _PROC_H_ + #include "access/xlog.h" #include "storage/lock.h" #include "storage/pg_sema.h" *************** *** 93,98 **** struct PGPROC --- 94,107 ---- uint8 vacuumFlags; /* vacuum-related flags, see above */ + /* + * The lsn field exists to allow procs to be used during recovery + * for managing snapshot data for standby servers. The lsn allows + * us to disambiguate any incoming information so we always respect + * the latest info. + */ + XLogRecPtr lsn; /* Last LSN which maintained state of Recovery Proc */ + /* Info about LWLock the process is currently waiting for, if any. 
*** src/include/storage/proc.h
--- src/include/storage/proc.h
***************
*** 14,19 ****
--- 14,20 ----
  #ifndef _PROC_H_
  #define _PROC_H_
  
+ #include "access/xlog.h"
  #include "storage/lock.h"
  #include "storage/pg_sema.h"
*************** struct PGPROC
*** 93,98 ****
--- 94,107 ----
  	uint8		vacuumFlags;	/* vacuum-related flags, see above */
  
+ 	/*
+ 	 * The lsn field allows procs to be used during recovery to manage
+ 	 * snapshot data for standby servers.  The lsn lets us disambiguate
+ 	 * incoming information, so that we always respect the latest update.
+ 	 */
+ 	XLogRecPtr	lsn;			/* last LSN that updated the state of this
+ 								 * recovery proc */
+ 
  	/* Info about LWLock the process is currently waiting for, if any. */
  	bool		lwWaiting;		/* true if waiting for an LW lock */
  	bool		lwExclusive;	/* true if waiting for exclusive access */
*************** typedef struct PROC_HDR
*** 133,138 ****
--- 142,150 ----
  	PGPROC	   *autovacFreeProcs;
  	/* Current shared estimate of appropriate spins_per_delay value */
  	int			spins_per_delay;
+ 	/* The PGPROC of the Startup process, since it is not in the ProcArray */
+ 	PGPROC	   *startupProc;
+ 	int			startupProcPid;
  } PROC_HDR;
  
  /*
***************
*** 157,164 ****
  extern int	ProcGlobalSemas(void);
--- 169,180 ----
  extern Size ProcGlobalShmemSize(void);
  extern void InitProcGlobal(void);
  extern void InitProcess(void);
+ extern PGPROC *InitRecoveryProcess(void);
+ extern void FreeRecoveryProcess(PGPROC *proc);
  extern void InitProcessPhase2(void);
  extern void InitAuxiliaryProcess(void);
+ extern void PublishStartupProcessInformation(void);
+ 
  extern bool HaveNFreeProcs(int n);
  extern void ProcReleaseLocks(bool isCommit);
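Reviewer note, not part of the patch: since the Startup process is not in the
ProcArray, the new startupProc fields above give backends a way to find it.
A sketch of how PublishStartupProcessInformation() might fill them in,
assuming the ProcStructLock spinlock and ProcGlobal pointer that proc.c
already maintains:

    #include "postgres.h"
    #include "miscadmin.h"
    #include "storage/proc.h"
    #include "storage/spin.h"

    extern PROC_HDR *ProcGlobal;       /* shared PROC_HDR, from proc.c */
    extern slock_t *ProcStructLock;    /* spinlock protecting it (assumed
                                        * to be exported for this purpose) */

    /* Advertise the Startup process's PGPROC so backends can find it */
    void
    PublishStartupProcessInformation(void)
    {
        SpinLockAcquire(ProcStructLock);
        ProcGlobal->startupProc = MyProc;
        ProcGlobal->startupProcPid = MyProcPid;
        SpinLockRelease(ProcStructLock);
    }

The Startup process would call this once, early in recovery, before any
backend needs to signal it.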
*** src/include/storage/procarray.h
--- src/include/storage/procarray.h
***************
*** 14,19 ****
--- 14,20 ----
  #ifndef PROCARRAY_H
  #define PROCARRAY_H
  
+ #include "access/xact.h"
  #include "storage/lock.h"
  #include "utils/snapshot.h"
***************
*** 23,31 ****
  extern void CreateSharedProcArray(void);
  extern void ProcArrayAdd(PGPROC *proc);
  extern void ProcArrayRemove(PGPROC *proc, TransactionId latestXid);
  
! extern void ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid);
  extern void ProcArrayClearTransaction(PGPROC *proc);
  
  extern Snapshot GetSnapshotData(Snapshot snapshot);
  extern bool TransactionIdIsInProgress(TransactionId xid);
--- 24,40 ----
  extern void ProcArrayAdd(PGPROC *proc);
  extern void ProcArrayRemove(PGPROC *proc, TransactionId latestXid);
  
! extern void ProcArrayInitRecoveryEnvironment(void);
! extern void ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid,
! 					int nsubxids, TransactionId *subxids);
  extern void ProcArrayClearTransaction(PGPROC *proc);
+ extern void ProcArrayClearRecoveryTransactions(void);
+ extern bool XidInRecoveryProcs(TransactionId xid);
+ extern void ProcArrayDisplay(int trace_level);
+ extern void ProcArrayUpdateRecoveryTransactions(XLogRecPtr lsn,
+ 					xl_xact_running_xacts *xlrec);
+ extern RunningTransactions GetRunningTransactionData(void);
  
  extern Snapshot GetSnapshotData(Snapshot snapshot);
  extern bool TransactionIdIsInProgress(TransactionId xid);
***************
*** 36,46 ****
  extern int	GetTransactionsInCommit(TransactionId **xids_p);
  extern bool HaveTransactionsInCommit(TransactionId *xids, int nxids);
  
  extern PGPROC *BackendPidGetProc(int pid);
  extern int	BackendXidGetPid(TransactionId xid);
  extern bool IsBackendPid(int pid);
  
! extern VirtualTransactionId *GetCurrentVirtualXIDs(TransactionId limitXmin,
! 					bool allDbs, int excludeVacuum);
  extern int	CountActiveBackends(void);
  extern int	CountDBBackends(Oid databaseid);
  extern int	CountUserBackends(Oid roleid);
--- 45,58 ----
  extern bool HaveTransactionsInCommit(TransactionId *xids, int nxids);
  
  extern PGPROC *BackendPidGetProc(int pid);
+ extern PGPROC *BackendXidGetProc(TransactionId xid);
  extern int	BackendXidGetPid(TransactionId xid);
  extern bool IsBackendPid(int pid);
  
! extern VirtualTransactionId *GetCurrentVirtualXIDs(TransactionId limitXmin,
! 					Oid dbOid, int excludeVacuum);
! extern int	VirtualTransactionIdGetPid(VirtualTransactionId vxid);
! 
  extern int	CountActiveBackends(void);
  extern int	CountDBBackends(Oid databaseid);
  extern int	CountUserBackends(Oid roleid);
***************
*** 51,54 ****
  extern void XidCacheRemoveRunningXids(TransactionId xid,
--- 63,76 ----
  					int nxids, const TransactionId *xids,
  					TransactionId latestXid);
  
+ /* Primitives for UnobservedXids array handling for standby */
+ extern void UnobservedTransactionsAddXids(TransactionId firstXid,
+ 					TransactionId lastXid);
+ extern void UnobservedTransactionsRemoveXid(TransactionId xid,
+ 					bool missing_is_error);
+ extern void UnobservedTransactionsPruneXids(TransactionId limitXid);
+ extern void UnobservedTransactionsClearXids(void);
+ extern void UnobservedTransactionsDisplay(int trace_level);
+ extern bool XidInUnobservedTransactions(TransactionId xid);
+ 
  #endif   /* PROCARRAY_H */
*** src/include/storage/sinval.h
--- src/include/storage/sinval.h
***************
*** 89,94 ****
  extern void ReceiveSharedInvalidMessages(
--- 89,132 ----
  					void (*invalFunction) (SharedInvalidationMessage *msg),
  					void (*resetFunction) (void));
  
+ extern int	xactGetCommittedInvalidationMessages(SharedInvalidationMessage **msgs,
+ 					bool *RelcacheInitFileInval);
+ 
+ /*
+  * Relation Rmgr (RM_RELATION_ID)
+  *
+  * Relation recovery manager exists to allow locks and certain kinds of
+  * invalidation message to be passed across to a standby server.
+  */
+ 
+ extern void RelationReleaseRecoveryLocks(TransactionId xid);
+ extern void RelationClearRecoveryLocks(void);
+ 
+ /* Recovery handlers for the Relation Rmgr (RM_RELATION_ID) */
+ extern void relation_redo(XLogRecPtr lsn, XLogRecord *record);
+ extern void relation_desc(StringInfo buf, uint8 xl_info, char *rec);
+ 
+ /*
+  * XLOG message types
+  */
+ #define XLOG_RELATION_INVAL	0x00
+ #define XLOG_RELATION_LOCK	0x10
+ 
+ typedef struct xl_rel_inval
+ {
+ 	int			nmsgs;			/* number of shared inval msgs */
+ 	SharedInvalidationMessage msgs[1];	/* VARIABLE LENGTH ARRAY */
+ } xl_rel_inval;
+ 
+ #define MinSizeOfRelationInval offsetof(xl_rel_inval, msgs)
+ 
+ typedef struct xl_rel_lock
+ {
+ 	TransactionId xid;			/* xid of the *parent* transaction. XXX why parent? */
+ 	Oid			dbOid;
+ 	Oid			relOid;
+ } xl_rel_lock;
+ 
  /* signal handler for catchup events (SIGUSR1) */
  extern void CatchupInterruptHandler(SIGNAL_ARGS);
*** src/include/storage/sinvaladt.h
--- src/include/storage/sinvaladt.h
***************
*** 29,35 ****
   */
  extern Size SInvalShmemSize(void);
  extern void CreateSharedInvalidationState(void);
! extern void SharedInvalBackendInit(void);
  extern bool BackendIdIsActive(int backendID);
  
  extern void SIInsertDataEntries(const SharedInvalidationMessage *data, int n);
--- 29,35 ----
   */
  extern Size SInvalShmemSize(void);
  extern void CreateSharedInvalidationState(void);
! extern void SharedInvalBackendInit(bool sendOnly);
  extern bool BackendIdIsActive(int backendID);
  
  extern void SIInsertDataEntries(const SharedInvalidationMessage *data, int n);
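Reviewer note, not part of the patch: the redo side of the Relation Rmgr
declared above presumably dispatches on the two record types. A sketch under
that assumption; SendSharedInvalidMessages() is the existing sinval.c entry
point, while StandbyAcquireRecoveryLock() is a purely hypothetical helper
name for taking the lock on the xid's behalf:

    #include "postgres.h"
    #include "access/xlog.h"
    #include "storage/sinval.h"

    void
    relation_redo(XLogRecPtr lsn, XLogRecord *record)
    {
        uint8       info = record->xl_info & ~XLR_INFO_MASK;

        if (info == XLOG_RELATION_INVAL)
        {
            xl_rel_inval *xlrec = (xl_rel_inval *) XLogRecGetData(record);

            /* re-deliver the shared-inval messages on the standby */
            SendSharedInvalidMessages(xlrec->msgs, xlrec->nmsgs);
        }
        else if (info == XLOG_RELATION_LOCK)
        {
            xl_rel_lock *xlrec = (xl_rel_lock *) XLogRecGetData(record);

            /* hypothetical helper: hold the lock until xid completes */
            StandbyAcquireRecoveryLock(xlrec->xid, xlrec->dbOid, xlrec->relOid);
        }
        else
            elog(PANIC, "relation_redo: unknown op code %u", info);
    }

RelationReleaseRecoveryLocks(xid) would then be the matching release path,
called when the parent transaction's commit or abort record is replayed.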
*** src/include/utils/flatfiles.h
--- src/include/utils/flatfiles.h
***************
*** 27,32 ****
  extern void AtEOSubXact_UpdateFlatFiles(bool isCommit,
--- 27,39 ----
  					SubTransactionId mySubid,
  					SubTransactionId parentSubid);
  
+ /*
+  * Called by RecordTransactionCommit to allow it to set xinfo flags on the
+  * commit record.  Used for standby invalidation of flat files.
+  */
+ extern bool AtEOXact_Database_FlatFile_Update_Needed(void);
+ extern bool AtEOXact_Auth_FlatFile_Update_Needed(void);
+ 
  extern Datum flatfile_update_trigger(PG_FUNCTION_ARGS);
  
  extern void flatfile_twophase_postcommit(TransactionId xid, uint16 info,
*** src/include/utils/inval.h
--- src/include/utils/inval.h
***************
*** 15,20 ****
--- 15,21 ----
  #define INVAL_H
  
  #include "access/htup.h"
+ #include "storage/lock.h"
  #include "utils/relcache.h"
***************
*** 60,63 ****
  extern void CacheRegisterRelcacheCallback(RelcacheCallbackFunction func,
--- 61,67 ----
  extern void inval_twophase_postcommit(TransactionId xid, uint16 info,
  					void *recdata, uint32 len);
  
+ extern void ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist,
+ 					char *reason);
+ 
  #endif   /* INVAL_H */
*** src/include/utils/snapshot.h
--- src/include/utils/snapshot.h
*************** typedef struct SnapshotData
*** 49,55 ****
  	uint32		xcnt;			/* # of xact ids in xip[] */
  	TransactionId *xip;			/* array of xact IDs in progress */
  	/* note: all ids in xip[] satisfy xmin <= xip[i] < xmax */
! 	int32		subxcnt;		/* # of xact ids in subxip[], -1 if overflow */
  	TransactionId *subxip;		/* array of subxact IDs in progress */
  
  	/*
--- 49,65 ----
  	uint32		xcnt;			/* # of xact ids in xip[] */
  	TransactionId *xip;			/* array of xact IDs in progress */
  	/* note: all ids in xip[] satisfy xmin <= xip[i] < xmax */
! 
! 	/*
! 	 * Prior to 8.4 we represented an overflowed subxid cache with
! 	 * subxcnt = -1.  In 8.4+ we separate the two concepts, because when
! 	 * checking the xids in the snapshot we check *both* the subxid cache
! 	 * and subtrans, if the subxid cache has overflowed.  So we still need
! 	 * the count, even when overflowed.  We do this to allow unobserved
! 	 * xids to be placed into the snapshot even when the snapshot
! 	 * overflows.  It is also a performance gain.
! 	 */
! 	uint32		subxcnt;		/* # of xact ids in subxip[] */
! 	bool		suboverflowed;	/* true if at least one subxid cache overflowed */
  	TransactionId *subxip;		/* array of subxact IDs in progress */
  
  	/*
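Reviewer note, not part of the patch: the comment above implies a two-step
membership test for overflowed snapshots — consult the copied subxid cache
first, then fall back to pg_subtrans and the always-complete xip[] array. A
sketch of that rule, assuming SubTransGetTopmostTransaction() from
subtrans.c; this illustrates the check, it is not code from the patch:

    #include "postgres.h"
    #include "access/subtrans.h"
    #include "access/transam.h"
    #include "utils/snapshot.h"

    /* Is xid (or its top-level parent) listed as running in the snapshot? */
    static bool
    XidInSnapshot(TransactionId xid, Snapshot snapshot)
    {
        uint32      i;

        /* fast path: the subxid cache captured with the snapshot */
        for (i = 0; i < snapshot->subxcnt; i++)
        {
            if (TransactionIdEquals(xid, snapshot->subxip[i]))
                return true;
        }

        /*
         * If any subxid cache overflowed, xid may be a subxact missing from
         * subxip[]; map it to its top-level parent via pg_subtrans, then
         * check the main xip[] array, which is always complete.
         */
        if (snapshot->suboverflowed)
            xid = SubTransGetTopmostTransaction(xid);

        for (i = 0; i < snapshot->xcnt; i++)
        {
            if (TransactionIdEquals(xid, snapshot->xip[i]))
                return true;
        }

        return false;
    }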
*************** typedef struct SnapshotData
*** 63,68 ****
--- 73,147 ----
  } SnapshotData;
  
+ /*
+  * Declarations for GetRunningTransactionData().  Similar to Snapshots, but
+  * not quite.  This has nothing at all to do with visibility on this server,
+  * so it is completely separate from snapmgr.c and snapmgr.h.  This data is
+  * important for creating the initial snapshot state on a standby server.
+  * We need much more information than a normal snapshot, hence we use a
+  * specific data structure for our needs.  This data is written to WAL as a
+  * separate record immediately after each checkpoint.  That means that
+  * wherever we start a standby from, we will almost immediately see the
+  * data we need to begin executing queries.
+  */
+ typedef struct RunningXact
+ {
+ 	/* Items matching PGPROC entries */
+ 	TransactionId xid;			/* xact ID in progress */
+ 	int			pid;			/* backend's process id, or 0 */
+ 	Oid			databaseId;		/* OID of database this backend is using */
+ 	Oid			roleId;			/* OID of role using this backend */
+ 	uint8		vacuumFlags;	/* vacuum-related flags, see proc.h */
+ 
+ 	/* Items matching XidCache */
+ 	bool		overflowed;
+ 	int			nsubxids;		/* # of subxact ids for this xact only */
+ 
+ 	/* Additional info */
+ 	uint32		subx_offset;	/* array offset of start of subxip,
+ 								 * zero if nsubxids == 0 */
+ } RunningXact;
+ 
+ typedef struct RunningXactsData
+ {
+ 	uint32		xcnt;			/* # of xact ids in xrun[] */
+ 	uint32		subxcnt;		/* total # of xact ids in subxip[] */
+ 	TransactionId latestRunningXid;	/* initial setting of LatestObservedXid */
+ 	TransactionId latestCompletedXid;
+ 
+ 	RunningXact *xrun;			/* array of RunningXact structs */
+ 
+ 	/*
+ 	 * subxip is held as a single contiguous array, so no space is wasted;
+ 	 * this also helps the record fit into one XLogRecord.  We keep track
+ 	 * of which subxids go with each top-level xid via the start offset
+ 	 * held in each RunningXact struct.
+ 	 */
+ 	TransactionId *subxip;		/* array of subxact IDs in progress */
+ 
+ } RunningXactsData;
+ 
+ typedef RunningXactsData *RunningTransactions;
+ 
+ /*
+  * When we write running xact data to WAL, we use this structure.
+  */
+ typedef struct xl_xact_running_xacts
+ {
+ 	int			xcnt;			/* # of xact ids in xrun[] */
+ 	int			subxcnt;		/* # of xact ids in subxip[] */
+ 	TransactionId latestRunningXid;	/* initial setting of LatestObservedXid */
+ 	TransactionId latestCompletedXid;
+ 
+ 	/* Array of RunningXact(s) */
+ 	RunningXact xrun[1];		/* VARIABLE LENGTH ARRAY */
+ 
+ 	/* ARRAY OF RUNNING SUBTRANSACTION XIDs FOLLOWS */
+ } xl_xact_running_xacts;
+ 
+ #define MinSizeOfXactRunningXacts offsetof(xl_xact_running_xacts, xrun)
+ 
  /*
   * Result codes for HeapTupleSatisfiesUpdate.  This should really be in
   * tqual.h, but we want to avoid including that file elsewhere.
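Reviewer note, not part of the patch: because subxip is one contiguous array
placed immediately after xrun[], a consumer such as
ProcArrayUpdateRecoveryTransactions() can recover each transaction's subxids
from subx_offset alone. A minimal sketch of the walk, assuming xlrec points
at a complete record body; walk_running_xacts is an illustrative name:

    #include "postgres.h"
    #include "utils/snapshot.h"

    /* Visit every top-level xid and its subxids in a running-xacts record */
    static void
    walk_running_xacts(xl_xact_running_xacts *xlrec)
    {
        /* the subxid array starts right after the last RunningXact entry */
        TransactionId *subxip = (TransactionId *) &xlrec->xrun[xlrec->xcnt];
        int         i;

        for (i = 0; i < xlrec->xcnt; i++)
        {
            RunningXact *rx = &xlrec->xrun[i];
            int         j;

            /* rx->xid is running; its subxids occupy a contiguous slice */
            for (j = 0; j < rx->nsubxids; j++)
            {
                TransactionId subxid = subxip[rx->subx_offset + j];

                (void) subxid;  /* process the (xid, subxid) pair here */
            }
        }
    }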