Hot standby, RestoreBkpBlocks and cleanup locks - Mailing list pgsql-hackers

From Heikki Linnakangas
Subject Hot standby, RestoreBkpBlocks and cleanup locks
Date
Msg-id 49677BC3.5090105@enterprisedb.com
Whole thread Raw
Responses Re: Hot standby, RestoreBkpBlocks and cleanup locks  (Simon Riggs <simon@2ndQuadrant.com>)
Re: Hot standby, RestoreBkpBlocks and cleanup locks  (Simon Riggs <simon@2ndQuadrant.com>)
Re: Hot standby, RestoreBkpBlocks and cleanup locks  (Simon Riggs <simon@2ndQuadrant.com>)
List pgsql-hackers
The hot standby patch has some hacks to decide which full-page-images
can be restored holding an exclusive lock and which ones need a
vacuum-strength lock. It's not very pretty as is, as mentioned in
comments too.

How about we refactor things so that redo-functions are responsible for
calling RestoreBkpBlocks? The redo function can then pass an argument
indicating what kind of lock is required. We should also change
XLogReadBufferExtended so that it doesn't lock the page; the caller
knows better what kind of a lock it needs. That makes it more analogous
with ReadBufferExtended too, although I think we should keep
XLogReadBuffer() unchanged for now.

See attached patch. One shortfall of this patch is that you can pass
only one argument to RestoreBkpBlocks, but there can multiple backup
blocks in one WAL record. That's OK in current usage, though.

--
   Heikki Linnakangas
   EnterpriseDB   http://www.enterprisedb.com
*** src/backend/access/gin/ginxlog.c
--- src/backend/access/gin/ginxlog.c
***************
*** 438,443 **** gin_redo(XLogRecPtr lsn, XLogRecord *record)
--- 438,445 ----
  {
      uint8        info = record->xl_info & ~XLR_INFO_MASK;

+     RestoreBkpBlocks(lsn, record, false);
+
      topCtx = MemoryContextSwitchTo(opCtx);
      switch (info)
      {
*** src/backend/access/gist/gistxlog.c
--- src/backend/access/gist/gistxlog.c
***************
*** 395,400 **** gist_redo(XLogRecPtr lsn, XLogRecord *record)
--- 395,402 ----
  {
      uint8        info = record->xl_info & ~XLR_INFO_MASK;

+     RestoreBkpBlocks(lsn, record, false);
+
      MemoryContext oldCxt;

      oldCxt = MemoryContextSwitchTo(opCtx);
*** src/backend/access/heap/heapam.c
--- src/backend/access/heap/heapam.c
***************
*** 4777,4782 **** heap_redo(XLogRecPtr lsn, XLogRecord *record)
--- 4777,4784 ----
  {
      uint8        info = record->xl_info & ~XLR_INFO_MASK;

+     RestoreBkpBlocks(lsn, record, false);
+
      switch (info & XLOG_HEAP_OPMASK)
      {
          case XLOG_HEAP_INSERT:
***************
*** 4816,4827 **** heap2_redo(XLogRecPtr lsn, XLogRecord *record)
--- 4818,4832 ----
      switch (info & XLOG_HEAP_OPMASK)
      {
          case XLOG_HEAP2_FREEZE:
+             RestoreBkpBlocks(lsn, record, false);
              heap_xlog_freeze(lsn, record);
              break;
          case XLOG_HEAP2_CLEAN:
+             RestoreBkpBlocks(lsn, record, true);
              heap_xlog_clean(lsn, record, false);
              break;
          case XLOG_HEAP2_CLEAN_MOVE:
+             RestoreBkpBlocks(lsn, record, true);
              heap_xlog_clean(lsn, record, true);
              break;
          default:
*** src/backend/access/nbtree/nbtxlog.c
--- src/backend/access/nbtree/nbtxlog.c
***************
*** 714,719 **** btree_redo(XLogRecPtr lsn, XLogRecord *record)
--- 714,721 ----
  {
      uint8        info = record->xl_info & ~XLR_INFO_MASK;

+     RestoreBkpBlocks(lsn, record, false);
+
      switch (info)
      {
          case XLOG_BTREE_INSERT_LEAF:
*** src/backend/access/transam/xlog.c
--- src/backend/access/transam/xlog.c
***************
*** 2934,2941 **** CleanupBackupHistory(void)
   * modifications of the page that appear in XLOG, rather than possibly
   * ignoring them as already applied, but that's not a huge drawback.
   */
! static void
! RestoreBkpBlocks(XLogRecord *record, XLogRecPtr lsn)
  {
      Buffer        buffer;
      Page        page;
--- 2934,2941 ----
   * modifications of the page that appear in XLOG, rather than possibly
   * ignoring them as already applied, but that's not a huge drawback.
   */
! void
! RestoreBkpBlocks(XLogRecPtr lsn, XLogRecord *record, bool cleanup)
  {
      Buffer        buffer;
      Page        page;
***************
*** 2943,2948 **** RestoreBkpBlocks(XLogRecord *record, XLogRecPtr lsn)
--- 2943,2951 ----
      char       *blk;
      int            i;

+     if (!(record->xl_info & XLR_BKP_BLOCK_MASK))
+         return;
+
      blk = (char *) XLogRecGetData(record) + record->xl_len;
      for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
      {
***************
*** 2955,2960 **** RestoreBkpBlocks(XLogRecord *record, XLogRecPtr lsn)
--- 2958,2968 ----
          buffer = XLogReadBufferExtended(bkpb.node, bkpb.fork, bkpb.block,
                                          RBM_ZERO);
          Assert(BufferIsValid(buffer));
+         if (cleanup)
+             LockBufferForCleanup(buffer);
+         else
+             LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+
          page = (Page) BufferGetPage(buffer);

          if (bkpb.hole_length == 0)
***************
*** 5210,5218 **** StartupXLOG(void)
                      TransactionIdAdvance(ShmemVariableCache->nextXid);
                  }

-                 if (record->xl_info & XLR_BKP_BLOCK_MASK)
-                     RestoreBkpBlocks(record, EndRecPtr);
-
                  RmgrTable[record->xl_rmid].rm_redo(EndRecPtr, record);

                  /* Pop the error context stack */
--- 5218,5223 ----
*** src/backend/access/transam/xlogutils.c
--- src/backend/access/transam/xlogutils.c
***************
*** 217,224 **** XLogCheckInvalidPages(void)

  /*
   * XLogReadBuffer
!  *        A shorthand of XLogReadBufferExtended(), for reading from the main
!  *        fork.
   *
   * For historical reasons, instead of a ReadBufferMode argument, this only
   * supports RBM_ZERO (init == true) and RBM_NORMAL (init == false) modes.
--- 217,234 ----

  /*
   * XLogReadBuffer
!  *        Read a page during XLOG replay.
!  *
!  * This is a shorthand of XLogReadBufferExtended() followed by
!  * LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE), for reading from the main
!  * fork.
!  *
!  * (Getting the lock is not really necessary, since we
!  * expect that this is only used during single-process XLOG replay, but
!  * some subroutines such as MarkBufferDirty will complain if we don't.)
!  * XXX: but it will be with the hot standby patch.
!  *
!  * The returned buffer is exclusively-locked.
   *
   * For historical reasons, instead of a ReadBufferMode argument, this only
   * supports RBM_ZERO (init == true) and RBM_NORMAL (init == false) modes.
***************
*** 226,247 **** XLogCheckInvalidPages(void)
  Buffer
  XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init)
  {
!     return XLogReadBufferExtended(rnode, MAIN_FORKNUM, blkno,
!                                   init ? RBM_ZERO : RBM_NORMAL);
  }

  /*
   * XLogReadBufferExtended
   *        Read a page during XLOG replay
   *
!  * This is functionally comparable to ReadBuffer followed by
!  * LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE): you get back a pinned
!  * and locked buffer.  (Getting the lock is not really necessary, since we
!  * expect that this is only used during single-process XLOG replay, but
!  * some subroutines such as MarkBufferDirty will complain if we don't.)
!  *
!  * There's some differences in the behavior wrt. the "mode" argument,
!  * compared to ReadBufferExtended:
   *
   * In RBM_NORMAL mode, if the page doesn't exist, or contains all-zeroes, we
   * return InvalidBuffer. In this case the caller should silently skip the
--- 236,256 ----
  Buffer
  XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init)
  {
!     Buffer buf;
!     buf = XLogReadBufferExtended(rnode, MAIN_FORKNUM, blkno,
!                                  init ? RBM_ZERO : RBM_NORMAL);
!     if (BufferIsValid(buf))
!         LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
!
!     return buf;
  }

  /*
   * XLogReadBufferExtended
   *        Read a page during XLOG replay
   *
!  * This is functionally comparable to ReadBufferExtended. There's some
!  * differences in the behavior wrt. the "mode" argument:
   *
   * In RBM_NORMAL mode, if the page doesn't exist, or contains all-zeroes, we
   * return InvalidBuffer. In this case the caller should silently skip the
***************
*** 306,321 **** XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
          Assert(BufferGetBlockNumber(buffer) == blkno);
      }

-     LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
-
      if (mode == RBM_NORMAL)
      {
          /* check that page has been initialized */
          Page        page = (Page) BufferGetPage(buffer);

          if (PageIsNew(page))
          {
-             UnlockReleaseBuffer(buffer);
              log_invalid_page(rnode, forknum, blkno, true);
              return InvalidBuffer;
          }
--- 315,331 ----
          Assert(BufferGetBlockNumber(buffer) == blkno);
      }

      if (mode == RBM_NORMAL)
      {
          /* check that page has been initialized */
          Page        page = (Page) BufferGetPage(buffer);

+         /*
+          * We assume that PageIsNew works without a lock. During recovery,
+          * no other backend should modify the buffer at the same time.
+          */
          if (PageIsNew(page))
          {
              log_invalid_page(rnode, forknum, blkno, true);
              return InvalidBuffer;
          }
*** src/backend/commands/sequence.c
--- src/backend/commands/sequence.c
***************
*** 1339,1344 **** seq_redo(XLogRecPtr lsn, XLogRecord *record)
--- 1339,1346 ----
      xl_seq_rec *xlrec = (xl_seq_rec *) XLogRecGetData(record);
      sequence_magic *sm;

+     RestoreBkpBlocks(lsn, record, false);
+
      if (info != XLOG_SEQ_LOG)
          elog(PANIC, "seq_redo: unknown op code %u", info);

*** src/backend/storage/freespace/freespace.c
--- src/backend/storage/freespace/freespace.c
***************
*** 212,217 **** XLogRecordPageWithFreeSpace(RelFileNode rnode, BlockNumber heapBlk,
--- 212,219 ----

      /* If the page doesn't exist already, extend */
      buf = XLogReadBufferExtended(rnode, FSM_FORKNUM, blkno, RBM_ZERO_ON_ERROR);
+     LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+
      page = BufferGetPage(buf);
      if (PageIsNew(page))
          PageInit(page, BLCKSZ, 0);
*** src/include/access/xlog.h
--- src/include/access/xlog.h
***************
*** 197,202 **** extern void XLogSetAsyncCommitLSN(XLogRecPtr record);
--- 197,204 ----
  extern void xlog_redo(XLogRecPtr lsn, XLogRecord *record);
  extern void xlog_desc(StringInfo buf, uint8 xl_info, char *rec);

+ extern void RestoreBkpBlocks(XLogRecPtr lsn, XLogRecord *record, bool cleanup);
+
  extern void UpdateControlFile(void);
  extern Size XLOGShmemSize(void);
  extern void XLOGShmemInit(void);

pgsql-hackers by date:

Previous
From: Simon Riggs
Date:
Subject: Re: Improving compressibility of WAL files
Next
From: Aidan Van Dyk
Date:
Subject: Re: Improving compressibility of WAL files