Index split WAL reduction - Mailing list pgsql-patches

From Heikki Linnakangas
Subject Index split WAL reduction
Date
Msg-id 45755AD9.1040503@enterprisedb.com
Whole thread Raw
Responses Re: Index split WAL reduction
List pgsql-patches
Hi,

Currently, an index split writes all the data on the split page to WAL.
That's a lot of WAL traffic. The tuples that are copied to the right
page need to be WAL logged, but the tuples that stay on the original
page don't.

Here's a patch to do that. It needs further testing: I have used the
attached crude crashtest.sh to test the basics, but we still need to test
the more obscure cases, like splitting a non-leaf or root page.

On a test case that inserts 10000 rows in increasing key order with a
100-character-wide text field as the key, the patch reduced the total
generated WAL traffic from 45MB to 33MB, or about 25%. Your mileage may
vary, depending on the tuple and key sizes and the order of inserts.

Anyone see a problem with this?

--
   Heikki Linnakangas
   EnterpriseDB   http://www.enterprisedb.com
Index: src/backend/access/nbtree/nbtinsert.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/access/nbtree/nbtinsert.c,v
retrieving revision 1.146
diff -c -r1.146 nbtinsert.c
*** src/backend/access/nbtree/nbtinsert.c    11 Nov 2006 01:14:18 -0000    1.146
--- src/backend/access/nbtree/nbtinsert.c    4 Dec 2006 12:31:13 -0000
***************
*** 932,985 ****
          xl_btree_split xlrec;
          uint8        xlinfo;
          XLogRecPtr    recptr;
!         XLogRecData rdata[4];

!         xlrec.target.node = rel->rd_node;
!         ItemPointerSet(&(xlrec.target.tid), itup_blkno, itup_off);
          if (newitemonleft)
!             xlrec.otherblk = BufferGetBlockNumber(rbuf);
          else
!             xlrec.otherblk = BufferGetBlockNumber(buf);
!         xlrec.leftblk = lopaque->btpo_prev;
!         xlrec.rightblk = ropaque->btpo_next;
!         xlrec.level = lopaque->btpo.level;

!         /*
           * Direct access to page is not good but faster - we should implement
           * some new func in page API.  Note we only store the tuples
           * themselves, knowing that the item pointers are in the same order
           * and can be reconstructed by scanning the tuples.  See comments for
           * _bt_restore_page().
           */
!         xlrec.leftlen = ((PageHeader) leftpage)->pd_special -
!             ((PageHeader) leftpage)->pd_upper;

!         rdata[0].data = (char *) &xlrec;
!         rdata[0].len = SizeOfBtreeSplit;
!         rdata[0].buffer = InvalidBuffer;
!         rdata[0].next = &(rdata[1]);
!
!         rdata[1].data = (char *) leftpage + ((PageHeader) leftpage)->pd_upper;
!         rdata[1].len = xlrec.leftlen;
!         rdata[1].buffer = InvalidBuffer;
!         rdata[1].next = &(rdata[2]);
!
!         rdata[2].data = (char *) rightpage + ((PageHeader) rightpage)->pd_upper;
!         rdata[2].len = ((PageHeader) rightpage)->pd_special -
              ((PageHeader) rightpage)->pd_upper;
!         rdata[2].buffer = InvalidBuffer;
!         rdata[2].next = NULL;

          if (!P_RIGHTMOST(ropaque))
          {
!             rdata[2].next = &(rdata[3]);
!             rdata[3].data = NULL;
!             rdata[3].len = 0;
!             rdata[3].buffer = sbuf;
!             rdata[3].buffer_std = true;
!             rdata[3].next = NULL;
          }

          if (P_ISROOT(oopaque))
              xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L_ROOT : XLOG_BTREE_SPLIT_R_ROOT;
          else
--- 932,1026 ----
          xl_btree_split xlrec;
          uint8        xlinfo;
          XLogRecPtr    recptr;
!         XLogRecData rdata[7];
!         XLogRecData *lastrdata;

!         xlrec.node = rel->rd_node;
!         xlrec.leftsib = BufferGetBlockNumber(buf);
!         xlrec.rightsib = BufferGetBlockNumber(rbuf);
!         xlrec.firstright = firstright;
!         xlrec.rnext = ropaque->btpo_next;
!         xlrec.level = lopaque->btpo.level;
!
!         rdata[0].data = (char *) &xlrec;
!         rdata[0].len = SizeOfBtreeSplit;
!         rdata[0].buffer = InvalidBuffer;
!
!         lastrdata = &rdata[0];
!
!         /* Log downlink on non-leaf pages. */
!
!         if (lopaque->btpo.level > 0)
!         {
!             lastrdata->next = lastrdata + 1;
!             lastrdata++;
!
!             lastrdata->data = (char *) &newitem->t_tid.ip_blkid;
!             lastrdata->len = sizeof(BlockIdData);
!             lastrdata->buffer = InvalidBuffer;
!         }
!
!
!         /* Log the new item, if it was inserted on the left page. If it was
!          * put on the right page, we don't need to explicitly WAL log it
!          * because it's included with all the other items on the right page.
!          */
!         lastrdata->next = lastrdata + 1;
!         lastrdata++;
          if (newitemonleft)
!         {
!             lastrdata->data = (char *) &newitemoff;
!             lastrdata->len = sizeof(OffsetNumber);
!             lastrdata->buffer = buf;
!             lastrdata->buffer_std = true;
!
!             lastrdata->next = lastrdata + 1;
!             lastrdata++;
!             lastrdata->data = (char *)newitem;
!             lastrdata->len = newitemsz;
!             lastrdata->buffer = buf;
!             lastrdata->buffer_std = true;
!         }
          else
!         {
!             lastrdata->data = NULL;
!             lastrdata->len = 0;
!             lastrdata->buffer = buf;
!             lastrdata->buffer_std = true;
!         }

!         /* Log the contents of the right page in the format understood by
!          * _bt_restore_page(). We don't set the buffer-field, because we're
!          * going to recreate the whole page anyway.
!          *
           * Direct access to page is not good but faster - we should implement
           * some new func in page API.  Note we only store the tuples
           * themselves, knowing that the item pointers are in the same order
           * and can be reconstructed by scanning the tuples.  See comments for
           * _bt_restore_page().
           */
!         lastrdata->next = lastrdata + 1;
!         lastrdata++;

!         lastrdata->data = (char *) rightpage + ((PageHeader) rightpage)->pd_upper;
!         lastrdata->len = ((PageHeader) rightpage)->pd_special -
              ((PageHeader) rightpage)->pd_upper;
!         lastrdata->buffer = InvalidBuffer;

+         /* Log the right sibling, because we've changed its prev-pointer. */
          if (!P_RIGHTMOST(ropaque))
          {
!             lastrdata->next = lastrdata + 1;
!             lastrdata++;
!
!             lastrdata->data = NULL;
!             lastrdata->len = 0;
!             lastrdata->buffer = sbuf;
!             lastrdata->buffer_std = true;
          }

+         lastrdata->next = NULL;
+
          if (P_ISROOT(oopaque))
              xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L_ROOT : XLOG_BTREE_SPLIT_R_ROOT;
          else
Index: src/backend/access/nbtree/nbtxlog.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/access/nbtree/nbtxlog.c,v
retrieving revision 1.39
diff -c -r1.39 nbtxlog.c
*** src/backend/access/nbtree/nbtxlog.c    1 Nov 2006 19:43:17 -0000    1.39
--- src/backend/access/nbtree/nbtxlog.c    4 Dec 2006 14:13:39 -0000
***************
*** 264,385 ****
  {
      xl_btree_split *xlrec = (xl_btree_split *) XLogRecGetData(record);
      Relation    reln;
-     BlockNumber targetblk;
-     OffsetNumber targetoff;
-     BlockNumber leftsib;
-     BlockNumber rightsib;
      BlockNumber downlink = 0;
!     Buffer        buffer;
!     Page        page;
!     BTPageOpaque pageop;

!     reln = XLogOpenRelation(xlrec->target.node);
!     targetblk = ItemPointerGetBlockNumber(&(xlrec->target.tid));
!     targetoff = ItemPointerGetOffsetNumber(&(xlrec->target.tid));
!     leftsib = (onleft) ? targetblk : xlrec->otherblk;
!     rightsib = (onleft) ? xlrec->otherblk : targetblk;

!     /* Left (original) sibling */
!     buffer = XLogReadBuffer(reln, leftsib, true);
!     Assert(BufferIsValid(buffer));
!     page = (Page) BufferGetPage(buffer);

!     _bt_pageinit(page, BufferGetPageSize(buffer));
!     pageop = (BTPageOpaque) PageGetSpecialPointer(page);

-     pageop->btpo_prev = xlrec->leftblk;
-     pageop->btpo_next = rightsib;
-     pageop->btpo.level = xlrec->level;
-     pageop->btpo_flags = (xlrec->level == 0) ? BTP_LEAF : 0;
-     pageop->btpo_cycleid = 0;

!     _bt_restore_page(page,
!                      (char *) xlrec + SizeOfBtreeSplit,
!                      xlrec->leftlen);

!     if (onleft && xlrec->level > 0)
      {
!         IndexTuple    itup;

!         /* extract downlink in the target tuple */
!         itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, targetoff));
!         downlink = ItemPointerGetBlockNumber(&(itup->t_tid));
!         Assert(ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY);
      }

!     PageSetLSN(page, lsn);
!     PageSetTLI(page, ThisTimeLineID);
!     MarkBufferDirty(buffer);
!     UnlockReleaseBuffer(buffer);

!     /* Right (new) sibling */
!     buffer = XLogReadBuffer(reln, rightsib, true);
!     Assert(BufferIsValid(buffer));
!     page = (Page) BufferGetPage(buffer);

!     _bt_pageinit(page, BufferGetPageSize(buffer));
!     pageop = (BTPageOpaque) PageGetSpecialPointer(page);

-     pageop->btpo_prev = leftsib;
-     pageop->btpo_next = xlrec->rightblk;
-     pageop->btpo.level = xlrec->level;
-     pageop->btpo_flags = (xlrec->level == 0) ? BTP_LEAF : 0;
-     pageop->btpo_cycleid = 0;

!     _bt_restore_page(page,
!                      (char *) xlrec + SizeOfBtreeSplit + xlrec->leftlen,
!                      record->xl_len - SizeOfBtreeSplit - xlrec->leftlen);

!     if (!onleft && xlrec->level > 0)
      {
!         IndexTuple    itup;

-         /* extract downlink in the target tuple */
-         itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, targetoff));
-         downlink = ItemPointerGetBlockNumber(&(itup->t_tid));
-         Assert(ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY);
      }

!     PageSetLSN(page, lsn);
!     PageSetTLI(page, ThisTimeLineID);
!     MarkBufferDirty(buffer);
!     UnlockReleaseBuffer(buffer);

      /* Fix left-link of right (next) page */
!     if (!(record->xl_info & XLR_BKP_BLOCK_1))
      {
!         if (xlrec->rightblk != P_NONE)
          {
!             buffer = XLogReadBuffer(reln, xlrec->rightblk, false);
              if (BufferIsValid(buffer))
              {
!                 page = (Page) BufferGetPage(buffer);

!                 if (XLByteLE(lsn, PageGetLSN(page)))
!                 {
!                     UnlockReleaseBuffer(buffer);
!                 }
!                 else
                  {
!                     pageop = (BTPageOpaque) PageGetSpecialPointer(page);
!                     pageop->btpo_prev = rightsib;

                      PageSetLSN(page, lsn);
                      PageSetTLI(page, ThisTimeLineID);
                      MarkBufferDirty(buffer);
-                     UnlockReleaseBuffer(buffer);
                  }
              }
          }
      }

      /* Forget any split this insertion completes */
      if (xlrec->level > 0)
!         forget_matching_split(xlrec->target.node, downlink, false);

      /* The job ain't done till the parent link is inserted... */
!     log_incomplete_split(xlrec->target.node,
!                          leftsib, rightsib, isroot);
  }

  static void
--- 264,429 ----
  {
      xl_btree_split *xlrec = (xl_btree_split *) XLogRecGetData(record);
      Relation    reln;
      BlockNumber downlink = 0;
!     Buffer        lbuf, rbuf;
!     Page        lpage, rpage;
!     BTPageOpaque ropaque, lopaque;
!     char       *datapos;
!     int            datalen;
!     bool        bkp_left = record->xl_info & XLR_BKP_BLOCK_1;
!     bool        bkp_nextsib = record->xl_info & XLR_BKP_BLOCK_3;
!     OffsetNumber newitemoff;
!     IndexTuple newitem = NULL;
!     Size newitemsz = 0;

!     reln = XLogOpenRelation(xlrec->node);

!     datapos = (char *) xlrec + SizeOfBtreeSplit;
!     datalen = record->xl_len - SizeOfBtreeSplit;

!     /* Extract downlink */
!     if (xlrec->level > 0)
!     {
!         downlink = BlockIdGetBlockNumber((BlockId) datapos);
!
!         datapos += sizeof(BlockIdData);
!         datalen -= sizeof(BlockIdData);
!     }


!     /* Extract newitem and newitemoff */

!     if (!bkp_left && onleft)
      {
!         /* Extract the offset of the new tuple and its contents */
!         memcpy(&newitemoff, datapos, sizeof(OffsetNumber));
!         datapos += sizeof(OffsetNumber);
!         datalen -= sizeof(OffsetNumber);

!         newitem = (IndexTuple) datapos;
!         newitemsz = IndexTupleDSize(*newitem); /* XXX: alignment? */
!         datapos += newitemsz;
!         datalen -= newitemsz;
      }

!     /* Reconstruct right (new) sibling */
!     rbuf = XLogReadBuffer(reln, xlrec->rightsib, true);
!     Assert(BufferIsValid(rbuf));
!     rpage = (Page) BufferGetPage(rbuf);

!     _bt_pageinit(rpage, BufferGetPageSize(rbuf));
!     ropaque = (BTPageOpaque) PageGetSpecialPointer(rpage);

!     ropaque->btpo_prev = xlrec->leftsib;
!     ropaque->btpo_next = xlrec->rnext;
!     ropaque->btpo.level = xlrec->level;
!     ropaque->btpo_flags = (xlrec->level == 0) ? BTP_LEAF : 0;
!     ropaque->btpo_cycleid = 0;
!
!     _bt_restore_page(rpage, datapos, datalen);
!
!     PageSetLSN(rpage, lsn);
!     PageSetTLI(rpage, ThisTimeLineID);
!     MarkBufferDirty(rbuf);
!
!     /* don't release the buffer yet, because reconstructing the left sibling
!      * needs to access the data on the right page
!      */


!     /* Reconstruct left (original) sibling */

!     if(!bkp_left)
      {
!         lbuf = XLogReadBuffer(reln, xlrec->leftsib, false);
!
!         if (BufferIsValid(lbuf))
!         {
!             lpage = (Page) BufferGetPage(lbuf);
!             lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage);
!
!             if (!XLByteLE(lsn, PageGetLSN(lpage)))
!             {
!                 /* Remove the items from the left page that were copied to
!                  * right page, and add the new item if it was inserted to
!                  * left page.
!                  */
!                 OffsetNumber off;
!                 OffsetNumber maxoff = PageGetMaxOffsetNumber(lpage);
!
!                 for(off = maxoff ; off >= xlrec->firstright; off--)
!                     PageIndexTupleDelete(lpage, off);
!
!                 if (onleft)
!                 {
!                     if (PageAddItem(lpage, (Item) newitem, newitemsz,
!                                     newitemoff, LP_USED) == InvalidOffsetNumber)
!                         elog(PANIC, "can't add new item to left sibling after split");
!                 }
!                 PageSetLSN(lpage, lsn);
!                 PageSetTLI(lpage, ThisTimeLineID);
!                 MarkBufferDirty(lbuf);
!
!
!                 /* Set high key equal to the first key on the next page */
!                 {
!                     ItemId hiItemId = PageGetItemId(rpage, FirstOffsetNumber);
!                     Item hiItem = PageGetItem(rpage, hiItemId);
!
!                     if(!P_RIGHTMOST(lopaque))
!                     {
!                         /* But remove the old high key first */
!                         PageIndexTupleDelete(lpage, P_HIKEY);
!                     }
!
!                     if(PageAddItem(lpage, hiItem, ItemIdGetLength(hiItemId),
!                                    P_HIKEY, LP_USED) == InvalidOffsetNumber)
!                         elog(PANIC, "can't add high key after split to left page");
!                 }
!
!                 lopaque->btpo_flags = (xlrec->level == 0) ? BTP_LEAF : 0;
!                 lopaque->btpo_next = xlrec->rightsib;
!                 lopaque->btpo_cycleid = 0;
!             }
!
!             UnlockReleaseBuffer(lbuf);
!         }

      }

!     UnlockReleaseBuffer(rbuf);

      /* Fix left-link of right (next) page */
!     if (!bkp_nextsib)
      {
!         if (xlrec->rnext != P_NONE)
          {
!             Buffer buffer = XLogReadBuffer(reln, xlrec->rnext, false);
              if (BufferIsValid(buffer))
              {
!                 Page page = (Page) BufferGetPage(buffer);

!                 if (!XLByteLE(lsn, PageGetLSN(page)))
                  {
!                     BTPageOpaque pageop = (BTPageOpaque) PageGetSpecialPointer(page);
!                     pageop->btpo_prev = xlrec->rightsib;

                      PageSetLSN(page, lsn);
                      PageSetTLI(page, ThisTimeLineID);
                      MarkBufferDirty(buffer);
                  }
+                 UnlockReleaseBuffer(buffer);
              }
          }
      }

      /* Forget any split this insertion completes */
      if (xlrec->level > 0)
!         forget_matching_split(xlrec->node, downlink, false);

      /* The job ain't done till the parent link is inserted... */
!     log_incomplete_split(xlrec->node,
!                          xlrec->leftsib, xlrec->rightsib, isroot);
  }

  static void
***************
*** 723,728 ****
--- 767,773 ----
                  out_target(buf, &(xlrec->target));
                  break;
              }
+ #ifdef NOT_USED
          case XLOG_BTREE_SPLIT_L:
              {
                  xl_btree_split *xlrec = (xl_btree_split *) rec;
***************
*** 763,768 ****
--- 808,814 ----
                                   xlrec->otherblk, xlrec->rightblk);
                  break;
              }
+ #endif
          case XLOG_BTREE_DELETE:
              {
                  xl_btree_delete *xlrec = (xl_btree_delete *) rec;
Index: src/include/access/nbtree.h
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/access/nbtree.h,v
retrieving revision 1.106
diff -c -r1.106 nbtree.h
*** src/include/access/nbtree.h    1 Nov 2006 19:43:17 -0000    1.106
--- src/include/access/nbtree.h    4 Dec 2006 12:16:31 -0000
***************
*** 254,260 ****
   *
   * Note: the four XLOG_BTREE_SPLIT xl_info codes all use this data record.
   * The _L and _R variants indicate whether the inserted tuple went into the
!  * left or right split page (and thus, whether otherblk is the right or left
   * page of the split pair).  The _ROOT variants indicate that we are splitting
   * the root page, and thus that a newroot record rather than an insert or
   * split record should follow.    Note that a split record never carries a
--- 254,261 ----
   *
   * Note: the four XLOG_BTREE_SPLIT xl_info codes all use this data record.
   * The _L and _R variants indicate whether the inserted tuple went into the
!  * left or right split page (and thus, whether newitemoff and the new item
!  * are stored or not: they are logged only when the item went into the left
   * page of the split pair).  The _ROOT variants indicate that we are splitting
   * the root page, and thus that a newroot record rather than an insert or
   * split record should follow.    Note that a split record never carries a
***************
*** 262,278 ****
   */
  typedef struct xl_btree_split
  {
!     xl_btreetid target;            /* inserted tuple id */
!     BlockNumber otherblk;        /* second block participated in split: */
!     /* first one is stored in target' tid */
!     BlockNumber leftblk;        /* prev/left block */
!     BlockNumber rightblk;        /* next/right block */
!     uint32        level;            /* tree level of page being split */
!     uint16        leftlen;        /* len of left page items below */
!     /* LEFT AND RIGHT PAGES TUPLES FOLLOW AT THE END */
  } xl_btree_split;

! #define SizeOfBtreeSplit    (offsetof(xl_btree_split, leftlen) + sizeof(uint16))

  /*
   * This is what we need to know about delete of individual leaf index tuples.
--- 263,283 ----
   */
  typedef struct xl_btree_split
  {
!     RelFileNode node;
!     BlockNumber leftsib;     /* orig page / new left page */
!     BlockNumber rightsib;     /* new right page */
!     OffsetNumber firstright; /* first item stored on right page */
!     BlockNumber rnext;         /* next/right block pointer */
!     uint32        level;         /* tree level of page being split */
!
!     /* BlockIdData downlink follows if level > 0 */
!
!     /* OffsetNumber newitemoff follows in the  _L variants. */
!     /* New item follows in the _L variants */
!     /* RIGHT PAGES TUPLES FOLLOW AT THE END */
  } xl_btree_split;

! #define SizeOfBtreeSplit    (offsetof(xl_btree_split, level) + sizeof(uint32))

  /*
   * This is what we need to know about delete of individual leaf index tuples.

Attachment

pgsql-patches by date:

Previous
From: Michael Meskes
Date:
Subject: Re: ECPG docs
Next
From: Teodor Sigaev
Date:
Subject: Re: Bundle of patches