BTree vacuum before page splitting - Mailing list pgsql-patches

From Junji TERAMOTO
Subject BTree vacuum before page splitting
Date
Msg-id 43D9F82D.6010004@lab.ntt.co.jp
Whole thread Raw
Responses Re: BTree vacuum before page splitting  (Tom Lane <tgl@sss.pgh.pa.us>)
List pgsql-patches
Hi all,

This patch adds a function to remove unnecessary items before split
page of BTree.

When a new item is put in the page, it looks for the "LP_DELETE" item,
and removes that item. No heap access is required. Moreover, the penalty
is also few because it operates instead of page split.

Only when the super-exclusive lock was able to obtain, it removes
items because the deletion of the item might disturb the simultaneous
execution of the index scanning.
It is an optimistic control.

Because own scan might be lost even if the super-exclusive lock can
obtain, we also modify _bt_restscan().

The growth of the index frequently updated by using this patch can be
suppressed.

Your comments are welcome. Thanks.

----
For example(pgbench):
transaction type: TPC-B (sort of)
scaling factor: 100
number of clients: 50
number of transactions per client: 5000
number of transactions actually processed: 250000/250000

[before]
Relation Name         File Name Tuples   Pages File Size
public.accounts_pkey  16568     10001200 21899 175192
public.branches_pkey  16564          100   410   3280
public.tellers_pkey   16566         1000   374   2992

[after]
Relation Name         File Name Tuples   Pages File Size
public.accounts_pkey  16688     10004100 21899 175192
public.branches_pkey  16684          100    44    352 <==
public.tellers_pkey   16686         1000    26    208 <==
----

We also test this patch on DBT-2.

--
Junji Teramoto / teramoto.junji (a) lab.ntt.co.jp
NTT Cyber Space Lab.
diff -cpr pgsql-orig/src/backend/access/nbtree/nbtinsert.c pgsql/src/backend/access/nbtree/nbtinsert.c
*** pgsql-orig/src/backend/access/nbtree/nbtinsert.c    2006-01-24 21:39:25.000000000 +0900
--- pgsql/src/backend/access/nbtree/nbtinsert.c    2006-01-25 05:07:56.000000000 +0900
*************** static void _bt_pgaddtup(Relation rel, P
*** 62,67 ****
--- 62,68 ----
               OffsetNumber itup_off, const char *where);
  static bool _bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum,
              int keysz, ScanKey scankey);
+ static int    _bt_vacuum_page(Relation rel, Buffer buf);


  /*
*************** _bt_check_unique(Relation rel, BTItem bt
*** 260,265 ****
--- 261,267 ----
                                                   hbuffer) == HEAPTUPLE_DEAD)
                      {
                          curitemid->lp_flags |= LP_DELETE;
+                         opaque->btpo_flags |= BTP_HAS_DEAD;
                          if (nbuf != InvalidBuffer)
                              SetBufferCommitInfoNeedsSave(nbuf);
                          else
*************** _bt_insertonpg(Relation rel,
*** 424,433 ****
           */
          bool        movedright = false;

!         while (PageGetFreeSpace(page) < itemsz &&
!                !P_RIGHTMOST(lpageop) &&
!                _bt_compare(rel, keysz, scankey, page, P_HIKEY) == 0 &&
!                random() > (MAX_RANDOM_VALUE / 100))
          {
              /*
               * step right to next non-dead page
--- 426,432 ----
           */
          bool        movedright = false;

!         while (PageGetFreeSpace(page) < itemsz)
          {
              /*
               * step right to next non-dead page
*************** _bt_insertonpg(Relation rel,
*** 440,445 ****
--- 439,457 ----
               */
              Buffer        rbuf = InvalidBuffer;

+             if (P_ISLEAF(lpageop) && P_HAS_DEAD(lpageop) &&
+                 !rel->rd_istemp && BufferSuperLockHeldByMe(buf))
+             {
+                 _bt_vacuum_page(rel, buf);
+                 if (PageGetFreeSpace(page) >= itemsz)
+                     break;
+             }
+
+             if (P_RIGHTMOST(lpageop) ||
+                 _bt_compare(rel, keysz, scankey, page, P_HIKEY) != 0 ||
+                 random() <= (MAX_RANDOM_VALUE / 100))
+                 break;
+
              for (;;)
              {
                  BlockNumber rblkno = lpageop->btpo_next;
*************** _bt_isequal(TupleDesc itupdesc, Page pag
*** 1590,1592 ****
--- 1602,1651 ----
      /* if we get here, the keys are equal */
      return true;
  }
+
+ /*
+  * _bt_vacuum_page
+  *
+  * @param buf    Must have super-exclusive-lock.
+  */
+ static int
+ _bt_vacuum_page(Relation rel, Buffer buf)
+ {
+     OffsetNumber    deletable[MaxOffsetNumber];
+     BTPageOpaque    opaque;
+     OffsetNumber    offnum, minoff, maxoff;
+     int                ndeletable;
+     Page            page = BufferGetPage(buf);
+
+     opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+     if (!P_ISLEAF(opaque) || !P_HAS_DEAD(opaque))
+         return -1;
+
+     minoff = P_FIRSTDATAKEY(opaque);
+     maxoff = PageGetMaxOffsetNumber(page);
+     if (minoff > maxoff)
+         return -1;
+
+     ndeletable = 0;
+
+     for (offnum = minoff;
+          offnum <= maxoff;
+          offnum = OffsetNumberNext(offnum))
+     {
+         ItemId itemId = PageGetItemId(page, offnum);
+         if (!ItemIdIsUsed(itemId) || ItemIdDeleted(itemId))
+         {
+             deletable[ndeletable++] = offnum;
+         }
+     }
+
+     if (ndeletable > 0)
+     {
+         _bt_delitems(rel, buf, deletable, ndeletable);
+     }
+
+     /* clear the dead tuple hint bit */
+     opaque->btpo_flags &= ~BTP_HAS_DEAD;
+
+     return ndeletable;
+ }
diff -cpr pgsql-orig/src/backend/access/nbtree/nbtree.c pgsql/src/backend/access/nbtree/nbtree.c
*** pgsql-orig/src/backend/access/nbtree/nbtree.c    2006-01-24 21:39:25.000000000 +0900
--- pgsql/src/backend/access/nbtree/nbtree.c    2006-01-25 05:30:06.000000000 +0900
*************** btgettuple(PG_FUNCTION_ARGS)
*** 271,276 ****
--- 271,277 ----
              offnum = ItemPointerGetOffsetNumber(&(scan->currentItemData));
              page = BufferGetPage(so->btso_curbuf);
              PageGetItemId(page, offnum)->lp_flags |= LP_DELETE;
+             ((BTPageOpaque) PageGetSpecialPointer(page))->btpo_flags |= BTP_HAS_DEAD;

              /*
               * Since this can be redone later if needed, it's treated the same
*************** btbulkdelete(PG_FUNCTION_ARGS)
*** 632,637 ****
--- 633,641 ----
                  }
              }

+             /* clear the dead tuple hint bit */
+             opaque->btpo_flags &= ~BTP_HAS_DEAD;
+
              /*
               * If we need to delete anything, do it and write the buffer; else
               * just release the buffer.
*************** _bt_restscan(IndexScanDesc scan)
*** 898,903 ****
--- 902,929 ----
          return;
      }

+     /* Check for item on this page */
+     if (offnum <= maxoff)
+     {
+         for (;
+              offnum <= maxoff;
+              offnum = OffsetNumberNext(offnum))
+         {
+             item = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
+             if (BTTidSame(item->bti_itup.t_tid, *target))
+             {
+                 /* Found it */
+                 current->ip_posid = offnum;
+                 return;
+             }
+         }
+
+         /* Move offsetnumber to first, because we may delete LP_DELETE tuple */
+         maxoff = ItemPointerGetOffsetNumber(current) - 1;
+     }
+
+     offnum = P_FIRSTDATAKEY(opaque);
+
      /*
       * The item we were on may have moved right due to insertions. Find it
       * again.  We use the heap TID to identify the item uniquely.
diff -cpr pgsql-orig/src/backend/storage/buffer/bufmgr.c pgsql/src/backend/storage/buffer/bufmgr.c
*** pgsql-orig/src/backend/storage/buffer/bufmgr.c    2006-01-24 21:39:25.000000000 +0900
--- pgsql/src/backend/storage/buffer/bufmgr.c    2006-01-25 05:19:50.000000000 +0900
*************** LockBufferForCleanup(Buffer buffer)
*** 1880,1885 ****
--- 1880,1904 ----
  }

  /*
+  * BufferSuperLockHeldByMe
+  *
+  * @param buffer    Must have BUFFER_LOCK_EXCLUSIVE.
+  */
+ bool
+ BufferSuperLockHeldByMe(Buffer buffer)
+ {
+     volatile BufferDesc *bufHdr;
+     unsigned    refcount;
+
+     bufHdr = &BufferDescriptors[buffer - 1];
+     LockBufHdr(bufHdr);
+     refcount = bufHdr->refcount;
+     UnlockBufHdr(bufHdr);
+
+     return refcount == 1;
+ }
+
+ /*
   *    Functions for buffer I/O handling
   *
   *    Note: We assume that nested buffer I/O never occurs.
diff -cpr pgsql-orig/src/include/access/nbtree.h pgsql/src/include/access/nbtree.h
*** pgsql-orig/src/include/access/nbtree.h    2006-01-24 21:39:25.000000000 +0900
--- pgsql/src/include/access/nbtree.h    2006-01-25 04:58:05.000000000 +0900
*************** typedef BTPageOpaqueData *BTPageOpaque;
*** 55,60 ****
--- 55,61 ----
  #define BTP_DELETED        (1 << 2)    /* page has been deleted from tree */
  #define BTP_META        (1 << 3)    /* meta-page */
  #define BTP_HALF_DEAD    (1 << 4)    /* empty, but still in tree */
+ #define BTP_HAS_DEAD    (1 << 5)    /* page has dead tuples */


  /*
*************** typedef BTItemData *BTItem;
*** 155,160 ****
--- 156,162 ----
  #define P_ISROOT(opaque)        ((opaque)->btpo_flags & BTP_ROOT)
  #define P_ISDELETED(opaque)        ((opaque)->btpo_flags & BTP_DELETED)
  #define P_IGNORE(opaque)        ((opaque)->btpo_flags & (BTP_DELETED|BTP_HALF_DEAD))
+ #define P_HAS_DEAD(opaque)        ((opaque)->btpo_flags & BTP_HAS_DEAD)

  /*
   *    Lehman and Yao's algorithm requires a ``high key'' on every non-rightmost
diff -cpr pgsql-orig/src/include/storage/bufmgr.h pgsql/src/include/storage/bufmgr.h
*** pgsql-orig/src/include/storage/bufmgr.h    2006-01-24 21:39:25.000000000 +0900
--- pgsql/src/include/storage/bufmgr.h    2006-01-25 05:06:04.000000000 +0900
*************** extern void UnlockBuffers(void);
*** 149,154 ****
--- 149,155 ----
  extern void LockBuffer(Buffer buffer, int mode);
  extern bool ConditionalLockBuffer(Buffer buffer);
  extern void LockBufferForCleanup(Buffer buffer);
+ extern bool BufferSuperLockHeldByMe(Buffer buffer);

  extern void AbortBufferIO(void);


pgsql-patches by date:

Previous
From: Bruce Momjian
Date:
Subject: Re: pg_restore COPY error handling
Next
From: NAKANO Yoshihisa
Date:
Subject: Patch for ALTER TABLE / TYPE