Split _bt_insertonpg to two functions - Mailing list pgsql-patches

From Heikki Linnakangas
Subject Split _bt_insertonpg to two functions
Date
Msg-id 45E2B2C0.8020103@enterprisedb.com
Whole thread Raw
Responses Re: Split _bt_insertonpg to two functions  (Bruce Momjian <bruce@momjian.us>)
Re: Split _bt_insertonpg to two functions  (Bruce Momjian <bruce@momjian.us>)
List pgsql-patches
Here's a patch that:

Moves the logic to find a page with enough room from _bt_insertonpg to a
new function, _bt_findinsertloc. It makes the code more readable, and
simplifies the forthcoming Grouped Index Tuples patch.

Also, the insert location within the page used to be calculated twice for
unique indexes: once in _bt_check_unique and a second time in
_bt_insertonpg. That's a waste of cycles, and this patch fixes that.


I couldn't measure a difference with pgbench, but this micro-benchmark
shows it:

 > psql postgres -c "CREATE TABLE inserttest (i int PRIMARY KEY);"
 > psql postgres -c "TRUNCATE inserttest; checkpoint;"; sync
 > time ~/pgsql.cvshead/bin/psql postgres -c "TRUNCATE inserttest;
INSERT INTO inserttest SELECT a FROM generate_series(1,1000000) a;"

Without patch:    real    0m7.260s
With patch:    real    0m6.963s

On my laptop, fsync=off, full_page_writes=off, checkpoint_segments = 10,
to remove any other variables.

It's not a huge difference, but it's worth having, and performance
wasn't the main motivation of the patch anyway.

--
   Heikki Linnakangas
   EnterpriseDB   http://www.enterprisedb.com
Index: src/backend/access/nbtree/nbtinsert.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/access/nbtree/nbtinsert.c,v
retrieving revision 1.152
diff -c -r1.152 nbtinsert.c
*** src/backend/access/nbtree/nbtinsert.c    21 Feb 2007 20:02:17 -0000    1.152
--- src/backend/access/nbtree/nbtinsert.c    26 Feb 2007 09:37:16 -0000
***************
*** 46,58 ****
  static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf);

  static TransactionId _bt_check_unique(Relation rel, IndexTuple itup,
!                  Relation heapRel, Buffer buf,
                   ScanKey itup_scankey);
  static void _bt_insertonpg(Relation rel, Buffer buf,
                 BTStack stack,
-                int keysz, ScanKey scankey,
                 IndexTuple itup,
!                OffsetNumber afteritem,
                 bool split_only_page);
  static Buffer _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
            OffsetNumber newitemoff, Size newitemsz,
--- 46,63 ----
  static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf);

  static TransactionId _bt_check_unique(Relation rel, IndexTuple itup,
!                  Relation heapRel, Buffer buf, OffsetNumber ioffset,
                   ScanKey itup_scankey);
+ static void _bt_findinsertloc(Relation rel,
+                   Buffer *bufptr,
+                   OffsetNumber *offsetptr,
+                   int keysz,
+                   ScanKey scankey,
+                   IndexTuple newtup);
  static void _bt_insertonpg(Relation rel, Buffer buf,
                 BTStack stack,
                 IndexTuple itup,
!                OffsetNumber newitemoff,
                 bool split_only_page);
  static Buffer _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
            OffsetNumber newitemoff, Size newitemsz,
***************
*** 86,91 ****
--- 91,97 ----
      ScanKey        itup_scankey;
      BTStack        stack;
      Buffer        buf;
+     OffsetNumber offset;

      /* we need an insertion scan key to do our search, so build one */
      itup_scankey = _bt_mkscankey(rel, itup);
***************
*** 94,99 ****
--- 100,107 ----
      /* find the first page containing this key */
      stack = _bt_search(rel, natts, itup_scankey, false, &buf, BT_WRITE);

+     offset = InvalidOffsetNumber;
+
      /* trade in our read lock for a write lock */
      LockBuffer(buf, BUFFER_LOCK_UNLOCK);
      LockBuffer(buf, BT_WRITE);
***************
*** 128,134 ****
      {
          TransactionId xwait;

!         xwait = _bt_check_unique(rel, itup, heapRel, buf, itup_scankey);

          if (TransactionIdIsValid(xwait))
          {
--- 136,143 ----
      {
          TransactionId xwait;

!         offset = _bt_binsrch(rel, buf, natts, itup_scankey, false);
!         xwait = _bt_check_unique(rel, itup, heapRel, buf, offset, itup_scankey);

          if (TransactionIdIsValid(xwait))
          {
***************
*** 142,148 ****
      }

      /* do the insertion */
!     _bt_insertonpg(rel, buf, stack, natts, itup_scankey, itup, 0, false);

      /* be tidy */
      _bt_freestack(stack);
--- 151,158 ----
      }

      /* do the insertion */
!     _bt_findinsertloc(rel, &buf, &offset, natts, itup_scankey, itup);
!     _bt_insertonpg(rel, buf, stack, itup, offset, false);

      /* be tidy */
      _bt_freestack(stack);
***************
*** 152,169 ****
  /*
   *    _bt_check_unique() -- Check for violation of unique index constraint
   *
   * Returns InvalidTransactionId if there is no conflict, else an xact ID
   * we must wait for to see if it commits a conflicting tuple.    If an actual
   * conflict is detected, no return --- just ereport().
   */
  static TransactionId
  _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
!                  Buffer buf, ScanKey itup_scankey)
  {
      TupleDesc    itupdesc = RelationGetDescr(rel);
      int            natts = rel->rd_rel->relnatts;
!     OffsetNumber offset,
!                 maxoff;
      Page        page;
      BTPageOpaque opaque;
      Buffer        nbuf = InvalidBuffer;
--- 162,182 ----
  /*
   *    _bt_check_unique() -- Check for violation of unique index constraint
   *
+  * offset points to the first possible item that could conflict. It can
+  * also point to end-of-page, which means that the first tuple to check
+  * is the first tuple on the next page.
+  *
   * Returns InvalidTransactionId if there is no conflict, else an xact ID
   * we must wait for to see if it commits a conflicting tuple.    If an actual
   * conflict is detected, no return --- just ereport().
   */
  static TransactionId
  _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
!                  Buffer buf, OffsetNumber offset, ScanKey itup_scankey)
  {
      TupleDesc    itupdesc = RelationGetDescr(rel);
      int            natts = rel->rd_rel->relnatts;
!     OffsetNumber maxoff;
      Page        page;
      BTPageOpaque opaque;
      Buffer        nbuf = InvalidBuffer;
***************
*** 173,184 ****
      maxoff = PageGetMaxOffsetNumber(page);

      /*
-      * Find first item >= proposed new item.  Note we could also get a pointer
-      * to end-of-page here.
-      */
-     offset = _bt_binsrch(rel, buf, natts, itup_scankey, false);
-
-     /*
       * Scan over all equal tuples, looking for live conflicts.
       */
      for (;;)
--- 186,191 ----
***************
*** 342,374 ****
      return InvalidTransactionId;
  }

! /*----------
!  *    _bt_insertonpg() -- Insert a tuple on a particular page in the index.
!  *
!  *        This recursive procedure does the following things:
!  *
!  *            +  finds the right place to insert the tuple.
!  *            +  if necessary, splits the target page (making sure that the
!  *               split is equitable as far as post-insert free space goes).
!  *            +  inserts the tuple.
!  *            +  if the page was split, pops the parent stack, and finds the
!  *               right place to insert the new child pointer (by walking
!  *               right using information stored in the parent stack).
!  *            +  invokes itself with the appropriate tuple for the right
!  *               child page on the parent.
!  *            +  updates the metapage if a true root or fast root is split.
!  *
!  *        On entry, we must have the right buffer in which to do the
!  *        insertion, and the buffer must be pinned and write-locked.    On return,
!  *        we will have dropped both the pin and the lock on the buffer.
!  *
!  *        If 'afteritem' is >0 then the new tuple must be inserted after the
!  *        existing item of that number, noplace else.  If 'afteritem' is 0
!  *        then the procedure finds the exact spot to insert it by searching.
!  *        (keysz and scankey parameters are used ONLY if afteritem == 0.
!  *        The scankey must be an insertion-type scankey.)
   *
!  *        NOTE: if the new key is equal to one or more existing keys, we can
   *        legitimately place it anywhere in the series of equal keys --- in fact,
   *        if the new key is equal to the page's "high key" we can place it on
   *        the next page.    If it is equal to the high key, and there's not room
--- 349,359 ----
      return InvalidTransactionId;
  }

!
! /*
!  *    _bt_findinsertloc() -- Finds an insert location for a tuple
   *
!  *        If the new key is equal to one or more existing keys, we can
   *        legitimately place it anywhere in the series of equal keys --- in fact,
   *        if the new key is equal to the page's "high key" we can place it on
   *        the next page.    If it is equal to the high key, and there's not room
***************
*** 379,414 ****
   *        Once we have chosen the page to put the key on, we'll insert it before
   *        any existing equal keys because of the way _bt_binsrch() works.
   *
!  *        The locking interactions in this code are critical.  You should
!  *        grok Lehman and Yao's paper before making any changes.  In addition,
!  *        you need to understand how we disambiguate duplicate keys in this
!  *        implementation, in order to be able to find our location using
!  *        L&Y "move right" operations.  Since we may insert duplicate user
!  *        keys, and since these dups may propagate up the tree, we use the
!  *        'afteritem' parameter to position ourselves correctly for the
!  *        insertion on internal pages.
!  *----------
   */
  static void
! _bt_insertonpg(Relation rel,
!                Buffer buf,
!                BTStack stack,
!                int keysz,
!                ScanKey scankey,
!                IndexTuple itup,
!                OffsetNumber afteritem,
!                bool split_only_page)
  {
!     Page        page;
      BTPageOpaque lpageop;
      OffsetNumber newitemoff;
!     OffsetNumber firstright = InvalidOffsetNumber;
!     Size        itemsz;

-     page = BufferGetPage(buf);
      lpageop = (BTPageOpaque) PageGetSpecialPointer(page);

!     itemsz = IndexTupleDSize(*itup);
      itemsz = MAXALIGN(itemsz);    /* be safe, PageAddItem will do this but we
                                   * need to be consistent */

--- 364,403 ----
   *        Once we have chosen the page to put the key on, we'll insert it before
   *        any existing equal keys because of the way _bt_binsrch() works.
   *
!  *        If there's not enough room in the space, we try to make room by
!  *        removing any LP_DELETEd tuples.
!  *
!  *        On entry, *buf and *offsetptr point to the first legal position
!  *        where the new tuple could be inserted. The caller should hold an
!  *        exclusive lock on *buf. *offsetptr can also be set to
!  *        InvalidOffsetNumber, in which case the function will search the    right
!  *        location within the page if needed. On exit, they point to the chosen
!  *        insert location. If findinsertloc decided to move right, the lock and
!  *        pin on the original page will be released and the new page returned to
!  *        the caller is exclusively locked instead.
!  *
!  *        newtup is the new tuple we're inserting, and scankey is an insertion
!  *        type scan key for it.
   */
  static void
! _bt_findinsertloc(Relation rel,
!                   Buffer *bufptr,
!                   OffsetNumber *offsetptr,
!                   int keysz,
!                   ScanKey scankey,
!                   IndexTuple newtup)
  {
!     Buffer buf = *bufptr;
!     Page page = BufferGetPage(buf);
!     Size itemsz;
      BTPageOpaque lpageop;
+     bool movedright, vacuumed;
      OffsetNumber newitemoff;
!     OffsetNumber firstlegaloff = *offsetptr;

      lpageop = (BTPageOpaque) PageGetSpecialPointer(page);

!     itemsz = IndexTupleDSize(*newtup);
      itemsz = MAXALIGN(itemsz);    /* be safe, PageAddItem will do this but we
                                   * need to be consistent */

***************
*** 429,525 ****
                  "Consider a function index of an MD5 hash of the value, "
                  "or use full text indexing.")));

-     /*
-      * Determine exactly where new item will go.
-      */
-     if (afteritem > 0)
-         newitemoff = afteritem + 1;
-     else
-     {
-         /*----------
-          * If we will need to split the page to put the item here,
-          * check whether we can put the tuple somewhere to the right,
-          * instead.  Keep scanning right until we
-          *        (a) find a page with enough free space,
-          *        (b) reach the last page where the tuple can legally go, or
-          *        (c) get tired of searching.
-          * (c) is not flippant; it is important because if there are many
-          * pages' worth of equal keys, it's better to split one of the early
-          * pages than to scan all the way to the end of the run of equal keys
-          * on every insert.  We implement "get tired" as a random choice,
-          * since stopping after scanning a fixed number of pages wouldn't work
-          * well (we'd never reach the right-hand side of previously split
-          * pages).    Currently the probability of moving right is set at 0.99,
-          * which may seem too high to change the behavior much, but it does an
-          * excellent job of preventing O(N^2) behavior with many equal keys.
-          *----------
-          */
-         bool        movedright = false;
-
-         while (PageGetFreeSpace(page) < itemsz)
-         {
-             Buffer        rbuf;

-             /*
-              * before considering moving right, see if we can obtain enough
-              * space by erasing LP_DELETE items
-              */
-             if (P_ISLEAF(lpageop) && P_HAS_GARBAGE(lpageop))
-             {
-                 _bt_vacuum_one_page(rel, buf);
-                 if (PageGetFreeSpace(page) >= itemsz)
-                     break;        /* OK, now we have enough space */
-             }

!             /*
!              * nope, so check conditions (b) and (c) enumerated above
!              */
!             if (P_RIGHTMOST(lpageop) ||
!                 _bt_compare(rel, keysz, scankey, page, P_HIKEY) != 0 ||
!                 random() <= (MAX_RANDOM_VALUE / 100))
!                 break;

!             /*
!              * step right to next non-dead page
!              *
!              * must write-lock that page before releasing write lock on
!              * current page; else someone else's _bt_check_unique scan could
!              * fail to see our insertion.  write locks on intermediate dead
!              * pages won't do because we don't know when they will get
!              * de-linked from the tree.
!              */
!             rbuf = InvalidBuffer;

!             for (;;)
!             {
!                 BlockNumber rblkno = lpageop->btpo_next;

!                 rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE);
!                 page = BufferGetPage(rbuf);
!                 lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
!                 if (!P_IGNORE(lpageop))
!                     break;
!                 if (P_RIGHTMOST(lpageop))
!                     elog(ERROR, "fell off the end of \"%s\"",
!                          RelationGetRelationName(rel));
!             }
!             _bt_relbuf(rel, buf);
!             buf = rbuf;
!             movedright = true;
          }

          /*
!          * Now we are on the right page, so find the insert position. If we
!          * moved right at all, we know we should insert at the start of the
!          * page, else must find the position by searching.
           */
!         if (movedright)
!             newitemoff = P_FIRSTDATAKEY(lpageop);
!         else
!             newitemoff = _bt_binsrch(rel, buf, keysz, scankey, false);
      }

      /*
       * Do we need to split the page to fit the item on it?
       *
       * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result,
--- 418,573 ----
                  "Consider a function index of an MD5 hash of the value, "
                  "or use full text indexing.")));



!     /*----------
!      * If we will need to split the page to put the item on this page,
!      * check whether we can put the tuple somewhere to the right,
!      * instead.  Keep scanning right until we
!      *        (a) find a page with enough free space,
!      *        (b) reach the last page where the tuple can legally go, or
!      *        (c) get tired of searching.
!      * (c) is not flippant; it is important because if there are many
!      * pages' worth of equal keys, it's better to split one of the early
!      * pages than to scan all the way to the end of the run of equal keys
!      * on every insert.  We implement "get tired" as a random choice,
!      * since stopping after scanning a fixed number of pages wouldn't work
!      * well (we'd never reach the right-hand side of previously split
!      * pages).    Currently the probability of moving right is set at 0.99,
!      * which may seem too high to change the behavior much, but it does an
!      * excellent job of preventing O(N^2) behavior with many equal keys.
!      *----------
!      */
!     movedright = false;
!     vacuumed = false;
!     while (PageGetFreeSpace(page) < itemsz)
!     {
!         Buffer        rbuf;

!         /*
!          * before considering moving right, see if we can obtain enough
!          * space by erasing LP_DELETE items
!          */
!         if (P_ISLEAF(lpageop) && P_HAS_GARBAGE(lpageop))
!         {
!             _bt_vacuum_one_page(rel, buf);

!             /* remember that we vacuumed this page, because that makes
!              * the hint supplied by the caller invalid */
!             vacuumed = true;

!             if (PageGetFreeSpace(page) >= itemsz)
!                 break;        /* OK, now we have enough space */
          }

          /*
!          * nope, so check conditions (b) and (c) enumerated above
           */
!         if (P_RIGHTMOST(lpageop) ||
!             _bt_compare(rel, keysz, scankey, page, P_HIKEY) != 0 ||
!             random() <= (MAX_RANDOM_VALUE / 100))
!             break;
!
!         /*
!          * step right to next non-dead page
!          *
!          * must write-lock that page before releasing write lock on
!          * current page; else someone else's _bt_check_unique scan could
!          * fail to see our insertion.  write locks on intermediate dead
!          * pages won't do because we don't know when they will get
!          * de-linked from the tree.
!          */
!         rbuf = InvalidBuffer;
!
!         for (;;)
!         {
!             BlockNumber rblkno = lpageop->btpo_next;
!
!             rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE);
!             page = BufferGetPage(rbuf);
!             lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
!             if (!P_IGNORE(lpageop))
!                 break;
!             if (P_RIGHTMOST(lpageop))
!                 elog(ERROR, "fell off the end of \"%s\"",
!                      RelationGetRelationName(rel));
!         }
!         _bt_relbuf(rel, buf);
!         buf = rbuf;
!         movedright = true;
!         vacuumed = false;
      }

      /*
+      * Now we are on the right page, so find the insert position. If we
+      * moved right at all, we know we should insert at the start of the
+      * page. If we didn't move right, we can use the firstlegaloff hint
+      * if the caller supplied one, unless we vacuumed the page which
+      * might have moved tuples around making the hint invalid. If we
+      * didn't move right or can't use the hint, find the position
+      * by searching.
+      */
+     if (movedright)
+         newitemoff = P_FIRSTDATAKEY(lpageop);
+     else if(firstlegaloff != InvalidOffsetNumber && !vacuumed)
+         newitemoff = firstlegaloff;
+     else
+         newitemoff = _bt_binsrch(rel, buf, keysz, scankey, false);
+
+     *bufptr = buf;
+     *offsetptr = newitemoff;
+ }
+
+ /*----------
+  *    _bt_insertonpg() -- Insert a tuple on a particular page in the index.
+  *
+  *        This recursive procedure does the following things:
+  *
+  *            +  if necessary, splits the target page (making sure that the
+  *               split is equitable as far as post-insert free space goes).
+  *            +  inserts the tuple.
+  *            +  if the page was split, pops the parent stack, and finds the
+  *               right place to insert the new child pointer (by walking
+  *               right using information stored in the parent stack).
+  *            +  invokes itself with the appropriate tuple for the right
+  *               child page on the parent.
+  *            +  updates the metapage if a true root or fast root is split.
+  *
+  *        On entry, we must have the right buffer in which to do the
+  *        insertion, and the buffer must be pinned and write-locked.    On return,
+  *        we will have dropped both the pin and the lock on the buffer.
+  *
+  *        The locking interactions in this code are critical.  You should
+  *        grok Lehman and Yao's paper before making any changes.  In addition,
+  *        you need to understand how we disambiguate duplicate keys in this
+  *        implementation, in order to be able to find our location using
+  *        L&Y "move right" operations.  Since we may insert duplicate user
+  *        keys, and since these dups may propagate up the tree, we use the
+  *        'afteritem' parameter to position ourselves correctly for the
+  *        insertion on internal pages.
+  *----------
+  */
+ static void
+ _bt_insertonpg(Relation rel,
+                Buffer buf,
+                BTStack stack,
+                IndexTuple itup,
+                OffsetNumber newitemoff,
+                bool split_only_page)
+ {
+     Page        page;
+     BTPageOpaque lpageop;
+     OffsetNumber firstright = InvalidOffsetNumber;
+     Size        itemsz;
+
+     page = BufferGetPage(buf);
+     lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
+
+     itemsz = IndexTupleDSize(*itup);
+     itemsz = MAXALIGN(itemsz);    /* be safe, PageAddItem will do this but we
+                                  * need to be consistent */
+
+     /*
       * Do we need to split the page to fit the item on it?
       *
       * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result,
***************
*** 1427,1433 ****

          /* Recursively update the parent */
          _bt_insertonpg(rel, pbuf, stack->bts_parent,
!                        0, NULL, new_item, stack->bts_offset,
                         is_only);

          /* be tidy */
--- 1475,1481 ----

          /* Recursively update the parent */
          _bt_insertonpg(rel, pbuf, stack->bts_parent,
!                        new_item, stack->bts_offset + 1,
                         is_only);

          /* be tidy */

pgsql-patches by date:

Previous
From: Matteo Beccati
Date:
Subject: Re: Small patch to compile on IRIX 6.5 with gcc
Next
From: NikhilS
Date:
Subject: Re: Recalculating OldestXmin in a long-running vacuum