Thread: Merge join performance

From: Heikki Linnakangas

It seems that the page-at-a-time index scan patch applied in the spring
caused a slight performance regression in merge joins. The btree
mark/restore operations became much more expensive: btmarkpos now has to
copy the array of item pointers retrieved from the current index page.
That adds up, because merge join calls markpos for every tuple.

We can make markpos fast again by doing the copy lazily in _bt_steppage;
see the attached patch.
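
To illustrate the idea in miniature, here is a toy sketch (my own, with
invented names like ToyScan and Pos; the real patch does this for
BTScanOpaqueData, and additionally defers the buffer pin transfer to the
same place):

#include <stdio.h>
#include <string.h>

#define MAX_ITEMS 4

typedef struct
{
    int     items[MAX_ITEMS];   /* stand-in for the item pointer array */
    int     lastItem;
    int     itemIndex;          /* current position within items[] */
} Pos;

typedef struct
{
    Pos     currPos;
    Pos     markPos;
    int     markItemIndex;      /* -1 if no same-page mark is pending */
} ToyScan;

/* markpos becomes O(1): just remember the index */
static void
markpos(ToyScan *scan)
{
    scan->markItemIndex = scan->currPos.itemIndex;
}

/* the full copy happens at most once, when we actually leave the page */
static void
steppage(ToyScan *scan)
{
    if (scan->markItemIndex != -1)
    {
        scan->markPos = scan->currPos;
        scan->markPos.itemIndex = scan->markItemIndex;
        scan->markItemIndex = -1;
    }
    /* ... the real code reads the next page into currPos here ... */
    scan->currPos.itemIndex = 0;
}

static void
restrpos(ToyScan *scan)
{
    if (scan->markItemIndex != -1)
        scan->currPos.itemIndex = scan->markItemIndex;  /* same page */
    else
        scan->currPos = scan->markPos;  /* mark was on an earlier page */
}

int
main(void)
{
    ToyScan scan = {{{10, 20, 30, 40}, 3, 1}, {{0}, 0, 0}, -1};

    markpos(&scan);              /* no copy, no pin bump */
    scan.currPos.itemIndex = 3;  /* the scan moves on within the page */
    restrpos(&scan);             /* same page: just restore the index */
    printf("restored itemIndex: %d\n", scan.currPos.itemIndex);
    steppage(&scan);             /* a still-pending mark is copied here */
    return 0;
}

In the common merge-join pattern, markpos and restrpos fire many times
per page while steppage fires once, so the copy drops out of the inner
loop.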

I did some micro-benchmarking of merge join performance; see the
attached test. Test results, on my laptop:

8_1_STABLE: 1.77 s
HEAD, with patch: 1.65 s
HEAD, without patch: 2.46 s

The results are pretty stable, within 0.1 s.

--
  Heikki Linnakangas
  EnterpriseDB   http://www.enterprisedb.com

Index: src/backend/access/nbtree/nbtree.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/access/nbtree/nbtree.c,v
retrieving revision 1.149
diff -c -r1.149 nbtree.c
*** src/backend/access/nbtree/nbtree.c    10 May 2006 23:18:39 -0000    1.149
--- src/backend/access/nbtree/nbtree.c    23 Aug 2006 11:54:09 -0000
***************
*** 368,373 ****
--- 368,374 ----
      {
          so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
          so->currPos.buf = so->markPos.buf = InvalidBuffer;
+         so->markItemIndex = -1;
          if (scan->numberOfKeys > 0)
              so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
          else
***************
*** 392,397 ****
--- 393,399 ----
          ReleaseBuffer(so->markPos.buf);
          so->markPos.buf = InvalidBuffer;
      }
+     so->markItemIndex = -1;

      /*
       * Reset the scan keys. Note that keys ordering stuff moved to _bt_first.
***************
*** 430,435 ****
--- 432,438 ----
          ReleaseBuffer(so->markPos.buf);
          so->markPos.buf = InvalidBuffer;
      }
+     so->markItemIndex = -1;

      if (so->killedItems != NULL)
          pfree(so->killedItems);
***************
*** 456,469 ****
          so->markPos.buf = InvalidBuffer;
      }

!     /* bump pin on current buffer for assignment to mark buffer */
      if (BTScanPosIsValid(so->currPos))
      {
!         IncrBufferRefCount(so->currPos.buf);
!         memcpy(&so->markPos, &so->currPos,
!                offsetof(BTScanPosData, items[1]) +
!                so->currPos.lastItem * sizeof(BTScanPosItem));
!     }

      PG_RETURN_VOID();
  }
--- 459,472 ----
          so->markPos.buf = InvalidBuffer;
      }

!     /* Just record the current itemIndex. If we later step to the next
!      * page before releasing the marked position, _bt_steppage makes a
!      * full copy of the currPos struct. */
      if (BTScanPosIsValid(so->currPos))
      {
!         so->markItemIndex = so->currPos.itemIndex;
!     } else
!         so->markItemIndex = -1;

      PG_RETURN_VOID();
  }
***************
*** 477,500 ****
      IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
      BTScanOpaque so = (BTScanOpaque) scan->opaque;

!     /* we aren't holding any read locks, but gotta drop the pin */
!     if (BTScanPosIsValid(so->currPos))
      {
!         /* Before leaving current page, deal with any killed items */
!         if (so->numKilled > 0 &&
!             so->currPos.buf != so->markPos.buf)
!             _bt_killitems(scan, false);
!         ReleaseBuffer(so->currPos.buf);
!         so->currPos.buf = InvalidBuffer;
!     }
!
!     /* bump pin on marked buffer */
!     if (BTScanPosIsValid(so->markPos))
      {
!         IncrBufferRefCount(so->markPos.buf);
!         memcpy(&so->currPos, &so->markPos,
!                offsetof(BTScanPosData, items[1]) +
!                so->markPos.lastItem * sizeof(BTScanPosItem));
      }

      PG_RETURN_VOID();
--- 480,512 ----
      IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
      BTScanOpaque so = (BTScanOpaque) scan->opaque;

!     if (so->markItemIndex != -1)
      {
!         /* The marked position is on the same page as the current one.
!          * Just restore the itemIndex. */
!         so->currPos.itemIndex = so->markItemIndex;
!     }
!     else
      {
!         /* we aren't holding any read locks, but gotta drop the pin */
!         if (BTScanPosIsValid(so->currPos))
!         {
!             /* Before leaving current page, deal with any killed items */
!             if (so->numKilled > 0 &&
!                 so->currPos.buf != so->markPos.buf)
!                 _bt_killitems(scan, false);
!             ReleaseBuffer(so->currPos.buf);
!             so->currPos.buf = InvalidBuffer;
!         }
!
!         if (BTScanPosIsValid(so->markPos))
!         {
!             /* bump pin on marked buffer for assignment to current buffer */
!             IncrBufferRefCount(so->markPos.buf);
!             memcpy(&so->currPos, &so->markPos,
!                    offsetof(BTScanPosData, items[1]) +
!                    so->markPos.lastItem * sizeof(BTScanPosItem));
!         }
      }

      PG_RETURN_VOID();
Index: src/backend/access/nbtree/nbtsearch.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/access/nbtree/nbtsearch.c,v
retrieving revision 1.105
diff -c -r1.105 nbtsearch.c
*** src/backend/access/nbtree/nbtsearch.c    7 May 2006 01:21:30 -0000    1.105
--- src/backend/access/nbtree/nbtsearch.c    23 Aug 2006 11:11:45 -0000
***************
*** 1055,1060 ****
--- 1055,1074 ----

      rel = scan->indexRelation;

+     /* Before we modify currPos, make a copy of the items if a marked
+      * position still needs them. */
+     if (so->markItemIndex != -1)
+     {
+         /* bump pin on current buffer for assignment to marked buffer */
+         IncrBufferRefCount(so->currPos.buf);
+         memcpy(&so->markPos, &so->currPos,
+                offsetof(BTScanPosData, items[1]) +
+                so->currPos.lastItem * sizeof(BTScanPosItem));
+
+         so->markPos.itemIndex = so->markItemIndex;
+         so->markItemIndex = -1;
+     }
+
      if (ScanDirectionIsForward(dir))
      {
          /* Walk right to the next page with data */
Index: src/include/access/nbtree.h
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/access/nbtree.h,v
retrieving revision 1.103
diff -c -r1.103 nbtree.h
*** src/include/access/nbtree.h    7 Aug 2006 16:57:57 -0000    1.103
--- src/include/access/nbtree.h    23 Aug 2006 11:07:47 -0000
***************
*** 441,446 ****
--- 441,450 ----
      /* keep these last in struct for efficiency */
      BTScanPosData currPos;        /* current position data */
      BTScanPosData markPos;        /* marked position, if any */
+     int markItemIndex;            /* if the marked position is on the same
+                                    page as the current position, we don't
+                                    use markPos, but just keep the marked
+                                    itemIndex in markItemIndex */
  } BTScanOpaqueData;

  typedef BTScanOpaqueData *BTScanOpaque;

Re: Merge join performance

From: Tom Lane

Heikki Linnakangas <heikki@enterprisedb.com> writes:
> We can make markpos fast again by doing the copy lazily in _bt_steppage;

Nice hack.  Getting rid of the buffer refcount manipulations is probably
even more useful than avoiding the memcpy.
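
To put rough numbers on that (my sketch, not from the thread; the real
IncrBufferRefCount manipulates shared buffer state and does more
bookkeeping than the bare counter bump below, so this understates the
saving):

#include <stdio.h>
#include <string.h>
#include <time.h>

#define N     5000000
#define ITEMS 256

static int  curr[ITEMS], mark[ITEMS];  /* stand-ins for currPos/markPos */
static int  refcount;                  /* stand-in for the buffer pin count */

int
main(void)
{
    volatile int markItemIndex = -1;
    clock_t t0, t1, t2;
    long    i;

    t0 = clock();
    for (i = 0; i < N; i++)
    {
        refcount++;                         /* old: pin bookkeeping */
        memcpy(mark, curr, sizeof(curr));   /* old: eager copy per markpos */
    }
    t1 = clock();
    for (i = 0; i < N; i++)
        markItemIndex = (int) (i % ITEMS);  /* new: one local store */
    t2 = clock();

    printf("eager: %.2f s, lazy: %.2f s (refcount=%d, last=%d)\n",
           (double) (t1 - t0) / CLOCKS_PER_SEC,
           (double) (t2 - t1) / CLOCKS_PER_SEC,
           refcount, (int) markItemIndex);
    return 0;
}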

> I did some micro-benchmarking of merge join performance; see the
> attached test. Test results, on my laptop:

> 8_1_STABLE: 1.77 s
> HEAD, with patch: 1.65 s
> HEAD, without patch: 2.46 s

Hm, I don't see a performance regression on my machine, but there is a
measurable improvement with the patch:

8.1 branch:  10.6 s
CVS HEAD:    10.1 s
w/patch:      9.3 s

Patch applied ...

            regards, tom lane