Merge join performance - Mailing list pgsql-patches

From Heikki Linnakangas
Subject Merge join performance
Date
Msg-id 44EC42B8.7070708@enterprisedb.com
Whole thread Raw
Responses Re: Merge join performance  (Tom Lane <tgl@sss.pgh.pa.us>)
List pgsql-patches
It seems that the page-at-a-time-index-scan patch applied in the spring
caused a slight performance regression to merge joins. The btree
mark/restore became much more expensive, as btmarkpos now has to copy
the array of item pointers retrieved from the current index page. That
adds up, because merge join calls markpos for every tuple.

We can make markpos fast, if we make the copy lazily in _bt_steppage,
see attached patch.

I did some micro-benchmarking of merge join performance, see attached
test. Test results, on my laptop:

8_1_STABLE: 1.77 s
HEAD, with patch: 1.65 s
HEAD, without patch: 2.46 s

The results are pretty stable, within 0.1 s.

--
  Heikki Linnakangas
  EnterpriseDB   http://www.enterprisedb.com

Index: src/backend/access/nbtree/nbtree.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/access/nbtree/nbtree.c,v
retrieving revision 1.149
diff -c -r1.149 nbtree.c
*** src/backend/access/nbtree/nbtree.c    10 May 2006 23:18:39 -0000    1.149
--- src/backend/access/nbtree/nbtree.c    23 Aug 2006 11:54:09 -0000
***************
*** 368,373 ****
--- 368,374 ----
      {
          so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
          so->currPos.buf = so->markPos.buf = InvalidBuffer;
+         so->markItemIndex = -1;
          if (scan->numberOfKeys > 0)
              so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
          else
***************
*** 392,397 ****
--- 393,399 ----
          ReleaseBuffer(so->markPos.buf);
          so->markPos.buf = InvalidBuffer;
      }
+     so->markItemIndex = -1;

      /*
       * Reset the scan keys. Note that keys ordering stuff moved to _bt_first.
***************
*** 430,435 ****
--- 432,438 ----
          ReleaseBuffer(so->markPos.buf);
          so->markPos.buf = InvalidBuffer;
      }
+     so->markItemIndex = -1;

      if (so->killedItems != NULL)
          pfree(so->killedItems);
***************
*** 456,469 ****
          so->markPos.buf = InvalidBuffer;
      }

!     /* bump pin on current buffer for assignment to mark buffer */
      if (BTScanPosIsValid(so->currPos))
      {
!         IncrBufferRefCount(so->currPos.buf);
!         memcpy(&so->markPos, &so->currPos,
!                offsetof(BTScanPosData, items[1]) +
!                so->currPos.lastItem * sizeof(BTScanPosItem));
!     }

      PG_RETURN_VOID();
  }
--- 459,472 ----
          so->markPos.buf = InvalidBuffer;
      }

!     /* Record the current itemIndex we're on. If we later step to next page
!      * before releasing the marked position, _bt_steppage makes a full copy
!      * of the currPos-struct. */
      if (BTScanPosIsValid(so->currPos))
      {
!         so->markItemIndex = so->currPos.itemIndex;
!     } else
!         so->markItemIndex = -1;

      PG_RETURN_VOID();
  }
***************
*** 477,500 ****
      IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
      BTScanOpaque so = (BTScanOpaque) scan->opaque;

!     /* we aren't holding any read locks, but gotta drop the pin */
!     if (BTScanPosIsValid(so->currPos))
      {
!         /* Before leaving current page, deal with any killed items */
!         if (so->numKilled > 0 &&
!             so->currPos.buf != so->markPos.buf)
!             _bt_killitems(scan, false);
!         ReleaseBuffer(so->currPos.buf);
!         so->currPos.buf = InvalidBuffer;
!     }
!
!     /* bump pin on marked buffer */
!     if (BTScanPosIsValid(so->markPos))
      {
!         IncrBufferRefCount(so->markPos.buf);
!         memcpy(&so->currPos, &so->markPos,
!                offsetof(BTScanPosData, items[1]) +
!                so->markPos.lastItem * sizeof(BTScanPosItem));
      }

      PG_RETURN_VOID();
--- 480,512 ----
      IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
      BTScanOpaque so = (BTScanOpaque) scan->opaque;

!     if (so->markItemIndex != -1)
      {
!         /* The restore position was on the same page.
!          * Just restore the itemIndex */
!         so->currPos.itemIndex = so->markItemIndex;
!     }
!     else
      {
!         /* we aren't holding any read locks, but gotta drop the pin */
!         if (BTScanPosIsValid(so->currPos))
!         {
!             /* Before leaving current page, deal with any killed items */
!             if (so->numKilled > 0 &&
!                 so->currPos.buf != so->markPos.buf)
!                 _bt_killitems(scan, false);
!             ReleaseBuffer(so->currPos.buf);
!             so->currPos.buf = InvalidBuffer;
!         }
!
!         if (BTScanPosIsValid(so->markPos))
!         {
!             /* bump pin on marked buffer for assignment to current buffer */
!             IncrBufferRefCount(so->markPos.buf);
!             memcpy(&so->currPos, &so->markPos,
!                    offsetof(BTScanPosData, items[1]) +
!                    so->markPos.lastItem * sizeof(BTScanPosItem));
!         }
      }

      PG_RETURN_VOID();
Index: src/backend/access/nbtree/nbtsearch.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/access/nbtree/nbtsearch.c,v
retrieving revision 1.105
diff -c -r1.105 nbtsearch.c
*** src/backend/access/nbtree/nbtsearch.c    7 May 2006 01:21:30 -0000    1.105
--- src/backend/access/nbtree/nbtsearch.c    23 Aug 2006 11:11:45 -0000
***************
*** 1055,1060 ****
--- 1055,1074 ----

      rel = scan->indexRelation;

+     /* Before we modify currPos, make a copy of the items if there
+      * was a marked position that needs them. */
+     if (so->markItemIndex != -1)
+     {
+         /* bump pin on current buffer for assignment to marked buffer */
+         IncrBufferRefCount(so->currPos.buf);
+         memcpy(&so->markPos, &so->currPos,
+                offsetof(BTScanPosData, items[1]) +
+                so->currPos.lastItem * sizeof(BTScanPosItem));
+
+         so->markPos.itemIndex = so->markItemIndex;
+         so->markItemIndex = -1;
+     }
+
      if (ScanDirectionIsForward(dir))
      {
          /* Walk right to the next page with data */
Index: src/include/access/nbtree.h
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/access/nbtree.h,v
retrieving revision 1.103
diff -c -r1.103 nbtree.h
*** src/include/access/nbtree.h    7 Aug 2006 16:57:57 -0000    1.103
--- src/include/access/nbtree.h    23 Aug 2006 11:07:47 -0000
***************
*** 441,446 ****
--- 441,450 ----
      /* keep these last in struct for efficiency */
      BTScanPosData currPos;        /* current position data */
      BTScanPosData markPos;        /* marked position, if any */
+     int markItemIndex;            /* if the marked position is on the same
+                                    page as current position, we don't use
+                                    markPos, but just keep the marked
+                                    itemIndex in markItemIndex */
  } BTScanOpaqueData;

  typedef BTScanOpaqueData *BTScanOpaque;

Attachment

pgsql-patches by date:

Previous
From: Karel Zak
Date:
Subject: Leaving... (was: Re: [HACKERS] COPY view)
Next
From: Teodor Sigaev
Date:
Subject: Re: [HACKERS] Use of backslash in tsearch2