hash index work - Mailing list pgsql-patches

From Neil Conway
Subject hash index work
Date
Msg-id 4284378C.2080205@samurai.com
Whole thread Raw
Responses Re: hash index work  (Bruce Momjian <pgman@candle.pha.pa.us>)
List pgsql-patches
This patch implements two changes to hash indexes:

- rather than storing the values of the index keys, we just store their
hash value instead. The hash opclasses have been marked "lossy" so that
the executor will recheck the scan qualifier to catch hash collisions.

- within a page of a hash bucket, the entries are stored sorted by hash
value. When doing a hash index scan, we can then binary search within a
page rather than using a linear search.

Unfortunately, I haven't been able to measure any performance
improvement from either of these changes :-\

I've run tests on two types of tables: int4 keys and relatively short
textual keys (I don't think there's much point testing longer keys: the
patches should obviously be a win there, so I'm concerned about speeding
up the common case). In both cases, the difference in index scan
performance isn't significant: it's about the same with and without the
patches. The indexes have been on tables with 1 million rows (of int4)
and 400,000 rows (of text). I would test with larger tables, but
creating hash indexes is so slow that even these sizes are painful
enough. Since indexes of this size should be cached in memory the
reduction in I/O for the text case isn't being measured (the index is
about 30% smaller than it is when we store the full text value), but
even so I would have expected the binary search to produce noticeable
gains...

Perhaps I've made a coding error that has pessimized the performance
somehow (although nothing obvious shows up in profiles), or perhaps the
linear scan of the pages of the hash bucket isn't a performance problem
in the first place, at least for types with a cheap equality operator.
Some oprofile data seems to support this -- for example, in the int4
case, we spend less than 0.5% of the time in _hash_next and children,
and only 1.8% of the runtime in hashgetmulti and children (with the
vanilla CVS HEAD code).

-Neil
Index: src/backend/access/hash/hash.c
===================================================================
RCS file: /var/lib/cvs/pgsql/src/backend/access/hash/hash.c,v
retrieving revision 1.79
diff -c -r1.79 hash.c
*** src/backend/access/hash/hash.c    11 May 2005 06:24:51 -0000    1.79
--- src/backend/access/hash/hash.c    13 May 2005 01:30:49 -0000
***************
*** 104,110 ****
          return;
      }

!     hitem = _hash_formitem(itup);

      _hash_doinsert(index, hitem);

--- 104,110 ----
          return;
      }

!     hitem = _hash_formitem(itup, index);

      _hash_doinsert(index, hitem);

***************
*** 127,133 ****
      Datum       *values = (Datum *) PG_GETARG_POINTER(1);
      bool       *isnull = (bool *) PG_GETARG_POINTER(2);
      ItemPointer ht_ctid = (ItemPointer) PG_GETARG_POINTER(3);
-
  #ifdef NOT_USED
      Relation    heapRel = (Relation) PG_GETARG_POINTER(4);
      bool        checkUnique = PG_GETARG_BOOL(5);
--- 127,132 ----
***************
*** 154,160 ****
          PG_RETURN_BOOL(false);
      }

!     hitem = _hash_formitem(itup);

      _hash_doinsert(rel, hitem);

--- 153,159 ----
          PG_RETURN_BOOL(false);
      }

!     hitem = _hash_formitem(itup, rel);

      _hash_doinsert(rel, hitem);

***************
*** 331,336 ****
--- 330,337 ----
      so->hashso_bucket_valid = false;
      so->hashso_bucket_blkno = 0;
      so->hashso_curbuf = so->hashso_mrkbuf = InvalidBuffer;
+     so->hashso_sk_hash = _hash_datum2hashkey(rel,
+                                              scan->keyData[0].sk_argument);
      scan->opaque = so;

      /* register scan in case we change pages it's using */
***************
*** 379,385 ****
--- 380,390 ----
                  scankey,
                  scan->numberOfKeys * sizeof(ScanKeyData));
          if (so)
+         {
              so->hashso_bucket_valid = false;
+             so->hashso_sk_hash = _hash_datum2hashkey(rel,
+                                                      scan->keyData[0].sk_argument);
+         }
      }

      PG_RETURN_VOID();
***************
*** 570,576 ****

                  hitem = (HashItem) PageGetItem(page,
                                               PageGetItemId(page, offno));
!                 htup = &(hitem->hash_itup.t_tid);
                  if (callback(htup, callback_state))
                  {
                      /* delete the item from the page */
--- 575,581 ----

                  hitem = (HashItem) PageGetItem(page,
                                               PageGetItemId(page, offno));
!                 htup = &(hitem->tid);
                  if (callback(htup, callback_state))
                  {
                      /* delete the item from the page */
Index: src/backend/access/hash/hashinsert.c
===================================================================
RCS file: /var/lib/cvs/pgsql/src/backend/access/hash/hashinsert.c,v
retrieving revision 1.36
diff -c -r1.36 hashinsert.c
*** src/backend/access/hash/hashinsert.c    21 Mar 2005 01:23:57 -0000    1.36
--- src/backend/access/hash/hashinsert.c    13 May 2005 01:30:49 -0000
***************
*** 18,28 ****
  #include "access/hash.h"
  #include "storage/lmgr.h"

-
- static OffsetNumber _hash_pgaddtup(Relation rel, Buffer buf,
-                Size itemsize, HashItem hitem);
-
-
  /*
   *    _hash_doinsert() -- Handle insertion of a single HashItem in the table.
   *
--- 18,23 ----
***************
*** 36,71 ****
      Buffer        buf;
      Buffer        metabuf;
      HashMetaPage metap;
-     IndexTuple    itup;
      BlockNumber itup_blkno;
      OffsetNumber itup_off;
      BlockNumber blkno;
      Page        page;
      HashPageOpaque pageopaque;
-     Size        itemsz;
      bool        do_expand;
-     uint32        hashkey;
      Bucket        bucket;
-     Datum        datum;
-     bool        isnull;
-
-     /*
-      * Compute the hash key for the item.  We do this first so as not to
-      * need to hold any locks while running the hash function.
-      */
-     itup = &(hitem->hash_itup);
-     if (rel->rd_rel->relnatts != 1)
-         elog(ERROR, "hash indexes support only one index key");
-     datum = index_getattr(itup, 1, RelationGetDescr(rel), &isnull);
-     Assert(!isnull);
-     hashkey = _hash_datum2hashkey(rel, datum);
-
-     /* compute item size too */
-     itemsz = IndexTupleDSize(hitem->hash_itup)
-         + (sizeof(HashItemData) - sizeof(IndexTupleData));
-
-     itemsz = MAXALIGN(itemsz);    /* be safe, PageAddItem will do this but
-                                  * we need to be consistent */

      /*
       * Acquire shared split lock so we can compute the target bucket
--- 31,43 ----
***************
*** 79,99 ****
      _hash_checkpage(rel, (Page) metap, LH_META_PAGE);

      /*
-      * Check whether the item can fit on a hash page at all. (Eventually,
-      * we ought to try to apply TOAST methods if not.)    Note that at this
-      * point, itemsz doesn't include the ItemId.
-      */
-     if (itemsz > HashMaxItemSize((Page) metap))
-         ereport(ERROR,
-                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
-                  errmsg("index row size %lu exceeds hash maximum %lu",
-                         (unsigned long) itemsz,
-                         (unsigned long) HashMaxItemSize((Page) metap))));
-
-     /*
       * Compute the target bucket number, and convert to block number.
       */
!     bucket = _hash_hashkey2bucket(hashkey,
                                    metap->hashm_maxbucket,
                                    metap->hashm_highmask,
                                    metap->hashm_lowmask);
--- 51,59 ----
      _hash_checkpage(rel, (Page) metap, LH_META_PAGE);

      /*
       * Compute the target bucket number, and convert to block number.
       */
!     bucket = _hash_hashkey2bucket(hitem->hash,
                                    metap->hashm_maxbucket,
                                    metap->hashm_highmask,
                                    metap->hashm_lowmask);
***************
*** 108,114 ****
       * lock.
       */
      _hash_getlock(rel, blkno, HASH_SHARE);
-
      _hash_droplock(rel, 0, HASH_SHARE);

      /* Fetch the primary bucket page for the bucket */
--- 68,73 ----
***************
*** 119,125 ****
      Assert(pageopaque->hasho_bucket == bucket);

      /* Do the insertion */
!     while (PageGetFreeSpace(page) < itemsz)
      {
          /*
           * no space on this page; check for an overflow page
--- 78,84 ----
      Assert(pageopaque->hasho_bucket == bucket);

      /* Do the insertion */
!     while (PageGetFreeSpace(page) < sizeof(HashItemData))
      {
          /*
           * no space on this page; check for an overflow page
***************
*** 151,157 ****
              page = BufferGetPage(buf);

              /* should fit now, given test above */
!             Assert(PageGetFreeSpace(page) >= itemsz);
          }
          _hash_checkpage(rel, page, LH_OVERFLOW_PAGE);
          pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
--- 110,116 ----
              page = BufferGetPage(buf);

              /* should fit now, given test above */
!             Assert(PageGetFreeSpace(page) >= sizeof(HashItemData));
          }
          _hash_checkpage(rel, page, LH_OVERFLOW_PAGE);
          pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
***************
*** 159,165 ****
      }

      /* found page with enough space, so add the item here */
!     itup_off = _hash_pgaddtup(rel, buf, itemsz, hitem);
      itup_blkno = BufferGetBlockNumber(buf);

      /* write and release the modified page */
--- 118,124 ----
      }

      /* found page with enough space, so add the item here */
!     itup_off = _hash_pgaddtup(rel, buf, hitem);
      itup_blkno = BufferGetBlockNumber(buf);

      /* write and release the modified page */
***************
*** 192,220 ****
  }

  /*
   *    _hash_pgaddtup() -- add a tuple to a particular page in the index.
   *
   *        This routine adds the tuple to the page as requested; it does
!  *        not write out the page.  It is an error to call pgaddtup() without
!  *        a write lock and pin.
   */
! static OffsetNumber
! _hash_pgaddtup(Relation rel,
!                Buffer buf,
!                Size itemsize,
!                HashItem hitem)
  {
!     OffsetNumber itup_off;
!     Page        page;

      page = BufferGetPage(buf);
      _hash_checkpage(rel, page, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);

!     itup_off = OffsetNumberNext(PageGetMaxOffsetNumber(page));
!     if (PageAddItem(page, (Item) hitem, itemsize, itup_off, LP_USED)
!         == InvalidOffsetNumber)
          elog(ERROR, "failed to add index item to \"%s\"",
               RelationGetRelationName(rel));

!     return itup_off;
  }
--- 151,219 ----
  }

  /*
+  * Return the offset number in the page where the specified hash value
+  * should be located. The return value might exceed the page's max
+  * offset if the hash value is greater than any hash in the page.
+  */
+ OffsetNumber
+ _hash_binsearch(Page page, uint32 hash_value)
+ {
+     OffsetNumber    upper;
+     OffsetNumber    lower;
+
+     upper = PageGetMaxOffsetNumber(page) + 1;
+     lower = FirstOffsetNumber;
+
+     while (upper > lower)
+     {
+         HashItem pos;
+         OffsetNumber off;
+
+         off = lower + ((upper - lower) / 2);
+         Assert(OffsetNumberIsValid(off));
+         Assert(lower <= off);
+         Assert(upper >= off);
+
+         pos = (HashItem) PageGetItem(page, PageGetItemId(page, off));
+         if (pos->hash < hash_value)
+             lower = off + 1;
+         else
+             upper = off;
+     }
+
+     return upper;
+ }
+
+ /*
   *    _hash_pgaddtup() -- add a tuple to a particular page in the index.
   *
   *        This routine adds the tuple to the page as requested; it does
!  *        not write out the page.  It is an error to call _hash_pgaddtup
!  *        without a write lock and pin. We keep the items in each page
!  *        sorted by hash value, so that we can do a binary search during
!  *        scans.
   */
! OffsetNumber
! _hash_pgaddtup(Relation rel, Buffer buf, HashItem hitem)
  {
!     OffsetNumber    result;
!     OffsetNumber    insert_loc;
!     Page            page;

+     Assert(BufferIsPinned(buf));
      page = BufferGetPage(buf);
      _hash_checkpage(rel, page, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);

!     /* Find the insertion location via binary search */
!     insert_loc = _hash_binsearch(page, hitem->hash);
!
!     result = PageAddItem(page, (Item) hitem, sizeof(HashItemData),
!                          insert_loc, LP_USED);
!
!     if (result == InvalidOffsetNumber)
          elog(ERROR, "failed to add index item to \"%s\"",
               RelationGetRelationName(rel));

!     Assert(result == insert_loc);
!     return result;
  }
Index: src/backend/access/hash/hashovfl.c
===================================================================
RCS file: /var/lib/cvs/pgsql/src/backend/access/hash/hashovfl.c,v
retrieving revision 1.46
diff -c -r1.46 hashovfl.c
*** src/backend/access/hash/hashovfl.c    11 May 2005 01:26:01 -0000    1.46
--- src/backend/access/hash/hashovfl.c    13 May 2005 01:30:49 -0000
***************
*** 562,571 ****
      Page        rpage;
      HashPageOpaque wopaque;
      HashPageOpaque ropaque;
-     OffsetNumber woffnum;
      OffsetNumber roffnum;
      HashItem    hitem;
-     Size        itemsz;

      /*
       * start squeezing into the base bucket page.
--- 562,569 ----
***************
*** 608,627 ****
      roffnum = FirstOffsetNumber;
      for (;;)
      {
!         /* this test is needed in case page is empty on entry */
          if (roffnum <= PageGetMaxOffsetNumber(rpage))
          {
              hitem = (HashItem) PageGetItem(rpage,
                                             PageGetItemId(rpage, roffnum));
-             itemsz = IndexTupleDSize(hitem->hash_itup)
-                 + (sizeof(HashItemData) - sizeof(IndexTupleData));
-             itemsz = MAXALIGN(itemsz);

              /*
               * Walk up the bucket chain, looking for a page big enough for
               * this item.  Exit if we reach the read page.
               */
!             while (PageGetFreeSpace(wpage) < itemsz)
              {
                  Assert(!PageIsEmpty(wpage));

--- 606,622 ----
      roffnum = FirstOffsetNumber;
      for (;;)
      {
!         /* this test is needed in case the page is empty on entry */
          if (roffnum <= PageGetMaxOffsetNumber(rpage))
          {
              hitem = (HashItem) PageGetItem(rpage,
                                             PageGetItemId(rpage, roffnum));

              /*
               * Walk up the bucket chain, looking for a page big enough for
               * this item.  Exit if we reach the read page.
               */
!             while (PageGetFreeSpace(wpage) < sizeof(HashItemData))
              {
                  Assert(!PageIsEmpty(wpage));

***************
*** 647,657 ****
              /*
               * we have found room so insert on the "write" page.
               */
!             woffnum = OffsetNumberNext(PageGetMaxOffsetNumber(wpage));
!             if (PageAddItem(wpage, (Item) hitem, itemsz, woffnum, LP_USED)
!                 == InvalidOffsetNumber)
!                 elog(ERROR, "failed to add index item to \"%s\"",
!                      RelationGetRelationName(rel));

              /*
               * delete the tuple from the "read" page. PageIndexTupleDelete
--- 642,648 ----
              /*
               * we have found room so insert on the "write" page.
               */
!             _hash_pgaddtup(rel, wbuf, hitem);

              /*
               * delete the tuple from the "read" page. PageIndexTupleDelete
Index: src/backend/access/hash/hashpage.c
===================================================================
RCS file: /var/lib/cvs/pgsql/src/backend/access/hash/hashpage.c,v
retrieving revision 1.49
diff -c -r1.49 hashpage.c
*** src/backend/access/hash/hashpage.c    11 May 2005 01:26:01 -0000    1.49
--- src/backend/access/hash/hashpage.c    13 May 2005 01:30:49 -0000
***************
*** 527,545 ****
      Buffer        nbuf;
      BlockNumber oblkno;
      BlockNumber nblkno;
-     bool        null;
-     Datum        datum;
      HashItem    hitem;
      HashPageOpaque oopaque;
      HashPageOpaque nopaque;
-     IndexTuple    itup;
-     Size        itemsz;
      OffsetNumber ooffnum;
-     OffsetNumber noffnum;
      OffsetNumber omaxoffnum;
      Page        opage;
      Page        npage;
-     TupleDesc    itupdesc = RelationGetDescr(rel);

      /*
       * It should be okay to simultaneously write-lock pages from each
--- 527,539 ----
***************
*** 603,620 ****
          }

          /*
!          * Re-hash the tuple to determine which bucket it now belongs in.
!          *
!          * It is annoying to call the hash function while holding locks, but
!          * releasing and relocking the page for each tuple is unappealing
!          * too.
           */
          hitem = (HashItem) PageGetItem(opage, PageGetItemId(opage, ooffnum));
!         itup = &(hitem->hash_itup);
!         datum = index_getattr(itup, 1, itupdesc, &null);
!         Assert(!null);
!
!         bucket = _hash_hashkey2bucket(_hash_datum2hashkey(rel, datum),
                                        maxbucket, highmask, lowmask);

          if (bucket == nbucket)
--- 597,608 ----
          }

          /*
!          * Determine the bucket where the tuple now belongs. Since we
!          * store the full hash of the tuple value in the index itself,
!          * we needn't recompute the hash here.
           */
          hitem = (HashItem) PageGetItem(opage, PageGetItemId(opage, ooffnum));
!         bucket = _hash_hashkey2bucket(hitem->hash,
                                        maxbucket, highmask, lowmask);

          if (bucket == nbucket)
***************
*** 624,635 ****
               * the current page in the new bucket, we must allocate a new
               * overflow page and place the tuple on that page instead.
               */
!             itemsz = IndexTupleDSize(hitem->hash_itup)
!                 + (sizeof(HashItemData) - sizeof(IndexTupleData));
!
!             itemsz = MAXALIGN(itemsz);
!
!             if (PageGetFreeSpace(npage) < itemsz)
              {
                  /* write out nbuf and drop lock, but keep pin */
                  _hash_chgbufaccess(rel, nbuf, HASH_WRITE, HASH_NOLOCK);
--- 612,618 ----
               * the current page in the new bucket, we must allocate a new
               * overflow page and place the tuple on that page instead.
               */
!             if (PageGetFreeSpace(npage) < sizeof(HashItemData))
              {
                  /* write out nbuf and drop lock, but keep pin */
                  _hash_chgbufaccess(rel, nbuf, HASH_WRITE, HASH_NOLOCK);
***************
*** 640,650 ****
                  /* we don't need nopaque within the loop */
              }

!             noffnum = OffsetNumberNext(PageGetMaxOffsetNumber(npage));
!             if (PageAddItem(npage, (Item) hitem, itemsz, noffnum, LP_USED)
!                 == InvalidOffsetNumber)
!                 elog(ERROR, "failed to add index item to \"%s\"",
!                      RelationGetRelationName(rel));

              /*
               * now delete the tuple from the old bucket.  after this
--- 623,629 ----
                  /* we don't need nopaque within the loop */
              }

!             _hash_pgaddtup(rel, nbuf, hitem);

              /*
               * now delete the tuple from the old bucket.  after this
***************
*** 654,659 ****
--- 633,643 ----
               * array).    this also means that 'omaxoffnum' is exactly one
               * less than it used to be, so we really can just decrement it
               * instead of calling PageGetMaxOffsetNumber.
+              *
+              * XXX: calling PageIndexTupleDelete() in a loop seems
+              * unwise; it would be better to move all the tuples out
+              * to the new bucket first, then repack the old page in
+              * one shot.
               */
              PageIndexTupleDelete(opage, ooffnum);
              omaxoffnum = OffsetNumberPrev(omaxoffnum);
***************
*** 668,673 ****
--- 652,659 ----
              Assert(bucket == obucket);
              ooffnum = OffsetNumberNext(ooffnum);
          }
+
+         Assert(omaxoffnum == PageGetMaxOffsetNumber(opage));
      }

      /*
Index: src/backend/access/hash/hashsearch.c
===================================================================
RCS file: /var/lib/cvs/pgsql/src/backend/access/hash/hashsearch.c,v
retrieving revision 1.38
diff -c -r1.38 hashsearch.c
*** src/backend/access/hash/hashsearch.c    31 Dec 2004 21:59:13 -0000    1.38
--- src/backend/access/hash/hashsearch.c    13 May 2005 02:03:36 -0000
***************
*** 37,43 ****
      OffsetNumber offnum;
      ItemPointer current;
      HashItem    hitem;
-     IndexTuple    itup;

      /* we still have the buffer pinned and read-locked */
      buf = so->hashso_curbuf;
--- 37,42 ----
***************
*** 55,62 ****
      page = BufferGetPage(buf);
      _hash_checkpage(rel, page, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
      hitem = (HashItem) PageGetItem(page, PageGetItemId(page, offnum));
!     itup = &hitem->hash_itup;
!     scan->xs_ctup.t_self = itup->t_tid;

      return true;
  }
--- 54,60 ----
      page = BufferGetPage(buf);
      _hash_checkpage(rel, page, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
      hitem = (HashItem) PageGetItem(page, PageGetItemId(page, offnum));
!     scan->xs_ctup.t_self = hitem->tid;

      return true;
  }
***************
*** 106,123 ****
  /*
   *    _hash_first() -- Find the first item in a scan.
   *
!  *        Find the first item in the index that
!  *        satisfies the qualification associated with the scan descriptor. On
!  *        success, the page containing the current index tuple is read locked
!  *        and pinned, and the scan's opaque data entry is updated to
!  *        include the buffer.
   */
  bool
  _hash_first(IndexScanDesc scan, ScanDirection dir)
  {
      Relation    rel = scan->indexRelation;
      HashScanOpaque so = (HashScanOpaque) scan->opaque;
-     uint32        hashkey;
      Bucket        bucket;
      BlockNumber blkno;
      Buffer        buf;
--- 104,120 ----
  /*
   *    _hash_first() -- Find the first item in a scan.
   *
!  *        Find the first item in the index that satisfies the
!  *        qualification associated with the scan descriptor. On success,
!  *        the page containing the current index tuple is read locked and
!  *        pinned, and the scan's opaque data entry is updated to include
!  *        the buffer.
   */
  bool
  _hash_first(IndexScanDesc scan, ScanDirection dir)
  {
      Relation    rel = scan->indexRelation;
      HashScanOpaque so = (HashScanOpaque) scan->opaque;
      Bucket        bucket;
      BlockNumber blkno;
      Buffer        buf;
***************
*** 126,132 ****
      HashPageOpaque opaque;
      HashMetaPage metap;
      HashItem    hitem;
-     IndexTuple    itup;
      ItemPointer current;
      OffsetNumber offnum;

--- 123,128 ----
***************
*** 153,164 ****
          return false;

      /*
-      * Okay to compute the hash key.  We want to do this before acquiring
-      * any locks, in case a user-defined hash function happens to be slow.
-      */
-     hashkey = _hash_datum2hashkey(rel, scan->keyData[0].sk_argument);
-
-     /*
       * Acquire shared split lock so we can compute the target bucket
       * safely (see README).
       */
--- 149,154 ----
***************
*** 172,178 ****
      /*
       * Compute the target bucket number, and convert to block number.
       */
!     bucket = _hash_hashkey2bucket(hashkey,
                                    metap->hashm_maxbucket,
                                    metap->hashm_highmask,
                                    metap->hashm_lowmask);
--- 162,168 ----
      /*
       * Compute the target bucket number, and convert to block number.
       */
!     bucket = _hash_hashkey2bucket(so->hashso_sk_hash,
                                    metap->hashm_maxbucket,
                                    metap->hashm_highmask,
                                    metap->hashm_lowmask);
***************
*** 218,225 ****
      page = BufferGetPage(buf);
      _hash_checkpage(rel, page, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
      hitem = (HashItem) PageGetItem(page, PageGetItemId(page, offnum));
!     itup = &hitem->hash_itup;
!     scan->xs_ctup.t_self = itup->t_tid;

      return true;
  }
--- 208,214 ----
      page = BufferGetPage(buf);
      _hash_checkpage(rel, page, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
      hitem = (HashItem) PageGetItem(page, PageGetItemId(page, offnum));
!     scan->xs_ctup.t_self = hitem->tid;

      return true;
  }
***************
*** 247,255 ****
      OffsetNumber maxoff;
      OffsetNumber offnum;
      Bucket        bucket;
-     BlockNumber blkno;
-     HashItem    hitem;
-     IndexTuple    itup;

      current = &(scan->currentItemData);

--- 236,241 ----
***************
*** 258,358 ****
      _hash_checkpage(rel, page, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
      opaque = (HashPageOpaque) PageGetSpecialPointer(page);
      bucket = opaque->hasho_bucket;

      /*
       * If _hash_step is called from _hash_first, current will not be
!      * valid, so we can't dereference it.  However, in that case, we
!      * presumably want to start at the beginning/end of the page...
       */
-     maxoff = PageGetMaxOffsetNumber(page);
      if (ItemPointerIsValid(current))
          offnum = ItemPointerGetOffsetNumber(current);
      else
          offnum = InvalidOffsetNumber;

!     /*
!      * 'offnum' now points to the last tuple we have seen (if any).
!      *
!      * continue to step through tuples until: 1) we get to the end of the
!      * bucket chain or 2) we find a valid tuple.
!      */
!     do
      {
!         switch (dir)
          {
!             case ForwardScanDirection:
!                 if (offnum != InvalidOffsetNumber)
!                     offnum = OffsetNumberNext(offnum);    /* move forward */
!                 else
!                     offnum = FirstOffsetNumber; /* new page */
!
!                 while (offnum > maxoff)
!                 {
!                     /*
!                      * either this page is empty (maxoff ==
!                      * InvalidOffsetNumber) or we ran off the end.
!                      */
!                     _hash_readnext(rel, &buf, &page, &opaque);
!                     if (BufferIsValid(buf))
!                     {
!                         maxoff = PageGetMaxOffsetNumber(page);
!                         offnum = FirstOffsetNumber;
!                     }
!                     else
!                     {
!                         /* end of bucket */
!                         maxoff = offnum = InvalidOffsetNumber;
!                         break;    /* exit while */
!                     }
!                 }
!                 break;
!
!             case BackwardScanDirection:
!                 if (offnum != InvalidOffsetNumber)
!                     offnum = OffsetNumberPrev(offnum);    /* move back */
!                 else
!                     offnum = maxoff;    /* new page */
!
!                 while (offnum < FirstOffsetNumber)
!                 {
!                     /*
!                      * either this page is empty (offnum ==
!                      * InvalidOffsetNumber) or we ran off the end.
!                      */
!                     _hash_readprev(rel, &buf, &page, &opaque);
!                     if (BufferIsValid(buf))
!                         maxoff = offnum = PageGetMaxOffsetNumber(page);
!                     else
!                     {
!                         /* end of bucket */
!                         maxoff = offnum = InvalidOffsetNumber;
!                         break;    /* exit while */
!                     }
!                 }
!                 break;
!
!             default:
!                 /* NoMovementScanDirection */
!                 /* this should not be reached */
!                 break;
          }
!
!         /* we ran off the end of the world without finding a match */
!         if (offnum == InvalidOffsetNumber)
          {
!             *bufP = so->hashso_curbuf = InvalidBuffer;
!             ItemPointerSetInvalid(current);
!             return false;
          }

!         /* get ready to check this tuple */
!         hitem = (HashItem) PageGetItem(page, PageGetItemId(page, offnum));
!         itup = &hitem->hash_itup;
!     } while (!_hash_checkqual(scan, itup));
!
!     /* if we made it to here, we've found a valid tuple */
!     blkno = BufferGetBlockNumber(buf);
!     *bufP = so->hashso_curbuf = buf;
!     ItemPointerSet(current, blkno, offnum);
!     return true;
  }
--- 244,304 ----
      _hash_checkpage(rel, page, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
      opaque = (HashPageOpaque) PageGetSpecialPointer(page);
      bucket = opaque->hasho_bucket;
+     maxoff = PageGetMaxOffsetNumber(page);

      /*
       * If _hash_step is called from _hash_first, current will not be
!      * valid
       */
      if (ItemPointerIsValid(current))
          offnum = ItemPointerGetOffsetNumber(current);
      else
          offnum = InvalidOffsetNumber;

!     for (;;)
      {
!         if (offnum == InvalidOffsetNumber)
          {
!             /*
!              * This is the first time we're scanning this particular
!              * page of the bucket, so jump to the right spot via
!              * binary search.
!              */
!             offnum = _hash_binsearch(page, so->hashso_sk_hash);
          }
!         else
          {
!             /* Advance to the next tuple */
!             offnum = OffsetNumberNext(offnum);
          }

!         if (offnum <= maxoff && _hash_checkqual(scan, page, offnum))
!         {
!             /* Found a matching tuple */
!             *bufP = so->hashso_curbuf = buf;
!             ItemPointerSet(current, BufferGetBlockNumber(buf), offnum);
!             return true;
!         }
!         else
!         {
!             /* No more matches on this page, so go on to next page */
!             if (ScanDirectionIsForward(dir))
!                 _hash_readnext(rel, &buf, &page, &opaque);
!             else
!                 _hash_readprev(rel, &buf, &page, &opaque);
!
!             if (BufferIsValid(buf))
!             {
!                 maxoff = PageGetMaxOffsetNumber(page);
!                 offnum = InvalidOffsetNumber;
!             }
!             else
!             {
!                 /* Ran out of pages, so we're done */
!                 *bufP = so->hashso_curbuf = InvalidBuffer;
!                 ItemPointerSetInvalid(current);
!                 return false;
!             }
!         }
!     }
  }
Index: src/backend/access/hash/hashutil.c
===================================================================
RCS file: /var/lib/cvs/pgsql/src/backend/access/hash/hashutil.c,v
retrieving revision 1.42
diff -c -r1.42 hashutil.c
*** src/backend/access/hash/hashutil.c    11 May 2005 01:26:01 -0000    1.42
--- src/backend/access/hash/hashutil.c    13 May 2005 01:56:48 -0000
***************
*** 15,43 ****
  #include "postgres.h"

  #include "access/genam.h"
  #include "access/hash.h"
  #include "access/iqual.h"


  /*
!  * _hash_checkqual -- does the index tuple satisfy the scan conditions?
   */
  bool
! _hash_checkqual(IndexScanDesc scan, IndexTuple itup)
  {
!     return index_keytest(itup, RelationGetDescr(scan->indexRelation),
!                          scan->numberOfKeys, scan->keyData);
  }

  /*
   * _hash_formitem -- construct a hash index entry
   */
  HashItem
! _hash_formitem(IndexTuple itup)
  {
-     int            nbytes_hitem;
      HashItem    hitem;
!     Size        tuplen;

      /* disallow nulls in hash keys */
      if (IndexTupleHasNulls(itup))
--- 15,66 ----
  #include "postgres.h"

  #include "access/genam.h"
+ #include "access/heapam.h"
  #include "access/hash.h"
  #include "access/iqual.h"


  /*
!  * _hash_checkqual -- does the index entry at the specified page and
!  * offset match the scan's hash value?
   */
  bool
! _hash_checkqual(IndexScanDesc scan, Page page, Offset offset)
  {
!     HashScanOpaque so = (HashScanOpaque) scan->opaque;
!     HashItem hitem;
!
!     Assert(offset <= PageGetMaxOffsetNumber(page));
!     hitem = (HashItem) PageGetItem(page, PageGetItemId(page, offset));
!
!     /*
!      * The hash index entry only stores the hash of the index element,
!      * not the full value. Therefore, we first compare the stored hash
!      * with the scan's hash; if it is different, we know the tuple
!      * cannot match.
!      */
!     if (so->hashso_sk_hash != hitem->hash)
!         return false;
!
!     /*
!      * NB: Just because the hash values are the same does not
!      * necessarily mean the corresponding heap tuple matches the scan
!      * key -- a hash collision might have occurred. Rely on the fact
!      * that our opclass is marked "lossy" so that the executor will
!      * recheck the qual.
!      */
!     return true;
  }

  /*
   * _hash_formitem -- construct a hash index entry
   */
  HashItem
! _hash_formitem(IndexTuple itup, Relation rel)
  {
      HashItem    hitem;
!     Datum        attr;
!     bool        isnull;

      /* disallow nulls in hash keys */
      if (IndexTupleHasNulls(itup))
***************
*** 45,62 ****
                  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                   errmsg("hash indexes cannot contain null keys")));

!     /*
!      * make a copy of the index tuple (XXX do we still need to copy?)
!      *
!      * HashItemData used to have more fields than IndexTupleData, but no
!      * longer...
!      */
!     tuplen = IndexTupleSize(itup);
!     nbytes_hitem = tuplen +
!         (sizeof(HashItemData) - sizeof(IndexTupleData));

!     hitem = (HashItem) palloc(nbytes_hitem);
!     memcpy(&(hitem->hash_itup), itup, tuplen);

      return hitem;
  }
--- 68,84 ----
                  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                   errmsg("hash indexes cannot contain null keys")));

!     if (rel->rd_rel->relnatts != 1)
!         elog(ERROR, "hash indexes support only one index key");

!     hitem = (HashItem) palloc(sizeof(HashItemData));
!     hitem->flags = 0;
!     hitem->tid = itup->t_tid;
!
!     /* hash the tuple; we only store the hash value in the index */
!     attr = index_getattr(itup, 1, RelationGetDescr(rel), &isnull);
!     Assert(!isnull);
!     hitem->hash = _hash_datum2hashkey(rel, attr);

      return hitem;
  }
***************
*** 150,154 ****
--- 172,202 ----

          Assert(opaque->hasho_flag & flags);
      }
+
+     /*
+      * Check that the entries on bucket and overflow pages are in
+      * sorted by hash key
+      */
+ #if 0
+     if ((flags & LH_BUCKET_PAGE) || (flags & LH_OVERFLOW_PAGE))
+     {
+         OffsetNumber off;
+         HashItem prev;
+
+         if (FirstOffsetNumber > PageGetMaxOffsetNumber(page))
+             return;
+
+         prev = (HashItem) PageGetItem(page, PageGetItemId(page, FirstOffsetNumber));
+
+         for (off = OffsetNumberNext(FirstOffsetNumber);
+              off <= PageGetMaxOffsetNumber(page);
+              off = OffsetNumberNext(off))
+         {
+             HashItem cur = (HashItem) PageGetItem(page, PageGetItemId(page, off));
+             Assert(cur->hash >= prev->hash);
+             prev = cur;
+         }
+     }
+ #endif
  #endif   /* USE_ASSERT_CHECKING */
  }
Index: src/backend/executor/nodeIndexscan.c
===================================================================
RCS file: /var/lib/cvs/pgsql/src/backend/executor/nodeIndexscan.c,v
retrieving revision 1.103
diff -c -r1.103 nodeIndexscan.c
*** src/backend/executor/nodeIndexscan.c    6 May 2005 17:24:54 -0000    1.103
--- src/backend/executor/nodeIndexscan.c    13 May 2005 01:30:49 -0000
***************
*** 120,126 ****
          /*
           * Store the scanned tuple in the scan tuple slot of the scan
           * state.  Note: we pass 'false' because tuples returned by
!          * amgetnext are pointers onto disk pages and must not be
           * pfree()'d.
           */
          ExecStoreTuple(tuple,                /* tuple to store */
--- 120,126 ----
          /*
           * Store the scanned tuple in the scan tuple slot of the scan
           * state.  Note: we pass 'false' because tuples returned by
!          * index_getnext are pointers onto disk pages and must not be
           * pfree()'d.
           */
          ExecStoreTuple(tuple,                /* tuple to store */
Index: src/include/access/hash.h
===================================================================
RCS file: /var/lib/cvs/pgsql/src/include/access/hash.h,v
retrieving revision 1.61
diff -c -r1.61 hash.h
*** src/include/access/hash.h    27 Mar 2005 23:53:04 -0000    1.61
--- src/include/access/hash.h    13 May 2005 02:03:41 -0000
***************
*** 96,101 ****
--- 96,103 ----
       */
      Buffer        hashso_curbuf;
      Buffer        hashso_mrkbuf;
+
+     uint32        hashso_sk_hash;        /* The hash of the scan key */
  } HashScanOpaqueData;

  typedef HashScanOpaqueData *HashScanOpaque;
***************
*** 158,166 ****

  typedef HashMetaPageData *HashMetaPage;

  typedef struct HashItemData
  {
!     IndexTupleData hash_itup;
  } HashItemData;

  typedef HashItemData *HashItem;
--- 160,176 ----

  typedef HashMetaPageData *HashMetaPage;

+ /*
+  * Entries in a hash index are slightly unusual: we don't store the
+  * value of the tuple's attribute. Instead, we just store its hash
+  * value, and the TID, so we can locate the tuple value itself in the
+  * heap if needed.
+  */
  typedef struct HashItemData
  {
!     ItemPointerData        tid;    /* reference TID to heap tuple */
!     unsigned short        flags;    /* metadata; currently unused */
!     uint32                hash;
  } HashItemData;

  typedef HashItemData *HashItem;
***************
*** 268,273 ****
--- 278,285 ----

  /* hashinsert.c */
  extern void _hash_doinsert(Relation rel, HashItem hitem);
+ extern OffsetNumber _hash_pgaddtup(Relation rel, Buffer buf, HashItem hitem);
+ extern OffsetNumber _hash_binsearch(Page page, uint32 hash_value);

  /* hashovfl.c */
  extern Buffer _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf);
***************
*** 304,311 ****
  extern bool _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir);

  /* hashutil.c */
! extern bool _hash_checkqual(IndexScanDesc scan, IndexTuple itup);
! extern HashItem _hash_formitem(IndexTuple itup);
  extern uint32 _hash_datum2hashkey(Relation rel, Datum key);
  extern Bucket _hash_hashkey2bucket(uint32 hashkey, uint32 maxbucket,
                       uint32 highmask, uint32 lowmask);
--- 316,323 ----
  extern bool _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir);

  /* hashutil.c */
! extern bool _hash_checkqual(IndexScanDesc scan, Page page, Offset offset);
! extern HashItem _hash_formitem(IndexTuple itup, Relation rel);
  extern uint32 _hash_datum2hashkey(Relation rel, Datum key);
  extern Bucket _hash_hashkey2bucket(uint32 hashkey, uint32 maxbucket,
                       uint32 highmask, uint32 lowmask);
Index: src/include/catalog/pg_amop.h
===================================================================
RCS file: /var/lib/cvs/pgsql/src/include/catalog/pg_amop.h,v
retrieving revision 1.63
diff -c -r1.63 pg_amop.h
*** src/include/catalog/pg_amop.h    14 Apr 2005 01:38:20 -0000    1.63
--- src/include/catalog/pg_amop.h    13 May 2005 01:30:49 -0000
***************
*** 545,614 ****
   */

  /* bpchar_ops */
! DATA(insert (     427    0 1 f 1054 ));
  /* char_ops */
! DATA(insert (     431    0 1 f    92 ));
  /* cidr_ops */
! DATA(insert (     433    0 1 f  820 ));
  /* date_ops */
! DATA(insert (     435    0 1 f 1093 ));
  /* float4_ops */
! DATA(insert (    1971    0 1 f  620 ));
  /* float8_ops */
! DATA(insert (    1973    0 1 f  670 ));
  /* inet_ops */
! DATA(insert (    1975    0 1 f 1201 ));
  /* int2_ops */
! DATA(insert (    1977    0 1 f    94 ));
  /* int4_ops */
! DATA(insert (    1979    0 1 f    96 ));
  /* int8_ops */
! DATA(insert (    1981    0 1 f  410 ));
  /* interval_ops */
! DATA(insert (    1983    0 1 f 1330 ));
  /* macaddr_ops */
! DATA(insert (    1985    0 1 f 1220 ));
  /* name_ops */
! DATA(insert (    1987    0 1 f    93 ));
  /* oid_ops */
! DATA(insert (    1990    0 1 f  607 ));
  /* oidvector_ops */
! DATA(insert (    1992    0 1 f  649 ));
  /* text_ops */
! DATA(insert (    1995    0 1 f    98 ));
  /* time_ops */
! DATA(insert (    1997    0 1 f 1108 ));
  /* timestamptz_ops */
! DATA(insert (    1999    0 1 f 1320 ));
  /* timetz_ops */
! DATA(insert (    2001    0 1 f 1550 ));
  /* varchar_ops */
! DATA(insert (    2004    0 1 f    98 ));
  /* timestamp_ops */
! DATA(insert (    2040    0 1 f 2060 ));
  /* bool_ops */
! DATA(insert (    2222    0 1 f    91 ));
  /* bytea_ops */
! DATA(insert (    2223    0 1 f 1955 ));
  /* int2vector_ops */
! DATA(insert (    2224    0 1 f  386 ));
  /* xid_ops */
! DATA(insert (    2225    0 1 f  352 ));
  /* cid_ops */
! DATA(insert (    2226    0 1 f  385 ));
  /* abstime_ops */
! DATA(insert (    2227    0 1 f  560 ));
  /* reltime_ops */
! DATA(insert (    2228    0 1 f  566 ));
  /* text_pattern_ops */
! DATA(insert (    2229    0 1 f 2316 ));
  /* varchar_pattern_ops */
! DATA(insert (    2230    0 1 f 2316 ));
  /* bpchar_pattern_ops */
! DATA(insert (    2231    0 1 f 2328 ));
  /* name_pattern_ops */
! DATA(insert (    2232    0 1 f 2334 ));
  /* aclitem_ops */
! DATA(insert (    2235    0 1 f  974 ));

  #endif   /* PG_AMOP_H */
--- 545,614 ----
   */

  /* bpchar_ops */
! DATA(insert (     427    0 1 t 1054 ));
  /* char_ops */
! DATA(insert (     431    0 1 t    92 ));
  /* cidr_ops */
! DATA(insert (     433    0 1 t  820 ));
  /* date_ops */
! DATA(insert (     435    0 1 t 1093 ));
  /* float4_ops */
! DATA(insert (    1971    0 1 t  620 ));
  /* float8_ops */
! DATA(insert (    1973    0 1 t  670 ));
  /* inet_ops */
! DATA(insert (    1975    0 1 t 1201 ));
  /* int2_ops */
! DATA(insert (    1977    0 1 t    94 ));
  /* int4_ops */
! DATA(insert (    1979    0 1 t    96 ));
  /* int8_ops */
! DATA(insert (    1981    0 1 t  410 ));
  /* interval_ops */
! DATA(insert (    1983    0 1 t 1330 ));
  /* macaddr_ops */
! DATA(insert (    1985    0 1 t 1220 ));
  /* name_ops */
! DATA(insert (    1987    0 1 t    93 ));
  /* oid_ops */
! DATA(insert (    1990    0 1 t  607 ));
  /* oidvector_ops */
! DATA(insert (    1992    0 1 t  649 ));
  /* text_ops */
! DATA(insert (    1995    0 1 t    98 ));
  /* time_ops */
! DATA(insert (    1997    0 1 t 1108 ));
  /* timestamptz_ops */
! DATA(insert (    1999    0 1 t 1320 ));
  /* timetz_ops */
! DATA(insert (    2001    0 1 t 1550 ));
  /* varchar_ops */
! DATA(insert (    2004    0 1 t    98 ));
  /* timestamp_ops */
! DATA(insert (    2040    0 1 t 2060 ));
  /* bool_ops */
! DATA(insert (    2222    0 1 t    91 ));
  /* bytea_ops */
! DATA(insert (    2223    0 1 t 1955 ));
  /* int2vector_ops */
! DATA(insert (    2224    0 1 t  386 ));
  /* xid_ops */
! DATA(insert (    2225    0 1 t  352 ));
  /* cid_ops */
! DATA(insert (    2226    0 1 t  385 ));
  /* abstime_ops */
! DATA(insert (    2227    0 1 t  560 ));
  /* reltime_ops */
! DATA(insert (    2228    0 1 t  566 ));
  /* text_pattern_ops */
! DATA(insert (    2229    0 1 t 2316 ));
  /* varchar_pattern_ops */
! DATA(insert (    2230    0 1 t 2316 ));
  /* bpchar_pattern_ops */
! DATA(insert (    2231    0 1 t 2328 ));
  /* name_pattern_ops */
! DATA(insert (    2232    0 1 t 2334 ));
  /* aclitem_ops */
! DATA(insert (    2235    0 1 t  974 ));

  #endif   /* PG_AMOP_H */

pgsql-patches by date:

Previous
From: Alvaro Herrera
Date:
Subject: Refactoring in lock.c
Next
From: Neil Conway
Date:
Subject: Re: CSV consecutive newline bug