
From Tom Lane
Subject Re: [HACKERS] Small improvement to compactify_tuples
Date
Msg-id 29007.1510090796@sss.pgh.pa.us
In response to Re: [HACKERS] Small improvement to compactify_tuples  (Юрий Соколов <funny.falcon@gmail.com>)
Responses Re: [HACKERS] Small improvement to compactify_tuples  (Peter Geoghegan <pg@bowt.ie>)
Re: [HACKERS] Small improvement to compactify_tuples  (Robert Haas <robertmhaas@gmail.com>)
List pgsql-hackers
I've been getting less and less excited about this patch, because I still
couldn't measure any above-the-noise performance improvement without
artificial exaggerations, and some cases seemed actually slower.

However, this morning I had an epiphany: why are we sorting at all?

There is no requirement that these functions preserve the physical
ordering of the tuples' data areas, only that the line-pointer ordering be
preserved.  Indeed, reorganizing the data areas into an ordering matching
the line pointers is probably a good thing, because it should improve
locality of access in future scans of the page.  This is trivial to
implement if we copy the data into a workspace area and back again, as
I was already proposing to do to avoid memmove.  Moreover, at that point
there's little value in a separate compactify function at all: we can
integrate the data-copying logic into the line pointer scan loops in
PageRepairFragmentation and PageIndexMultiDelete, and get rid of the
costs of constructing the intermediate itemIdSortData arrays.
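
For concreteness, the core of the no-sort approach looks roughly like the
sketch below.  This is only an illustration, not the attached patch: it
skips unused line pointers, omits the corruption checks, and doesn't force
the workspace buffer to be MAXALIGN'd the way the patch does.

#include "postgres.h"
#include "storage/bufpage.h"

/*
 * Simplified sketch: lay the tuples down in line-pointer order in a
 * scratch block, then copy the rebuilt data area back onto the page.
 */
static void
compact_without_sorting(Page page)
{
    PageHeader  phdr = (PageHeader) page;
    char        workspace[BLCKSZ];
    Offset      upper = phdr->pd_special;
    OffsetNumber i,
                maxoff = PageGetMaxOffsetNumber(page);

    for (i = FirstOffsetNumber; i <= maxoff; i++)
    {
        ItemId      lp = PageGetItemId(page, i);
        Size        alignedlen;

        if (!ItemIdIsUsed(lp) || !ItemIdHasStorage(lp))
            continue;
        alignedlen = MAXALIGN(ItemIdGetLength(lp));
        upper -= alignedlen;
        memcpy(workspace + upper, (char *) page + ItemIdGetOffset(lp),
               alignedlen);
        lp->lp_off = upper;     /* point the line pointer at its new home */
    }

    /* copy the compacted tuple data back in one shot */
    memcpy((char *) page + upper, workspace + upper,
           phdr->pd_special - upper);
    phdr->pd_upper = upper;
}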

That led me to the attached patch, which is the first version of any
of this work that produces an above-the-noise performance win for me.
I'm seeing 10-20% gains on this modified version of Yura's original
example:

psql -f test3setup.sql
pgbench -M prepared -c 3 -s 10000000 -T 300 -P 3 -n -f test3.sql

(sql scripts also attached below; I'm using 1GB shared_buffers and
fsync off, other parameters stock.)

However, there are a couple of objections that could be raised to
this patch:

1. It's trading off per-byte work, in the form of an extra memcpy,
to save sorting work that has per-tuple costs.  Therefore, the relatively
narrow tuples used in Yura's example offer a best-case scenario;
with wider tuples the performance might be worse.

2. On a platform where memmove is not as much slower than memcpy as it is
on my RHEL6 server, trading memmove for memcpy might not be such a win.

To address point 1, I tried some measurements on the standard pgbench
scenario, which uses significantly wider tuples.  In hopes of addressing
point 2, I also ran the measurements on a laptop running Fedora 25
(gcc 6.4.1, glibc 2.24); I haven't actually checked memmove vs memcpy
on that machine, but at least it's a reasonably late-model glibc.
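
If anyone wants to check that point on their own hardware, a throwaway
microbenchmark along the lines of the sketch below would give a ballpark
answer.  (Illustration only, not part of the measurements reported here;
the 8K block size mimics BLCKSZ, the iteration count is arbitrary, and
older glibc needs -lrt for clock_gettime.)

/*
 * Rough stand-alone check of memcpy vs memmove throughput on 8K blocks.
 * Compile with something like: gcc -O2 -fno-builtin memtest.c -o memtest -lrt
 * (-fno-builtin keeps gcc from replacing the calls with inline code).
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#define BUFSZ 8192
#define ITERS 2000000

static double
seconds_since(struct timespec start)
{
    struct timespec now;

    clock_gettime(CLOCK_MONOTONIC, &now);
    return (now.tv_sec - start.tv_sec) + (now.tv_nsec - start.tv_nsec) / 1e9;
}

int
main(void)
{
    char       *src = malloc(BUFSZ);
    char       *dst = malloc(BUFSZ);
    struct timespec start;
    unsigned long sink = 0;
    int         i;

    memset(src, 'x', BUFSZ);

    clock_gettime(CLOCK_MONOTONIC, &start);
    for (i = 0; i < ITERS; i++)
    {
        memcpy(dst, src, BUFSZ);
        sink += dst[i & (BUFSZ - 1)];   /* keep the copy from being elided */
    }
    printf("memcpy:  %.3f s\n", seconds_since(start));

    clock_gettime(CLOCK_MONOTONIC, &start);
    for (i = 0; i < ITERS; i++)
    {
        memmove(dst, src, BUFSZ);
        sink += dst[i & (BUFSZ - 1)];
    }
    printf("memmove: %.3f s\n", seconds_since(start));

    printf("(ignore: %lu)\n", sink);
    free(src);
    free(dst);
    return 0;
}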

What I'm getting from the standard pgbench measurements, on both machines,
is that this patch might be a couple percent slower than HEAD, but that is
barely above the noise floor so I'm not too sure about it.

So I think we should seriously consider the attached, but it'd be a
good idea to benchmark it on a wider variety of platforms and test
cases.

            regards, tom lane

drop table if exists test3;

create unlogged table test3 (
    id integer PRIMARY KEY with (fillfactor=85),
    val text
) WITH (fillfactor=85);

insert into test3 select i, '!'||i from generate_series(1, 10000000) as i;

vacuum analyze; checkpoint;

create or replace function dotest3(n int, scale float8) returns void
language plpgsql as $$
begin
for i in 1..n loop
  declare
    id1 int := random() * scale;
    id2 int := random() * scale;
  begin
    perform * from test3 where id = id1;
    update test3 set val = '!'|| id2 where id = id1;
  end;
end loop;
end $$;
select dotest3(100, :scale);
diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c
index b6aa2af..73b73de 100644
*** a/src/backend/storage/page/bufpage.c
--- b/src/backend/storage/page/bufpage.c
*************** PageRestoreTempPage(Page tempPage, Page
*** 415,471 ****
  }

  /*
-  * sorting support for PageRepairFragmentation and PageIndexMultiDelete
-  */
- typedef struct itemIdSortData
- {
-     uint16        offsetindex;    /* linp array index */
-     int16        itemoff;        /* page offset of item data */
-     uint16        alignedlen;        /* MAXALIGN(item data len) */
- } itemIdSortData;
- typedef itemIdSortData *itemIdSort;
-
- static int
- itemoffcompare(const void *itemidp1, const void *itemidp2)
- {
-     /* Sort in decreasing itemoff order */
-     return ((itemIdSort) itemidp2)->itemoff -
-         ((itemIdSort) itemidp1)->itemoff;
- }
-
- /*
-  * After removing or marking some line pointers unused, move the tuples to
-  * remove the gaps caused by the removed items.
-  */
- static void
- compactify_tuples(itemIdSort itemidbase, int nitems, Page page)
- {
-     PageHeader    phdr = (PageHeader) page;
-     Offset        upper;
-     int            i;
-
-     /* sort itemIdSortData array into decreasing itemoff order */
-     qsort((char *) itemidbase, nitems, sizeof(itemIdSortData),
-           itemoffcompare);
-
-     upper = phdr->pd_special;
-     for (i = 0; i < nitems; i++)
-     {
-         itemIdSort    itemidptr = &itemidbase[i];
-         ItemId        lp;
-
-         lp = PageGetItemId(page, itemidptr->offsetindex + 1);
-         upper -= itemidptr->alignedlen;
-         memmove((char *) page + upper,
-                 (char *) page + itemidptr->itemoff,
-                 itemidptr->alignedlen);
-         lp->lp_off = upper;
-     }
-
-     phdr->pd_upper = upper;
- }
-
- /*
   * PageRepairFragmentation
   *
   * Frees fragmented space on a page.
--- 415,420 ----
*************** PageRepairFragmentation(Page page)
*** 481,494 ****
      Offset        pd_lower = ((PageHeader) page)->pd_lower;
      Offset        pd_upper = ((PageHeader) page)->pd_upper;
      Offset        pd_special = ((PageHeader) page)->pd_special;
!     itemIdSortData itemidbase[MaxHeapTuplesPerPage];
!     itemIdSort    itemidptr;
!     ItemId        lp;
!     int            nline,
!                 nstorage,
!                 nunused;
!     int            i;
!     Size        totallen;

      /*
       * It's worth the trouble to be more paranoid here than in most places,
--- 430,444 ----
      Offset        pd_lower = ((PageHeader) page)->pd_lower;
      Offset        pd_upper = ((PageHeader) page)->pd_upper;
      Offset        pd_special = ((PageHeader) page)->pd_special;
!     int            new_pd_upper,
!                 nline,
!                 nunused,
!                 i;
!     union
!     {
!         char        page[BLCKSZ];
!         double        align;        /* force workspace to be MAXALIGN'd */
!     }            workspace;

      /*
       * It's worth the trouble to be more paranoid here than in most places,
*************** PageRepairFragmentation(Page page)
*** 508,563 ****
                          pd_lower, pd_upper, pd_special)));

      /*
!      * Run through the line pointer array and collect data about live items.
       */
      nline = PageGetMaxOffsetNumber(page);
!     itemidptr = itemidbase;
!     nunused = totallen = 0;
      for (i = FirstOffsetNumber; i <= nline; i++)
      {
!         lp = PageGetItemId(page, i);
          if (ItemIdIsUsed(lp))
          {
              if (ItemIdHasStorage(lp))
              {
!                 itemidptr->offsetindex = i - 1;
!                 itemidptr->itemoff = ItemIdGetOffset(lp);
!                 if (unlikely(itemidptr->itemoff < (int) pd_upper ||
!                              itemidptr->itemoff >= (int) pd_special))
                      ereport(ERROR,
                              (errcode(ERRCODE_DATA_CORRUPTED),
!                              errmsg("corrupted item pointer: %u",
!                                     itemidptr->itemoff)));
!                 itemidptr->alignedlen = MAXALIGN(ItemIdGetLength(lp));
!                 totallen += itemidptr->alignedlen;
!                 itemidptr++;
              }
          }
          else
          {
!             /* Unused entries should have lp_len = 0, but make sure */
!             ItemIdSetUnused(lp);
              nunused++;
          }
      }

!     nstorage = itemidptr - itemidbase;
!     if (nstorage == 0)
!     {
!         /* Page is completely empty, so just reset it quickly */
!         ((PageHeader) page)->pd_upper = pd_special;
!     }
!     else
!     {
!         /* Need to compact the page the hard way */
!         if (totallen > (Size) (pd_special - pd_lower))
!             ereport(ERROR,
!                     (errcode(ERRCODE_DATA_CORRUPTED),
!                      errmsg("corrupted item lengths: total %u, available space %u",
!                             (unsigned int) totallen, pd_special - pd_lower)));

!         compactify_tuples(itemidbase, nstorage, page);
!     }

      /* Set hint bit for PageAddItem */
      if (nunused > 0)
--- 458,526 ----
                          pd_lower, pd_upper, pd_special)));

      /*
!      * We build updated copies of the line pointer array and tuple data area
!      * in workspace.page, and then copy them back to the real page when done.
!      * This ensures that if we error out partway through, we have not changed
!      * the real page.  It also lets us use memcpy rather than memmove for the
!      * data transfers, which is faster on some machines.
!      *
!      * A useful side effect of this approach is that the tuples are re-sorted
!      * so that their physical order matches their line pointer order, which
!      * should improve locality of access in future scans of the page.
       */
      nline = PageGetMaxOffsetNumber(page);
!     new_pd_upper = pd_special;
!     nunused = 0;
      for (i = FirstOffsetNumber; i <= nline; i++)
      {
!         ItemId        lp = PageGetItemId(page, i);
!         ItemId        newlp = PageGetItemId(workspace.page, i);
!
          if (ItemIdIsUsed(lp))
          {
+             *newlp = *lp;
              if (ItemIdHasStorage(lp))
              {
!                 int            offset = ItemIdGetOffset(lp);
!                 int            alignedlen = MAXALIGN(ItemIdGetLength(lp));
!
!                 new_pd_upper -= alignedlen;
!                 newlp->lp_off = new_pd_upper;
!                 if (unlikely(offset < (int) pd_upper ||
!                              (offset + alignedlen) > (int) pd_special ||
!                              offset != MAXALIGN(offset) ||
!                              new_pd_upper < (int) pd_lower))
                      ereport(ERROR,
                              (errcode(ERRCODE_DATA_CORRUPTED),
!                              errmsg("corrupted item pointer: offset = %u, length = %u",
!                                     offset, alignedlen)));
!                 memcpy(workspace.page + new_pd_upper,
!                        (char *) page + offset,
!                        alignedlen);
              }
          }
          else
          {
!             /* We can just zero out all the fields in *newlp */
!             ItemIdSetUnused(newlp);
              nunused++;
          }
      }

!     /*
!      * Okay, copy lower and upper workspace areas back to the real page.
!      */
!     if (pd_lower > SizeOfPageHeaderData)
!         memcpy((char *) page + SizeOfPageHeaderData,
!                workspace.page + SizeOfPageHeaderData,
!                pd_lower - SizeOfPageHeaderData);
!     if (new_pd_upper < pd_special)
!         memcpy((char *) page + new_pd_upper,
!                workspace.page + new_pd_upper,
!                pd_special - new_pd_upper);

!     /* Page's pd_lower doesn't change, but pd_upper does */
!     ((PageHeader) page)->pd_upper = new_pd_upper;

      /* Set hint bit for PageAddItem */
      if (nunused > 0)
*************** PageIndexTupleDelete(Page page, OffsetNu
*** 831,853 ****
  void
  PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems)
  {
!     PageHeader    phdr = (PageHeader) page;
!     Offset        pd_lower = phdr->pd_lower;
!     Offset        pd_upper = phdr->pd_upper;
!     Offset        pd_special = phdr->pd_special;
!     itemIdSortData itemidbase[MaxIndexTuplesPerPage];
!     ItemIdData    newitemids[MaxIndexTuplesPerPage];
!     itemIdSort    itemidptr;
!     ItemId        lp;
!     int            nline,
!                 nused;
!     Size        totallen;
!     Size        size;
!     unsigned    offset;
!     int            nextitm;
      OffsetNumber offnum;
!
!     Assert(nitems <= MaxIndexTuplesPerPage);

      /*
       * If there aren't very many items to delete, then retail
--- 794,813 ----
  void
  PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems)
  {
!     Offset        pd_lower = ((PageHeader) page)->pd_lower;
!     Offset        pd_upper = ((PageHeader) page)->pd_upper;
!     Offset        pd_special = ((PageHeader) page)->pd_special;
!     int            new_pd_upper,
!                 nline,
!                 nextnew,
!                 nextitm,
!                 lpalen;
      OffsetNumber offnum;
!     union
!     {
!         char        page[BLCKSZ];
!         double        align;        /* force workspace to be MAXALIGN'd */
!     }            workspace;

      /*
       * If there aren't very many items to delete, then retail
*************** PageIndexMultiDelete(Page page, OffsetNu
*** 878,906 ****
                          pd_lower, pd_upper, pd_special)));

      /*
!      * Scan the item pointer array and build a list of just the ones we are
!      * going to keep.  Notice we do not modify the page yet, since we are
!      * still validity-checking.
       */
      nline = PageGetMaxOffsetNumber(page);
!     itemidptr = itemidbase;
!     totallen = 0;
!     nused = 0;
      nextitm = 0;
      for (offnum = FirstOffsetNumber; offnum <= nline; offnum = OffsetNumberNext(offnum))
      {
-         lp = PageGetItemId(page, offnum);
-         Assert(ItemIdHasStorage(lp));
-         size = ItemIdGetLength(lp);
-         offset = ItemIdGetOffset(lp);
-         if (offset < pd_upper ||
-             (offset + size) > pd_special ||
-             offset != MAXALIGN(offset))
-             ereport(ERROR,
-                     (errcode(ERRCODE_DATA_CORRUPTED),
-                      errmsg("corrupted item pointer: offset = %u, length = %u",
-                             offset, (unsigned int) size)));
-
          if (nextitm < nitems && offnum == itemnos[nextitm])
          {
              /* skip item to be deleted */
--- 838,853 ----
                          pd_lower, pd_upper, pd_special)));

      /*
!      * As in PageRepairFragmentation, we build new copies of the line pointer
!      * array and tuple data area in workspace.page, then transfer them back to
!      * the real page.
       */
      nline = PageGetMaxOffsetNumber(page);
!     new_pd_upper = pd_special;
!     nextnew = FirstOffsetNumber;
      nextitm = 0;
      for (offnum = FirstOffsetNumber; offnum <= nline; offnum = OffsetNumberNext(offnum))
      {
          if (nextitm < nitems && offnum == itemnos[nextitm])
          {
              /* skip item to be deleted */
*************** PageIndexMultiDelete(Page page, OffsetNu
*** 908,920 ****
          }
          else
          {
!             itemidptr->offsetindex = nused; /* where it will go */
!             itemidptr->itemoff = offset;
!             itemidptr->alignedlen = MAXALIGN(size);
!             totallen += itemidptr->alignedlen;
!             newitemids[nused] = *lp;
!             itemidptr++;
!             nused++;
          }
      }

--- 855,884 ----
          }
          else
          {
!             ItemId        lp = PageGetItemId(page, offnum);
!             ItemId        newlp;
!             int            offset;
!             int            alignedlen;
!
!             Assert(ItemIdHasStorage(lp));
!             offset = ItemIdGetOffset(lp);
!             alignedlen = MAXALIGN(ItemIdGetLength(lp));
!             new_pd_upper -= alignedlen;
!             if (unlikely(offset < (int) pd_upper ||
!                          (offset + alignedlen) > (int) pd_special ||
!                          offset != MAXALIGN(offset) ||
!                          new_pd_upper < (int) pd_lower))
!                 ereport(ERROR,
!                         (errcode(ERRCODE_DATA_CORRUPTED),
!                          errmsg("corrupted item pointer: offset = %u, length = %u",
!                                 offset, alignedlen)));
!             memcpy(workspace.page + new_pd_upper,
!                    (char *) page + offset,
!                    alignedlen);
!             newlp = PageGetItemId(workspace.page, nextnew);
!             *newlp = *lp;
!             newlp->lp_off = new_pd_upper;
!             nextnew++;
          }
      }

*************** PageIndexMultiDelete(Page page, OffsetNu
*** 922,942 ****
      if (nextitm != nitems)
          elog(ERROR, "incorrect index offsets supplied");

-     if (totallen > (Size) (pd_special - pd_lower))
-         ereport(ERROR,
-                 (errcode(ERRCODE_DATA_CORRUPTED),
-                  errmsg("corrupted item lengths: total %u, available space %u",
-                         (unsigned int) totallen, pd_special - pd_lower)));
-
      /*
!      * Looks good. Overwrite the line pointers with the copy, from which we've
!      * removed all the unused items.
       */
!     memcpy(phdr->pd_linp, newitemids, nused * sizeof(ItemIdData));
!     phdr->pd_lower = SizeOfPageHeaderData + nused * sizeof(ItemIdData);

!     /* and compactify the tuple data */
!     compactify_tuples(itemidbase, nused, page);
  }


--- 886,906 ----
      if (nextitm != nitems)
          elog(ERROR, "incorrect index offsets supplied");

      /*
!      * Okay, copy lower and upper workspace areas back to the real page.
       */
!     lpalen = (nextnew - FirstOffsetNumber) * sizeof(ItemIdData);
!     if (lpalen > 0)
!         memcpy((char *) page + SizeOfPageHeaderData,
!                workspace.page + SizeOfPageHeaderData,
!                lpalen);
!     if (new_pd_upper < pd_special)
!         memcpy((char *) page + new_pd_upper,
!                workspace.page + new_pd_upper,
!                pd_special - new_pd_upper);

!     ((PageHeader) page)->pd_lower = SizeOfPageHeaderData + lpalen;
!     ((PageHeader) page)->pd_upper = new_pd_upper;
  }


