Thread: MemSet inline for newNode

MemSet inline for newNode

From
Bruce Momjian
Date:
I have applied this patch to inline MemSet for newNode.

I will make another commit to make more general use of palloc0.

This was already discussed on hackers.

--
  Bruce Momjian                        |  http://candle.pha.pa.us
  pgman@candle.pha.pa.us               |  (610) 359-1001
  +  If your life is a hard drive,     |  13 Roberts Road
  +  Christ can be your backup.        |  Newtown Square, Pennsylvania 19073
Index: src/backend/nodes/nodes.c
===================================================================
RCS file: /cvsroot/pgsql-server/src/backend/nodes/nodes.c,v
retrieving revision 1.15
diff -c -c -r1.15 nodes.c
*** src/backend/nodes/nodes.c    20 Jun 2002 20:29:29 -0000    1.15
--- src/backend/nodes/nodes.c    9 Oct 2002 05:17:13 -0000
***************
*** 28,42 ****
   *      macro makeNode. eg. to create a Resdom node, use makeNode(Resdom)
   *
   */
! Node *
! newNode(Size size, NodeTag tag)
! {
!     Node       *newNode;

-     Assert(size >= sizeof(Node));        /* need the tag, at least */
-
-     newNode = (Node *) palloc(size);
-     MemSet((char *) newNode, 0, size);
-     newNode->type = tag;
-     return newNode;
- }
--- 28,32 ----
   *      macro makeNode. eg. to create a Resdom node, use makeNode(Resdom)
   *
   */
! Node *newNodeMacroHolder;

Index: src/backend/utils/mmgr/mcxt.c
===================================================================
RCS file: /cvsroot/pgsql-server/src/backend/utils/mmgr/mcxt.c,v
retrieving revision 1.32
diff -c -c -r1.32 mcxt.c
*** src/backend/utils/mmgr/mcxt.c    12 Aug 2002 00:36:12 -0000    1.32
--- src/backend/utils/mmgr/mcxt.c    9 Oct 2002 05:17:17 -0000
***************
*** 453,458 ****
--- 453,481 ----
  }

  /*
+  * MemoryContextAllocZero
+  *        Like MemoryContextAlloc, but clears allocated memory
+  *
+  *    We could just call MemoryContextAlloc then clear the memory, but this
+  *    function is called too many times, so we have a separate version.
+  */
+ void *
+ MemoryContextAllocZero(MemoryContext context, Size size)
+ {
+     void *ret;
+
+     AssertArg(MemoryContextIsValid(context));
+
+     if (!AllocSizeIsValid(size))
+         elog(ERROR, "MemoryContextAllocZero: invalid request size %lu",
+              (unsigned long) size);
+
+     ret = (*context->methods->alloc) (context, size);
+     MemSet(ret, 0, size);
+     return ret;
+ }
+
+ /*
   * pfree
   *        Release an allocated chunk.
   */
Index: src/include/nodes/nodes.h
===================================================================
RCS file: /cvsroot/pgsql-server/src/include/nodes/nodes.h,v
retrieving revision 1.118
diff -c -c -r1.118 nodes.h
*** src/include/nodes/nodes.h    31 Aug 2002 22:10:47 -0000    1.118
--- src/include/nodes/nodes.h    9 Oct 2002 05:17:20 -0000
***************
*** 261,266 ****
--- 261,284 ----

  #define nodeTag(nodeptr)        (((Node*)(nodeptr))->type)

+ /*
+  *    There is no way to dereference the palloc'ed pointer to assign the
+  *    tag, and return the pointer itself, so we need a holder variable.
+  *    Fortunately, this function isn't recursive so we just define
+  *    a global variable for this purpose.
+  */
+ extern Node *newNodeMacroHolder;
+
+ #define newNode(size, tag) \
+ ( \
+     AssertMacro((size) >= sizeof(Node)),        /* need the tag, at least */ \
+ \
+     newNodeMacroHolder = (Node *) palloc0(size), \
+     newNodeMacroHolder->type = (tag), \
+     newNodeMacroHolder \
+ )
+
+
  #define makeNode(_type_)        ((_type_ *) newNode(sizeof(_type_),T_##_type_))
  #define NodeSetTag(nodeptr,t)    (((Node*)(nodeptr))->type = (t))

***************
*** 281,291 ****
   *                      extern declarations follow
   * ----------------------------------------------------------------
   */
-
- /*
-  * nodes/nodes.c
-  */
- extern Node *newNode(Size size, NodeTag tag);

  /*
   * nodes/{outfuncs.c,print.c}
--- 299,304 ----
Index: src/include/utils/palloc.h
===================================================================
RCS file: /cvsroot/pgsql-server/src/include/utils/palloc.h,v
retrieving revision 1.19
diff -c -c -r1.19 palloc.h
*** src/include/utils/palloc.h    20 Jun 2002 20:29:53 -0000    1.19
--- src/include/utils/palloc.h    9 Oct 2002 05:17:21 -0000
***************
*** 46,53 ****
--- 46,56 ----
   * Fundamental memory-allocation operations (more are in utils/memutils.h)
   */
  extern void *MemoryContextAlloc(MemoryContext context, Size size);
+ extern void *MemoryContextAllocZero(MemoryContext context, Size size);

  #define palloc(sz)    MemoryContextAlloc(CurrentMemoryContext, (sz))
+
+ #define palloc0(sz)    MemoryContextAllocZero(CurrentMemoryContext, (sz))

  extern void pfree(void *pointer);


Re: MemSet inline for newNode

From
Tom Lane
Date:
Bruce Momjian <pgman@candle.pha.pa.us> writes:
> I have applied this patch to inline MemSet for newNode.
> I will make another commit to make more general use of palloc0.

Refresh my memory on why either of these was a good idea?

> This was already discussed on hackers.

And this was not the approach agreed to, IIRC.  What you've done has
eliminated the possibility of optimizing away the controlling tests
in MemSet.

            regards, tom lane

Re: MemSet inline for newNode

From
Bruce Momjian
Date:
Tom Lane wrote:
> Bruce Momjian <pgman@candle.pha.pa.us> writes:
> > I have applied this patch to inline MemSet for newNode.
> > I will make another commit to make more general use of palloc0.
>
> Refresh my memory on why either of these was a good idea?

You are talking about the use of palloc0 in place of palloc/MemSet(0),
not the use of palloc0 in newNode, right?

> > This was already discussed on hackers.
>
> And this was not the approach agreed to, IIRC.  What you've done has
> eliminated the possibility of optimizing away the controlling tests
> in MemSet.

I see now:

    http://groups.google.com/groups?hl=en&lr=&ie=UTF-8&selm=200210120006.g9C06i502964%40candle.pha.pa.us

I thought someone had tested that there was little/no performance
difference between these two statements:

    MemSet(ptr, 0, 256)

vs.

    i = 256;
    MemSet(ptr, 0, i)

However, seeing no email reply, I now assume no test was done.

Neil, can you run such a test and let us know.  It has to be with a
compiler that optimizes out the MemSet length test when the len is a
constant.  I can't remember what compiler version/settings that was, but
it may have been -O3 gcc 3.20.

I can back out my changes, but it would be easier to see if there is a
benefit before doing that.  Personally, I am going to be surprised if a
single conditional tests in MemSet (which is not in the assignment loop)
will make any measurable difference.

--
  Bruce Momjian                        |  http://candle.pha.pa.us
  pgman@candle.pha.pa.us               |  (610) 359-1001
  +  If your life is a hard drive,     |  13 Roberts Road
  +  Christ can be your backup.        |  Newtown Square, Pennsylvania 19073

Re: MemSet inline for newNode

From
Tom Lane
Date:
Bruce Momjian <pgman@candle.pha.pa.us> writes:
> I thought someone had tested that there was little/no performance
> difference between these two statements:
>     MemSet(ptr, 0, 256)
> vs.
>     i = 256;
>     MemSet(ptr, 0, i)

But with any reasonably smart compiler, those *will* be the same because
the compiler can do constant-folding (ie, it knows i is 256 when control
reaches the MemSet).  Pushing the MemSet into MemoryContextAlloc
eliminates the possibility of knowing the size argument's value.

> I can back out my changes, but it would be easier to see if there is a
> benefit before doing that.  Personally, I am going to be surprised if a
> single conditional tests in MemSet (which is not in the assignment loop)
> will make any measurable difference.

Then why were we bothering?  IIRC, this was being sold as a performance
improvement.

            regards, tom lane

Re: MemSet inline for newNode

From
Bruce Momjian
Date:
Tom Lane wrote:
> Bruce Momjian <pgman@candle.pha.pa.us> writes:
> > I thought someone had tested that there was little/no performance
> > difference between these two statements:
> >     MemSet(ptr, 0, 256)
> > vs.
> >     i = 256;
> >     MemSet(ptr, 0, i)
>
> But with any reasonably smart compiler, those *will* be the same because
> the compiler can do constant-folding (ie, it knows i is 256 when control
> reaches the MemSet).  Pushing the MemSet into MemoryContextAlloc
> eliminates the possibility of knowing the size argument's value.

Yes, however, I am not seeing that getting optimized with gcc 2.95 -O2.

> > I can back out my changes, but it would be easier to see if there is a
> > benefit before doing that.  Personally, I am going to be surprised if a
> > single conditional tests in MemSet (which is not in the assignment loop)
> > will make any measurable difference.
>
> Then why were we bothering?  IIRC, this was being sold as a performance
> improvement.

Well, the palloc0 use by newNode was a performance boost, as tested by
Neil Conway.  Without palloc0, there was no way to inline newNode.  He
found in his tests that the palloc0 version of newNode was as fast as
his inline newNode, and it didn't cause the same code bloat.

The palloc0's used elsewhere in the code was merely for code clarity
and to reduce code bloat caused by MemSet in a few cases.  I will back
it out while I do some more tests.

One thing I am concerned about is that newNode _isn't_ making use of a
constant len for MemSet, but doing it inline was causing too much bloat.

I made a new version that did the alignment test of start and len in one
pass:

                if ((( ((long) _start) | _len) & INT_ALIGN_MASK) == 0 && \
                        _val == 0 && \
                        _len <= MEMSET_LOOP_LIMIT) \

I have found that the last test is the one that slows it down.

--
  Bruce Momjian                        |  http://candle.pha.pa.us
  pgman@candle.pha.pa.us               |  (610) 359-1001
  +  If your life is a hard drive,     |  13 Roberts Road
  +  Christ can be your backup.        |  Newtown Square, Pennsylvania 19073

Re: MemSet inline for newNode

From
Tom Lane
Date:
Bruce Momjian <pgman@candle.pha.pa.us> writes:
> Tom Lane wrote:
>> But with any reasonably smart compiler, those *will* be the same because
>> the compiler can do constant-folding (ie, it knows i is 256 when control
>> reaches the MemSet).  Pushing the MemSet into MemoryContextAlloc
>> eliminates the possibility of knowing the size argument's value.

> Yes, however, I am not seeing that getting optimized with gcc 2.95 -O2.

You aren't?  I just checked gcc 2.95.3 on HPPA; it definitely propagates
the constant at -O2.

> Well, the palloc0 use by newNode was a performance boost, as tested by
> Neil Conway.  Without palloc0, there was no way to inline newNode.

But your patch as committed does NOT inline newNode, in fact it does the
exact opposite: the MemSet macro is removed from the callsite.

> I made a new version that did the alignment test of start and len in one
> pass:

I thought the point of this exercise was to eliminate the alignment test
altogether, by making it doable at compile time.  IIRC, the upshot of
the prior discussion was to keep the MemSet calls at the call site and
to eliminate the alignment test on the pointer by special-casing palloc0
(essentially, wiring into the macro the assumption that palloc's
returned pointer will be suitably aligned).

            regards, tom lane

Re: MemSet inline for newNode

From
Bruce Momjian
Date:
Tom Lane wrote:
> Bruce Momjian <pgman@candle.pha.pa.us> writes:
> > Tom Lane wrote:
> >> But with any reasonably smart compiler, those *will* be the same because
> >> the compiler can do constant-folding (ie, it knows i is 256 when control
> >> reaches the MemSet).  Pushing the MemSet into MemoryContextAlloc
> >> eliminates the possibility of knowing the size argument's value.
>
> > Yes, however, I am not seeing that getting optimized with gcc 2.95 -O2.
>
> You aren't?  I just checked gcc 2.95.3 on HPPA; it definitely propagates
> the constant at -O2.

I am definately seeing different timings using the constant and the
variable in the test.

>
> > Well, the palloc0 use by newNode was a performance boost, as tested by
> > Neil Conway.  Without palloc0, there was no way to inline newNode.
>
> But your patch as committed does NOT inline newNode, in fact it does the
> exact opposite: the MemSet macro is removed from the callsite.

Yes, there were actually two patches;  the first inlined newNode by
calling palloc0.  Without palloc, there was no way to inline newNode.

The second was a more general one to merge palloc/MemSet into a single
palloc0 call.  That has been backed out while I research it.

>
> > I made a new version that did the alignment test of start and len in one
> > pass:
>
> I thought the point of this exercise was to eliminate the alignment test
> altogether, by making it doable at compile time.  IIRC, the upshot of
> the prior discussion was to keep the MemSet calls at the call site and
> to eliminate the alignment test on the pointer by special-casing palloc0
> (essentially, wiring into the macro the assumption that palloc's
> returned pointer will be suitably aligned).

I didn't assume we could special case palloc0 by knowing it was going to
be properly aligned and be of the proper length (multiple if int).

I have a new idea.  Right now we have:

    #define palloc0(sz) MemoryContextAllocZero(CurrentMemoryContext, (sz))

My new idea is to add a third boolean argument to
MemoryContextAllocZero() which will control whether the MemSet
assignment loop is used, or memset().  My idea is to use "? :" to test
for the proper value in the macro.  Basically, this will decouple the
MemSet test and the MemSet loop, allowing the optimizer to process the
length at the call site, but allow palloc0 to perform the assignment
loop.  I need palloc0 because MemSet can't return a pointer.

--
  Bruce Momjian                        |  http://candle.pha.pa.us
  pgman@candle.pha.pa.us               |  (610) 359-1001
  +  If your life is a hard drive,     |  13 Roberts Road
  +  Christ can be your backup.        |  Newtown Square, Pennsylvania 19073

Re: MemSet inline for newNode

From
Tom Lane
Date:
Bruce Momjian <pgman@candle.pha.pa.us> writes:
>> But your patch as committed does NOT inline newNode, in fact it does the
>> exact opposite: the MemSet macro is removed from the callsite.

> Yes, there were actually two patches;  the first inlined newNode by
> calling palloc0.  Without palloc, there was no way to inline newNode.

But you have *not* inlined newNode: the memset is still done at a location
that cannot know the size at compile time.

> The second was a more general one to merge palloc/MemSet into a single
> palloc0 call.  That has been backed out while I research it.

That part could be sold just on the basis of making the code easier
to read and less error-prone; particularly for call sites where the
length is a runtime computation anyway.  I think that newNode() is the
principal case where the length is knowable at compile time.  So my
feeling is that the general change to invent a palloc0() call is fine
... but if you want performance then newNode() should *not* be using the
generic palloc0() call.

> My new idea is to add a third boolean argument to
> MemoryContextAllocZero() which will control whether the MemSet
> assignment loop is used, or memset().

But we are *trying to eliminate any runtime test whatever*.  Short
of that, you aren't going to get the speedup.  Certainly passing a third
argument to the alloc subroutine will eat enough cycles to negate any
savings from simplifying the runtime test slightly.

            regards, tom lane

Re: MemSet inline for newNode

From
Bruce Momjian
Date:
Tom Lane wrote:
> Bruce Momjian <pgman@candle.pha.pa.us> writes:
> >> But your patch as committed does NOT inline newNode, in fact it does the
> >> exact opposite: the MemSet macro is removed from the callsite.
>
> > Yes, there were actually two patches;  the first inlined newNode by
> > calling palloc0.  Without palloc, there was no way to inline newNode.
>
> But you have *not* inlined newNode: the memset is still done at a location
> that cannot know the size at compile time.

I am sorry.  I inlined newNode, but by pushing MemSet into palloc0, the
MemSet conditional tests can not be optimized away.  Is that clearer?

    #define newNode(size, tag) \
    ( \
        AssertMacro((size) >= sizeof(Node)),        /* need the tag, at least */ \
    \
        newNodeMacroHolder = (Node *) palloc0(size), \
        newNodeMacroHolder->type = (tag), \
        newNodeMacroHolder \
    )

In this case, I needed palloc0 to make newNode inlined, and the old
function version of newNode couldn't optimize away the conditional test
either because it was called with various sizes.


> > The second was a more general one to merge palloc/MemSet into a single
> > palloc0 call.  That has been backed out while I research it.
>
> That part could be sold just on the basis of making the code easier
> to read and less error-prone; particularly for call sites where the
> length is a runtime computation anyway.  I think that newNode() is the

Yes, that was my thought.

> principal case where the length is knowable at compile time.  So my
> feeling is that the general change to invent a palloc0() call is fine
> ... but if you want performance then newNode() should *not* be using the
> generic palloc0() call.

OK. I am still struggling about how to do that.

> > My new idea is to add a third boolean argument to
> > MemoryContextAllocZero() which will control whether the MemSet
> > assignment loop is used, or memset().
>
> But we are *trying to eliminate any runtime test whatever*.  Short
> of that, you aren't going to get the speedup.  Certainly passing a third
> argument to the alloc subroutine will eat enough cycles to negate any
> savings from simplifying the runtime test slightly.

My idea is that the conditional tests will be optimized away before the
palloc0 call is made.

Attached is a patch that implements the MemSet split and allows the size
tests to be optimized away _before_ at the call location.  It isn't done
yet, but you can get the idea.

--
  Bruce Momjian                        |  http://candle.pha.pa.us
  pgman@candle.pha.pa.us               |  (610) 359-1001
  +  If your life is a hard drive,     |  13 Roberts Road
  +  Christ can be your backup.        |  Newtown Square, Pennsylvania 19073
Index: src/backend/utils/mmgr/mcxt.c
===================================================================
RCS file: /cvsroot/pgsql-server/src/backend/utils/mmgr/mcxt.c,v
retrieving revision 1.35
diff -c -c -r1.35 mcxt.c
*** src/backend/utils/mmgr/mcxt.c    10 Nov 2002 02:17:25 -0000    1.35
--- src/backend/utils/mmgr/mcxt.c    11 Nov 2002 04:48:27 -0000
***************
*** 460,466 ****
   *    function is called too many times, so we have a separate version.
   */
  void *
! MemoryContextAllocZero(MemoryContext context, Size size)
  {
      void *ret;

--- 460,466 ----
   *    function is called too many times, so we have a separate version.
   */
  void *
! MemoryContextAllocZero(MemoryContext context, Size size, bool use_libc)
  {
      void *ret;

***************
*** 471,477 ****
               (unsigned long) size);

      ret = (*context->methods->alloc) (context, size);
!     MemSet(ret, 0, size);
      return ret;
  }

--- 471,477 ----
               (unsigned long) size);

      ret = (*context->methods->alloc) (context, size);
!     MemSetLoop(ret, 0, size, use_libc);
      return ret;
  }

Index: src/include/c.h
===================================================================
RCS file: /cvsroot/pgsql-server/src/include/c.h,v
retrieving revision 1.131
diff -c -c -r1.131 c.h
*** src/include/c.h    11 Nov 2002 03:02:19 -0000    1.131
--- src/include/c.h    11 Nov 2002 04:48:33 -0000
***************
*** 579,601 ****
   *
   *    bjm 2002-10-08
   */
! #define MemSet(start, val, len) \
      do \
      { \
          int32 * _start = (int32 *) (start); \
          int        _val = (val); \
          Size    _len = (len); \
  \
!         if ((( ((long) _start) | _len) & INT_ALIGN_MASK) == 0 && \
!             _val == 0 && \
!             _len <= MEMSET_LOOP_LIMIT) \
          { \
              int32 * _stop = (int32 *) ((char *) _start + _len); \
              while (_start < _stop) \
                  *_start++ = 0; \
          } \
          else \
!             memset((char *) _start, _val, _len); \
      } while (0)

  #define MEMSET_LOOP_LIMIT  1024
--- 579,610 ----
   *
   *    bjm 2002-10-08
   */
! #define MemSetTest(val, len) \
!         ( ((len) & INT_ALIGN_MASK) == 0 && \
!         (len) <= MEMSET_LOOP_LIMIT && \
!         (val) == 0 )
!
! #define MemSetLoop(start, val, len, use_libc) \
      do \
      { \
          int32 * _start = (int32 *) (start); \
          int        _val = (val); \
          Size    _len = (len); \
  \
!         if (!use_libc && ((long) _start & INT_ALIGN_MASK) == 0 ) \
          { \
              int32 * _stop = (int32 *) ((char *) _start + _len); \
              while (_start < _stop) \
                  *_start++ = 0; \
          } \
          else \
!             memset((char *) _start,  _val, _len); \
!     } while (0)
!
! #define MemSet(start, val, len) \
!     do \
!     { \
!         MemSetLoop(start, val, len, MemSetTest(val, len)); \
      } while (0)

  #define MEMSET_LOOP_LIMIT  1024
Index: src/include/utils/palloc.h
===================================================================
RCS file: /cvsroot/pgsql-server/src/include/utils/palloc.h,v
retrieving revision 1.22
diff -c -c -r1.22 palloc.h
*** src/include/utils/palloc.h    10 Nov 2002 02:17:25 -0000    1.22
--- src/include/utils/palloc.h    11 Nov 2002 04:48:35 -0000
***************
*** 46,56 ****
   * Fundamental memory-allocation operations (more are in utils/memutils.h)
   */
  extern void *MemoryContextAlloc(MemoryContext context, Size size);
! extern void *MemoryContextAllocZero(MemoryContext context, Size size);

  #define palloc(sz)    MemoryContextAlloc(CurrentMemoryContext, (sz))

! #define palloc0(sz)    MemoryContextAllocZero(CurrentMemoryContext, (sz))

  extern void pfree(void *pointer);

--- 46,57 ----
   * Fundamental memory-allocation operations (more are in utils/memutils.h)
   */
  extern void *MemoryContextAlloc(MemoryContext context, Size size);
! extern void *MemoryContextAllocZero(MemoryContext context, Size size, bool use_libc);

  #define palloc(sz)    MemoryContextAlloc(CurrentMemoryContext, (sz))

! #define palloc0(sz)    MemoryContextAllocZero(CurrentMemoryContext, (sz), \
!                     MemSetTest(0, (sz)))

  extern void pfree(void *pointer);


Re: MemSet inline for newNode

From
Bruce Momjian
Date:
OK, here is a newer version of the macro.  I tested it on
catalog/index.c and saw the optimizer remove the MemSetTest conditional
for the newNode/palloc0 call in that file:

        pushl $1    <---------
        pushl $108
        movl CurrentMemoryContext,%eax
        pushl %eax
        call MemoryContextAllocZero

I still test the palloc pointer alignment in MemSetLoop because that is
not a constant.

I just ran a test and now see a 7% difference for the following test:

    j = 32;
    for (i = 0; i < 1000000; i++)
        MemSet(buf, 0, j /* choose 32 or j here */);
    return 0;

It used to be a 14% difference.  However, this looks more like an
optimizer problem than actual coding and with the new macros, MemSet
will be called with a constant third parameter anyway in almost all
cases, including newNode and the merge of palloc/MemSet into palloc0
patch.

---------------------------------------------------------------------------

Bruce Momjian wrote:
> Tom Lane wrote:
> > Bruce Momjian <pgman@candle.pha.pa.us> writes:
> > >> But your patch as committed does NOT inline newNode, in fact it does the
> > >> exact opposite: the MemSet macro is removed from the callsite.
> >
> > > Yes, there were actually two patches;  the first inlined newNode by
> > > calling palloc0.  Without palloc, there was no way to inline newNode.
> >
> > But you have *not* inlined newNode: the memset is still done at a location
> > that cannot know the size at compile time.
>
> I am sorry.  I inlined newNode, but by pushing MemSet into palloc0, the
> MemSet conditional tests can not be optimized away.  Is that clearer?
>
>     #define newNode(size, tag) \
>     ( \
>         AssertMacro((size) >= sizeof(Node)),        /* need the tag, at least */ \
>     \
>         newNodeMacroHolder = (Node *) palloc0(size), \
>         newNodeMacroHolder->type = (tag), \
>         newNodeMacroHolder \
>     )
>
> In this case, I needed palloc0 to make newNode inlined, and the old
> function version of newNode couldn't optimize away the conditional test
> either because it was called with various sizes.
>
>
> > > The second was a more general one to merge palloc/MemSet into a single
> > > palloc0 call.  That has been backed out while I research it.
> >
> > That part could be sold just on the basis of making the code easier
> > to read and less error-prone; particularly for call sites where the
> > length is a runtime computation anyway.  I think that newNode() is the
>
> Yes, that was my thought.
>
> > principal case where the length is knowable at compile time.  So my
> > feeling is that the general change to invent a palloc0() call is fine
> > ... but if you want performance then newNode() should *not* be using the
> > generic palloc0() call.
>
> OK. I am still struggling about how to do that.
>
> > > My new idea is to add a third boolean argument to
> > > MemoryContextAllocZero() which will control whether the MemSet
> > > assignment loop is used, or memset().
> >
> > But we are *trying to eliminate any runtime test whatever*.  Short
> > of that, you aren't going to get the speedup.  Certainly passing a third
> > argument to the alloc subroutine will eat enough cycles to negate any
> > savings from simplifying the runtime test slightly.
>
> My idea is that the conditional tests will be optimized away before the
> palloc0 call is made.
>
> Attached is a patch that implements the MemSet split and allows the size
> tests to be optimized away _before_ at the call location.  It isn't done
> yet, but you can get the idea.

--
  Bruce Momjian                        |  http://candle.pha.pa.us
  pgman@candle.pha.pa.us               |  (610) 359-1001
  +  If your life is a hard drive,     |  13 Roberts Road
  +  Christ can be your backup.        |  Newtown Square, Pennsylvania 19073
Index: src/backend/utils/mmgr/mcxt.c
===================================================================
RCS file: /cvsroot/pgsql-server/src/backend/utils/mmgr/mcxt.c,v
retrieving revision 1.35
diff -c -c -r1.35 mcxt.c
*** src/backend/utils/mmgr/mcxt.c    10 Nov 2002 02:17:25 -0000    1.35
--- src/backend/utils/mmgr/mcxt.c    11 Nov 2002 15:56:09 -0000
***************
*** 460,466 ****
   *    function is called too many times, so we have a separate version.
   */
  void *
! MemoryContextAllocZero(MemoryContext context, Size size)
  {
      void *ret;

--- 460,466 ----
   *    function is called too many times, so we have a separate version.
   */
  void *
! MemoryContextAllocZero(MemoryContext context, Size size, bool use_loop)
  {
      void *ret;

***************
*** 471,477 ****
               (unsigned long) size);

      ret = (*context->methods->alloc) (context, size);
!     MemSet(ret, 0, size);
      return ret;
  }

--- 471,477 ----
               (unsigned long) size);

      ret = (*context->methods->alloc) (context, size);
!     MemSetLoop(ret, 0, size, use_loop);
      return ret;
  }

Index: src/include/c.h
===================================================================
RCS file: /cvsroot/pgsql-server/src/include/c.h,v
retrieving revision 1.131
diff -c -c -r1.131 c.h
*** src/include/c.h    11 Nov 2002 03:02:19 -0000    1.131
--- src/include/c.h    11 Nov 2002 15:56:12 -0000
***************
*** 577,601 ****
   *    memset() functions.  More research needs to be done, perhaps with
   *    platform-specific MEMSET_LOOP_LIMIT values or tests in configure.
   *
   *    bjm 2002-10-08
   */
! #define MemSet(start, val, len) \
      do \
      { \
!         int32 * _start = (int32 *) (start); \
!         int        _val = (val); \
!         Size    _len = (len); \
! \
!         if ((( ((long) _start) | _len) & INT_ALIGN_MASK) == 0 && \
!             _val == 0 && \
!             _len <= MEMSET_LOOP_LIMIT) \
          { \
!             int32 * _stop = (int32 *) ((char *) _start + _len); \
              while (_start < _stop) \
                  *_start++ = 0; \
          } \
          else \
!             memset((char *) _start, _val, _len); \
      } while (0)

  #define MEMSET_LOOP_LIMIT  1024
--- 577,613 ----
   *    memset() functions.  More research needs to be done, perhaps with
   *    platform-specific MEMSET_LOOP_LIMIT values or tests in configure.
   *
+  *    MemSet has been split into two parts so MemSetTest can be optimized
+  *    away for constant 'val' and 'len'.  This is used by palloc0().
+  *
+  *    Note, arguments are evaluated more than once.
+  *
   *    bjm 2002-10-08
   */
! #define MemSetTest(val, len) \
!         ( ((len) & INT_ALIGN_MASK) == 0 && \
!         (len) <= MEMSET_LOOP_LIMIT && \
!         (val) == 0 )
!
! #define MemSetLoop(start, val, len, use_loop) \
      do \
      { \
!         if (use_loop && ((long)(start) & INT_ALIGN_MASK) == 0 ) \
          { \
!             int32 * _start = (int32 *) (start); \
!             int32 * _stop = (int32 *) ((char *) _start + (len)); \
! \
              while (_start < _stop) \
                  *_start++ = 0; \
          } \
          else \
!             memset((char *)(start), (val), (len)); \
!     } while (0)
!
! #define MemSet(start, val, len) \
!     do \
!     { \
!         MemSetLoop(start, val, len, MemSetTest(val, len)); \
      } while (0)

  #define MEMSET_LOOP_LIMIT  1024
Index: src/include/utils/palloc.h
===================================================================
RCS file: /cvsroot/pgsql-server/src/include/utils/palloc.h,v
retrieving revision 1.22
diff -c -c -r1.22 palloc.h
*** src/include/utils/palloc.h    10 Nov 2002 02:17:25 -0000    1.22
--- src/include/utils/palloc.h    11 Nov 2002 15:56:14 -0000
***************
*** 46,56 ****
   * Fundamental memory-allocation operations (more are in utils/memutils.h)
   */
  extern void *MemoryContextAlloc(MemoryContext context, Size size);
! extern void *MemoryContextAllocZero(MemoryContext context, Size size);

  #define palloc(sz)    MemoryContextAlloc(CurrentMemoryContext, (sz))

! #define palloc0(sz)    MemoryContextAllocZero(CurrentMemoryContext, (sz))

  extern void pfree(void *pointer);

--- 46,57 ----
   * Fundamental memory-allocation operations (more are in utils/memutils.h)
   */
  extern void *MemoryContextAlloc(MemoryContext context, Size size);
! extern void *MemoryContextAllocZero(MemoryContext context, Size size, bool use_loop);

  #define palloc(sz)    MemoryContextAlloc(CurrentMemoryContext, (sz))

! #define palloc0(sz)    MemoryContextAllocZero(CurrentMemoryContext, (sz), \
!                     MemSetTest(0, (sz)))

  extern void pfree(void *pointer);


Re: MemSet inline for newNode

From
Tom Lane
Date:
Bruce Momjian <pgman@candle.pha.pa.us> writes:
> I still test the palloc pointer alignment in MemSetLoop because that is
> not a constant.

But it's known aligned when you just got it from palloc.

Why don't you simply implement what was agreed to in the original
thread, namely:

    * provide a MemSetAligned macro that is just like the standard
      one except it omits the pointer alignment test

    * provide a palloc0 macro that does MemSetAligned inside the
      macro; known safe because palloc returns a maxaligned pointer

    * use palloc0 in the newNode macro

The approach you are taking is messier *and* slower than this.

            regards, tom lane

Re: MemSet inline for newNode

From
Bruce Momjian
Date:
Tom Lane wrote:
> Bruce Momjian <pgman@candle.pha.pa.us> writes:
> > I still test the palloc pointer alignment in MemSetLoop because that is
> > not a constant.
>
> But it's known aligned when you just got it from palloc.
>
> Why don't you simply implement what was agreed to in the original
> thread, namely:
>
>     * provide a MemSetAligned macro that is just like the standard
>       one except it omits the pointer alignment test
>
>     * provide a palloc0 macro that does MemSetAligned inside the
>       macro; known safe because palloc returns a maxaligned pointer

I can't do MemSet in a macro that returns a value, as palloc requires.
MemSet has a loop, and that can't be done in a macro that returns a value.

--
  Bruce Momjian                        |  http://candle.pha.pa.us
  pgman@candle.pha.pa.us               |  (610) 359-1001
  +  If your life is a hard drive,     |  13 Roberts Road
  +  Christ can be your backup.        |  Newtown Square, Pennsylvania 19073

Re: MemSet inline for newNode

From
Tom Lane
Date:
Bruce Momjian <pgman@candle.pha.pa.us> writes:
> I can't do MemSet in a macro that returns a value, as palloc requires.
> MemSet has a loop, and that can't be done in a macro that returns a value.

Hm.  How did Neil test this originally --- was he relying on being able
to "inline" newNode()?

Anyway, I don't think that passing an extra parameter can be a win.
If there has to be a runtime test, testing whether the two low bits
of the length are zero is probably about the same speed as testing a
boolean parameter.  It's unlikely to be enough slower to justify the
cost of passing another parameter.

            regards, tom lane

Re: MemSet inline for newNode

From
Bruce Momjian
Date:
Tom Lane wrote:
> Bruce Momjian <pgman@candle.pha.pa.us> writes:
> > I can't do MemSet in a macro that returns a value, as palloc requires.
> > MemSet has a loop, and that can't be done in a macro that returns a value.
>
> Hm.  How did Neil test this originally --- was he relying on being able
> to "inline" newNode()?

Yes.

> Anyway, I don't think that passing an extra parameter can be a win.
> If there has to be a runtime test, testing whether the two low bits
> of the length are zero is probably about the same speed as testing a
> boolean parameter.  It's unlikely to be enough slower to justify the
> cost of passing another parameter.

OK, new version attached, with extra parameter removed.

--
  Bruce Momjian                        |  http://candle.pha.pa.us
  pgman@candle.pha.pa.us               |  (610) 359-1001
  +  If your life is a hard drive,     |  13 Roberts Road
  +  Christ can be your backup.        |  Newtown Square, Pennsylvania 19073
Index: src/backend/utils/mmgr/mcxt.c
===================================================================
RCS file: /cvsroot/pgsql-server/src/backend/utils/mmgr/mcxt.c,v
retrieving revision 1.35
diff -c -c -r1.35 mcxt.c
*** src/backend/utils/mmgr/mcxt.c    10 Nov 2002 02:17:25 -0000    1.35
--- src/backend/utils/mmgr/mcxt.c    11 Nov 2002 19:11:05 -0000
***************
*** 453,466 ****
  }

  /*
!  * MemoryContextAllocZero
   *        Like MemoryContextAlloc, but clears allocated memory
   *
   *    We could just call MemoryContextAlloc then clear the memory, but this
   *    function is called too many times, so we have a separate version.
   */
  void *
! MemoryContextAllocZero(MemoryContext context, Size size)
  {
      void *ret;

--- 453,466 ----
  }

  /*
!  * MemoryContextAllocPalloc0
   *        Like MemoryContextAlloc, but clears allocated memory
   *
   *    We could just call MemoryContextAlloc then clear the memory, but this
   *    function is called too many times, so we have a separate version.
   */
  void *
! MemoryContextAllocPalloc0(MemoryContext context, Size size)
  {
      void *ret;

***************
*** 471,477 ****
               (unsigned long) size);

      ret = (*context->methods->alloc) (context, size);
!     MemSet(ret, 0, size);
      return ret;
  }

--- 471,477 ----
               (unsigned long) size);

      ret = (*context->methods->alloc) (context, size);
!     MemSetLoop(ret, 0, size);
      return ret;
  }

Index: src/include/c.h
===================================================================
RCS file: /cvsroot/pgsql-server/src/include/c.h,v
retrieving revision 1.131
diff -c -c -r1.131 c.h
*** src/include/c.h    11 Nov 2002 03:02:19 -0000    1.131
--- src/include/c.h    11 Nov 2002 19:11:08 -0000
***************
*** 577,602 ****
   *    memset() functions.  More research needs to be done, perhaps with
   *    platform-specific MEMSET_LOOP_LIMIT values or tests in configure.
   *
   *    bjm 2002-10-08
   */
! #define MemSet(start, val, len) \
!     do \
!     { \
!         int32 * _start = (int32 *) (start); \
!         int        _val = (val); \
!         Size    _len = (len); \
  \
!         if ((( ((long) _start) | _len) & INT_ALIGN_MASK) == 0 && \
!             _val == 0 && \
!             _len <= MEMSET_LOOP_LIMIT) \
!         { \
!             int32 * _stop = (int32 *) ((char *) _start + _len); \
!             while (_start < _stop) \
!                 *_start++ = 0; \
!         } \
!         else \
!             memset((char *) _start, _val, _len); \
!     } while (0)

  #define MEMSET_LOOP_LIMIT  1024

--- 577,612 ----
   *    memset() functions.  More research needs to be done, perhaps with
   *    platform-specific MEMSET_LOOP_LIMIT values or tests in configure.
   *
+  *    MemSet has been split into two parts so MemSetTest can be optimized
+  *    away for constant 'val' and 'len'.  This is used by palloc0().
+  *
+  *    Note, arguments are evaluated more than once.
+  *
   *    bjm 2002-10-08
   */
! #define MemSetTest(val, len) \
!     ( ((len) & INT_ALIGN_MASK) == 0 && \
!     (len) <= MEMSET_LOOP_LIMIT && \
!     (val) == 0 )
!
! #define MemSetLoop(start, val, len) \
! do \
! { \
!     int32 * _start = (int32 *) (start); \
!     int32 * _stop = (int32 *) ((char *) _start + (len)); \
  \
!     while (_start < _stop) \
!         *_start++ = 0; \
! } while (0)
!
! #define MemSet(start, val, len) \
! do \
! { \
!     if (MemSetTest(val, len) && ((long)(start) & INT_ALIGN_MASK) == 0 ) \
!         MemSetLoop(start, val, len); \
!     else \
!         memset((char *)(start), (val), (len)); \
! } while (0)

  #define MEMSET_LOOP_LIMIT  1024

Index: src/include/utils/palloc.h
===================================================================
RCS file: /cvsroot/pgsql-server/src/include/utils/palloc.h,v
retrieving revision 1.22
diff -c -c -r1.22 palloc.h
*** src/include/utils/palloc.h    10 Nov 2002 02:17:25 -0000    1.22
--- src/include/utils/palloc.h    11 Nov 2002 19:11:10 -0000
***************
*** 46,56 ****
   * Fundamental memory-allocation operations (more are in utils/memutils.h)
   */
  extern void *MemoryContextAlloc(MemoryContext context, Size size);
! extern void *MemoryContextAllocZero(MemoryContext context, Size size);

  #define palloc(sz)    MemoryContextAlloc(CurrentMemoryContext, (sz))

! #define palloc0(sz)    MemoryContextAllocZero(CurrentMemoryContext, (sz))

  extern void pfree(void *pointer);

--- 46,59 ----
   * Fundamental memory-allocation operations (more are in utils/memutils.h)
   */
  extern void *MemoryContextAlloc(MemoryContext context, Size size);
! extern void *MemoryContextAllocPalloc0(MemoryContext context, Size size);

  #define palloc(sz)    MemoryContextAlloc(CurrentMemoryContext, (sz))

! #define palloc0(sz)    \
!     ( MemSetTest(0, (sz)) ? \
!         MemoryContextAllocPalloc0(CurrentMemoryContext, (sz)) : \
!         memset(palloc(sz), 0, (sz)))

  extern void pfree(void *pointer);


Re: MemSet inline for newNode

From
Bruce Momjian
Date:
I just ran a test on catalog/index.c and found that with the new patch,
the conditional test is completely gone.  It now calls the palloc0
memory allocator unconditionally because all the conditions are met by
constants:

    BuildIndexInfo:
            pushl %ebp
            movl %esp,%ebp
            subl $32,%esp
            pushl %esi
            pushl %ebx
            addl $-8,%esp
            pushl $108
            movl CurrentMemoryContext,%eax
            pushl %eax
            call MemoryContextAllocPalloc0

and, in MemoryContextAllocPalloc0, the inline MemSet loop is used
unconditionally.

---------------------------------------------------------------------------

Bruce Momjian wrote:
> Tom Lane wrote:
> > Bruce Momjian <pgman@candle.pha.pa.us> writes:
> > > I can't do MemSet in a macro that returns a value, as palloc requires.
> > > MemSet has a loop, and that can't be done in a macro that returns a value.
> >
> > Hm.  How did Neil test this originally --- was he relying on being able
> > to "inline" newNode()?
>
> Yes.
>
> > Anyway, I don't think that passing an extra parameter can be a win.
> > If there has to be a runtime test, testing whether the two low bits
> > of the length are zero is probably about the same speed as testing a
> > boolean parameter.  It's unlikely to be enough slower to justify the
> > cost of passing another parameter.
>
> OK, new version attached, with extra parameter removed.
>

--
  Bruce Momjian                        |  http://candle.pha.pa.us
  pgman@candle.pha.pa.us               |  (610) 359-1001
  +  If your life is a hard drive,     |  13 Roberts Road
  +  Christ can be your backup.        |  Newtown Square, Pennsylvania 19073