Thread: Some platform-specific MemSet research

Some platform-specific MemSet research

From
Seneca Cunningham
Date:
After reading the post on -patches proposing that MemSet be changed to
use long instead of int32 on the grounds that a pair of x86-64 linux
boxes took less time to execute the long code 64*10^6 times[1], I took a
look at how the testcode performed on AIX with gcc.  While the switch to
long did result in a minor performance improvement, dropping the
MemSetLoop in favour of the native memset resulted in the tests taking
~25% of the time taken by the MemSetLoop-like int loop. The 32-bit linux
system I ran the expanded tests on showed that, for the buffer size range
in which postgres can use the looping MemSet instead of memset
(size <= 1024 bytes), MemSet generally had better performance.

Test results, reformatted for space:

* AIX5.3 ML3
        gcc version 4.0.1
        OBJECT_MODE=64 gcc -maix64 -O2
sizeof(int)  = 4
sizeof(long) = 8

int    size=8      1.876096   1.875817   1.875998
long   size=8      0.215347   0.215389   0.215367
memset size=8      0.127711   0.127726   0.127706
int    size=16     0.617316   0.617346   0.617300
long   size=16     0.408607   0.408294   0.408263
memset size=16     0.212843   0.176918   0.212854
int    size=32     2.983032   2.982887   2.982724
long   size=32     2.172499   2.172440   2.172549
memset size=32     0.255465   0.255449   0.255422
int    size=64     3.560825   3.559743   3.559785
long   size=64     2.974126   2.999054   2.942597
memset size=64     1.021843   1.021709   1.021704
int    size=128    4.983803   4.983515   4.983236
long   size=128    3.515213   3.514761   3.514733
memset size=128    1.319846   1.319699   1.319671
int    size=256    9.071160   9.070497   9.070350
long   size=256    7.428318   7.001997   6.990831
memset size=256    1.830684   1.830558   1.830533
int    size=512   17.330519  17.329175  17.328520
long   size=512   14.903931  14.902345  14.902329
memset size=512    3.512420   3.512139   3.512111
int    size=1024  34.593734  34.592775  34.591700
long   size=1024  23.804386  23.652192  24.043249
memset size=1024   6.010309   6.049034   6.052664
int    size=2048  66.380036  66.374455  66.375010
long   size=2048  45.094202  45.087909  45.087128
memset size=2048  11.638963  11.662794  11.664649
int    size=4096 131.777427 131.764230 131.764542
long   size=4096  88.906880  88.840758  88.887926
memset size=4096  22.882468  22.921160  22.920992


* Pentium 4 2.80GHz
        Ubuntu 5.10 2.6.12-10-686 #1
        gcc version 4.0.2 20050808 (prerelease) (Ubuntu 4.0.1-4ubuntu9)
        gcc -O2
sizeof(int)  = 4
sizeof(long) = 4

int    size=8      0.319620   0.270326   0.288407
long   size=8      0.279157   0.278571   0.339791
memset size=8      0.186439   0.192561   0.194865
int    size=16     0.455448   0.459051   0.519848
long   size=16     0.455193   0.451253   0.565159
memset size=16     0.257428   0.256752   0.356195
int    size=32     0.732009   0.730730   0.750304
long   size=32     0.731353   0.734311   0.743041
memset size=32     1.386004   1.404297   1.378161
int    size=64     1.289708   1.397941   1.288536
long   size=64     1.302256   1.380754   1.294904
memset size=64     2.965440   3.197489   2.958864
int    size=128    3.162121   3.548065   3.158412
long   size=128    3.150525   3.161121   3.153037
memset size=128    3.705133   3.739082   3.704949
int    size=256    5.393701   5.415562   5.583510
long   size=256    5.420254   5.367381   5.362041
memset size=256    9.246601   8.983931   9.040215
int    size=512   10.219667   9.854537   9.851564
long   size=512    9.906317   9.878196  10.202070
memset size=512   11.290588  11.050312  11.789231
int    size=1024  19.777706  20.752631  19.846717
long   size=1024  18.934663  18.870325  19.854066
memset size=1024  15.349694  15.487714  15.999638
int    size=2048  28.783087  28.214086  26.228851
long   size=2048  26.628890  30.611856  26.245331
memset size=2048  24.434751  24.095879  23.435490
int    size=4096  53.817698  57.266583  51.547177
long   size=4096  55.868670  53.012144  51.564656
memset size=4096  45.772710  40.651142  39.702063


[1] http://archives.postgresql.org/pgsql-patches/2006-01/msg00211.php

--
Seneca Cunningham
scunning@ca.afilias.info
#include <stdio.h>
#include <sys/time.h>
#include <string.h>

/*
 * TYPEALIGN - round LEN up to the nearest multiple of ALIGNVAL.
 *
 * Both operands are converted to long before the arithmetic.  The result
 * is obtained by adding (ALIGNVAL - 1) and then masking off the low bits,
 * so it is only meaningful when ALIGNVAL is a power of two.
 */
#define TYPEALIGN(ALIGNVAL,LEN)  \
    ((((long) (ALIGNVAL) - 1) + (long) (LEN)) & ~((long) (ALIGNVAL) - 1))

/*
 * MemSetLoop - zero `len` bytes beginning at `start`, one `type`-sized
 * store at a time.
 *
 *   type   element type used for each store (e.g. int or long)
 *   start  address of the first byte to clear
 *   val    accepted but not used: the loop always stores 0
 *   len    number of bytes to clear
 *
 * Stores continue while the write pointer is below start + len, so a
 * `len` that is not a multiple of sizeof(type) makes the final store run
 * past start + len.
 */
#define MemSetLoop(type, start, val, len) \
    do \
    { \
        type * _cur = (type *) (start); \
        type * const _limit = (type *) ((char *) _cur + (size_t) (len)); \
    \
        for (; _cur < _limit; _cur++) \
            *_cur = 0; \
    } while (0)

/* Alignment, in bytes, that main() applies to the test buffer. */
#define MAXALIGN    8
/* Largest buffer size, in bytes, exercised by the tests. */
#define MAXSIZE        4096
/* Number of times each timed fill is repeated (64,000,000). */
#define LOOP        (1000*1000*64)

/*
 * print_time - report the time elapsed between two timestamps.
 *
 *   msg    label printed at the start of the line
 *   size   buffer size, in bytes, that the measurement refers to
 *   start  timestamp taken before the measured work
 *   end    timestamp taken after the measured work
 *
 * Writes "<msg> (size=<size>) : <seconds>" to stdout, where seconds is
 * the difference end - start expressed as a floating-point value.
 */
static void print_time(const char* msg, int size, const struct timeval *start, const struct timeval *end)
{
    /* Whole-second and microsecond parts are differenced separately. */
    const double whole_secs = end->tv_sec - start->tv_sec;
    const double frac_secs = (end->tv_usec - start->tv_usec) / 1000000.0;

    printf("%s (size=%d) : %f\n", msg, size, whole_secs + frac_secs);
}

/*
 * TEST - time LOOP repetitions of MemSetLoop(type, buffer, 0, size) and
 * print the result labelled "Loop by <type>".
 *
 * Expands in place, so the caller must have `start` and `end`
 * (struct timeval) and `buffer` in scope.
 */
#define TEST(type, size)    \
    do { \
        int rep; \
        gettimeofday(&start, NULL); \
        for (rep = 0; rep < LOOP; ++rep) { \
            MemSetLoop(type, buffer, 0, size); \
        } \
        gettimeofday(&end, NULL); \
        print_time("Loop by " #type, size, &start, &end); \
    } while (0)

/*
 * TESTNATIVE - time LOOP repetitions of memset(buffer, 0, size) and print
 * the result labelled "memset by <type>".
 *
 * `type` only appears in the printed label; it does not affect the
 * memset call.  Expands in place, so the caller must have `start` and
 * `end` (struct timeval) and `buffer` in scope.
 */
#define TESTNATIVE(type, size)    \
    do { \
        int rep; \
        gettimeofday(&start, NULL); \
        for (rep = 0; rep < LOOP; ++rep) { \
            memset(buffer, 0, size); \
        } \
        gettimeofday(&end, NULL); \
        print_time("memset by " #type, size, &start, &end); \
    } while (0)

/*
 * Benchmark driver.
 *
 * Prints sizeof(int) and sizeof(long), then runs three passes over buffer
 * sizes 8 .. 4096 bytes.  For each size it times, in this order, the
 * int-wide MemSetLoop, the library memset, and the long-wide MemSetLoop.
 *
 * The sizes are written out as literals rather than driven from a table
 * so that every TEST/TESTNATIVE expansion receives a compile-time
 * constant size.
 *
 * Returns 0.
 */
int main(void)
{
    int j;
    /* start, end and buffer are referenced by name inside TEST/TESTNATIVE. */
    struct timeval start, end;
    /* Over-allocate by MAXALIGN so the aligned pointer still has MAXSIZE bytes. */
    char buffer0[MAXSIZE + MAXALIGN];
    /* NOTE(review): TYPEALIGN round-trips the pointer through long; this
     * assumes a pointer fits in long on the target platform -- confirm. */
    char* buffer = (char*) TYPEALIGN(MAXALIGN, buffer0);

    /*
     * sizeof yields a size_t, which does not match %d (and is wider than
     * int on 64-bit builds); cast so the argument matches the format.
     */
    printf("sizeof(int)  = %d\n", (int) sizeof(int));
    printf("sizeof(long) = %d\n", (int) sizeof(long));

    for(j = 0; j < 3; j++)
    {
        TEST(int , 8);
        TESTNATIVE(int , 8);
        TEST(long, 8);
        TEST(int , 16);
        TESTNATIVE(int , 16);
        TEST(long, 16);
        TEST(int , 32);
        TESTNATIVE(int , 32);
        TEST(long, 32);
        TEST(int , 64);
        TESTNATIVE(int , 64);
        TEST(long, 64);
        TEST(int , 128);
        TESTNATIVE(int , 128);
        TEST(long, 128);
        TEST(int , 256);
        TESTNATIVE(int , 256);
        TEST(long, 256);
        TEST(int , 512);
        TESTNATIVE(int , 512);
        TEST(long, 512);
        TEST(int , 1024);
        TESTNATIVE(int , 1024);
        TEST(long, 1024);
        TEST(int , 2048);
        TESTNATIVE(int , 2048);
        TEST(long, 2048);
        TEST(int , 4096);
        TESTNATIVE(int , 4096);
        TEST(long, 4096);
    }
    return 0;
}

Re: Some platform-specific MemSet research

From
Martijn van Oosterhout
Date:
On Tue, Jan 24, 2006 at 05:24:28PM -0500, Seneca Cunningham wrote:
> After reading the post on -patches proposing that MemSet be changed to
> use long instead of int32 on the grounds that a pair of x86-64 linux
> boxes took less time to execute the long code 64*10^6 times[1], I took a
> look at how the testcode performed on AIX with gcc.  While the switch to
> long did result in a minor performance improvement, dropping the
> MemSetLoop in favour of the native memset resulted in the tests taking
> ~25% the time as the MemSetLoop-like int loop. The 32-bit linux system I
> ran the expanded tests on showed that for the buffer size range that
> postgres can use the looping MemSet instead of memset (size <= 1024
> bytes), MemSet generally had better performance.

Could you please check the asm output to see what's going on. We've had
tests like these produce odd results in the past because the compiler
optimised away stuff that didn't have any effect. Since every memset
after the first is a no-op, you want to make sure it's still actually
doing the work...

Have a nice day,
--
Martijn van Oosterhout   <kleptog@svana.org>   http://svana.org/kleptog/
> Patent. n. Genius is 5% inspiration and 95% perspiration. A patent is a
> tool for doing 5% of the work and then sitting around waiting for someone
> else to do the other 95% so you can sue them.

Re: Some platform-specific MemSet research

From
Seneca Cunningham
Date:
Martijn van Oosterhout wrote:
> On Tue, Jan 24, 2006 at 05:24:28PM -0500, Seneca Cunningham wrote:
> 
>>After reading the post on -patches proposing that MemSet be changed to
>>use long instead of int32 on the grounds that a pair of x86-64 linux
>>boxes took less time to execute the long code 64*10^6 times[1], I took a
>>look at how the testcode performed on AIX with gcc.  While the switch to
>>long did result in a minor performance improvement, dropping the
>>MemSetLoop in favour of the native memset resulted in the tests taking
>>~25% the time as the MemSetLoop-like int loop. The 32-bit linux system I
>>ran the expanded tests on showed that for the buffer size range that
>>postgres can use the looping MemSet instead of memset (size <= 1024
>>bytes), MemSet generally had better performance.
> 
> 
> Could you please check the asm output to see what's going on. We've had
> tests like these produce odd results in the past because the compiler
> optimised away stuff that didn't have any effect. Since every memset
> after the first is a no-op, you want to make sure it's still actually
> doing the work...

Well, on both linux and AIX, all 30 of the 64000000-iteration loops
from the source exist (10 int, 10 long, 10 memset).  According to my
understanding of the assembler, memset itself is only called for sizes
of 64 bytes or more on both platforms, and in those cases it is called
in each iteration.

The assembler for the 64 byte loops, with prepended line number, first
loop MemSetLoop int-variant, second loop memset, third loop MemSetLoop
long-variant:

64-bit AIX:
    419     addi 3,1,112
    420     li 4,0
    421     bl .gettimeofday
    422     nop
    423     lis 10,0x3d0
    424     cmpld 6,26,16
    425     li 11,0
    426     ori 10,10,36864
    427 L..41:
    428     bge 6,L..42
    429     mr 9,26
    430     li 0,0
    431 L..44:
    432     stw 0,0(9)
    433     addi 9,9,4
    434     cmpld 7,16,9
    435     bgt 7,L..44
    436 L..42:
    437     addi 0,11,1
    438     extsw 11,0
    439     cmpw 7,11,10
    440     bne+ 7,L..41
    441     li 4,0
    442     mr 3,22
    443     lis 25,0x3d0
    444     li 28,0
    445     bl .gettimeofday
    446     nop
    447     li 4,64
    448     addi 5,1,112
    449     ld 3,LC..9(2)
    450     mr 6,22
    451     ori 25,25,36864
    452     bl .print_time
    453     addi 3,1,112
    454     li 4,0
    455     bl .gettimeofday
    456     nop
    457 L..46:
    458     mr 3,26
    459     li 4,0
    460     li 5,64
    461     bl .memset
    462     nop
    463     addi 0,28,1
    464     extsw 28,0
    465     cmpw 7,28,25
    466     bne+ 7,L..46
    467     li 4,0
    468     mr 3,22
    469     bl .gettimeofday
    470     nop
    471     li 4,64
    472     addi 5,1,112
    473     ld 3,LC..11(2)
    474     mr 6,22
    475     bl .print_time
    476     addi 3,1,112
    477     li 4,0
    478     bl .gettimeofday
    479     nop
    480     lis 10,0x3d0
    481     cmpld 6,26,16
    482     li 11,0
    483     ori 10,10,36864
    484 L..48:
    485     bge 6,L..49
    486     mr 9,26
    487     li 0,0
    488 L..51:
    489     std 0,0(9)
    490     addi 9,9,8
    491     cmpld 7,9,16
    492     blt 7,L..51
    493 L..49:
    494     addi 0,11,1
    495     extsw 11,0
    496     cmpw 7,11,10
    497     bne+ 7,L..48
    498     li 4,0
    499     mr 3,22
    500     bl .gettimeofday
    501     nop
    502     li 4,64
    503     addi 5,1,112
    504     ld 3,LC..13(2)
    505     mr 6,22
    506     bl .print_time
 


32-bit Linux:
    387     popl    %ecx
    388     popl    %edi
    389     pushl   $0
    390     leal    -20(%ebp), %edx
    391     pushl   %edx
    392     call    gettimeofday
    393     xorl    %edx, %edx
    394     addl    $16, %esp
    395 .L41:
    396     movl    -4160(%ebp), %eax
    397     cmpl    %eax, -4144(%ebp)
    398     jae .L42
    399     movl    -4144(%ebp), %eax
    400 .L44:
    401     movl    $0, (%eax)
    402     addl    $4, %eax
    403     cmpl    %eax, -4160(%ebp)
    404     ja  .L44
    405 .L42:
    406     incl    %edx
    407     cmpl    $64000000, %edx
    408     jne .L41
    409     subl    $8, %esp
    410     pushl   $0
    411     leal    -28(%ebp), %edx
    412     pushl   %edx
    413     call    gettimeofday
    414     leal    -28(%ebp), %eax
    415     movl    %eax, (%esp)
    416     leal    -20(%ebp), %ecx
    417     movl    $64, %edx
    418     movl    $.LC5, %eax
    419     call    print_time
    420     popl    %eax
    421     popl    %edx
    422     pushl   $0
    423     leal    -20(%ebp), %edx
    424     pushl   %edx
    425     call    gettimeofday
    426     xorl    %edi, %edi
    427     addl    $16, %esp
    428 .L46:
    429     pushl   %eax
    430     pushl   $64
    431     pushl   $0
    432     movl    -4144(%ebp), %ecx
    433     pushl   %ecx
    434     call    memset
    435     incl    %edi
    436     addl    $16, %esp
    437     cmpl    $64000000, %edi
    438     jne .L46
    439     subl    $8, %esp
    440     pushl   $0
    441     leal    -28(%ebp), %eax
    442     pushl   %eax
    443     call    gettimeofday
    444     leal    -28(%ebp), %edx
    445     movl    %edx, (%esp)
    446     leal    -20(%ebp), %ecx
    447     movl    $64, %edx
    448     movl    $.LC6, %eax
    449     call    print_time
    450     popl    %eax
    451     popl    %edx
    452     pushl   $0
    453     leal    -20(%ebp), %eax
    454     pushl   %eax
    455     call    gettimeofday
    456     xorl    %edx, %edx
    457     addl    $16, %esp
    458 .L48:
    459     movl    -4160(%ebp), %eax
    460     cmpl    %eax, -4144(%ebp)
    461     jae .L49
    462     movl    -4144(%ebp), %eax
    463 .L51:
    464     movl    $0, (%eax)
    465     addl    $4, %eax
    466     cmpl    -4160(%ebp), %eax
    467     jb  .L51
    468 .L49:
    469     incl    %edx
    470     cmpl    $64000000, %edx
    471     jne .L48
    472     subl    $8, %esp
    473     pushl   $0
    474     leal    -28(%ebp), %edx
    475     pushl   %edx
    476     call    gettimeofday
    477     leal    -28(%ebp), %eax
    478     movl    %eax, (%esp)
    479     leal    -20(%ebp), %ecx
    480     movl    $64, %edx
    481     movl    $.LC7, %eax
    482     call    print_time
 

-- 
Seneca Cunningham
scunning@ca.afilias.info


Re: Some platform-specific MemSet research

From
Bruce Momjian
Date:
My guess is that there is some really fast assembler for memory copy on
AIX, and only libc memset() has it.  If you want, we can make
MEMSET_LOOP_LIMIT in c.h a configure value, and allow template/aix to
set it to zero, causing memset() to be always used.

Are you prepared to make this optimization decision for all AIX users
using gcc, or only for certain versions?

---------------------------------------------------------------------------

Seneca Cunningham wrote:
> Martijn van Oosterhout wrote:
> > On Tue, Jan 24, 2006 at 05:24:28PM -0500, Seneca Cunningham wrote:
> > 
> >>After reading the post on -patches proposing that MemSet be changed to
> >>use long instead of int32 on the grounds that a pair of x86-64 linux
> >>boxes took less time to execute the long code 64*10^6 times[1], I took a
> >>look at how the testcode performed on AIX with gcc.  While the switch to
> >>long did result in a minor performance improvement, dropping the
> >>MemSetLoop in favour of the native memset resulted in the tests taking
> >>~25% the time as the MemSetLoop-like int loop. The 32-bit linux system I
> >>ran the expanded tests on showed that for the buffer size range that
> >>postgres can use the looping MemSet instead of memset (size <= 1024
> >>bytes), MemSet generally had better performance.
> > 
> > 
> > Could you please check the asm output to see what's going on. We've had
> > tests like these produce odd results in the past because the compiler
> > optimised away stuff that didn't have any effect. Since every memset
> > after the first is a no-op, you want to make sure it's still actually
> > doing the work...
> 
> Well, on both linux and AIX, all 30 of the 64000000 iterations loops
> from the source exist (10 int, 10 long, 10 memset).  According to my
> understanding of the assembler, memset itself is only called for values
> >= 64 bytes on both platforms and the memset is called in each iteration.
> 
> The assembler for the 64 byte loops, with prepended line number, first
> loop MemSetLoop int-variant, second loop memset, third loop MemSetLoop
> long-variant:
> 
> 64-bit AIX:
> 
>     419     addi 3,1,112
>     420     li 4,0
>     421     bl .gettimeofday
>     422     nop
>     423     lis 10,0x3d0
>     424     cmpld 6,26,16
>     425     li 11,0
>     426     ori 10,10,36864
>     427 L..41:
>     428     bge 6,L..42
>     429     mr 9,26
>     430     li 0,0
>     431 L..44:
>     432     stw 0,0(9)
>     433     addi 9,9,4
>     434     cmpld 7,16,9
>     435     bgt 7,L..44
>     436 L..42:
>     437     addi 0,11,1
>     438     extsw 11,0
>     439     cmpw 7,11,10
>     440     bne+ 7,L..41
>     441     li 4,0
>     442     mr 3,22
>     443     lis 25,0x3d0
>     444     li 28,0
>     445     bl .gettimeofday
>     446     nop
>     447     li 4,64
>     448     addi 5,1,112
>     449     ld 3,LC..9(2)
>     450     mr 6,22
>     451     ori 25,25,36864
>     452     bl .print_time
>     453     addi 3,1,112
>     454     li 4,0
>     455     bl .gettimeofday
>     456     nop
>     457 L..46:
>     458     mr 3,26
>     459     li 4,0
>     460     li 5,64
>     461     bl .memset
>     462     nop
>     463     addi 0,28,1
>     464     extsw 28,0
>     465     cmpw 7,28,25
>     466     bne+ 7,L..46
>     467     li 4,0
>     468     mr 3,22
>     469     bl .gettimeofday
>     470     nop
>     471     li 4,64
>     472     addi 5,1,112
>     473     ld 3,LC..11(2)
>     474     mr 6,22
>     475     bl .print_time
>     476     addi 3,1,112
>     477     li 4,0
>     478     bl .gettimeofday
>     479     nop
>     480     lis 10,0x3d0
>     481     cmpld 6,26,16
>     482     li 11,0
>     483     ori 10,10,36864
>     484 L..48:
>     485     bge 6,L..49
>     486     mr 9,26
>     487     li 0,0
>     488 L..51:
>     489     std 0,0(9)
>     490     addi 9,9,8
>     491     cmpld 7,9,16
>     492     blt 7,L..51
>     493 L..49:
>     494     addi 0,11,1
>     495     extsw 11,0
>     496     cmpw 7,11,10
>     497     bne+ 7,L..48
>     498     li 4,0
>     499     mr 3,22
>     500     bl .gettimeofday
>     501     nop
>     502     li 4,64
>     503     addi 5,1,112
>     504     ld 3,LC..13(2)
>     505     mr 6,22
>     506     bl .print_time
> 
> 
> 32-bit Linux:
> 
>     387     popl    %ecx
>     388     popl    %edi
>     389     pushl   $0
>     390     leal    -20(%ebp), %edx
>     391     pushl   %edx
>     392     call    gettimeofday
>     393     xorl    %edx, %edx
>     394     addl    $16, %esp
>     395 .L41:
>     396     movl    -4160(%ebp), %eax
>     397     cmpl    %eax, -4144(%ebp)
>     398     jae .L42
>     399     movl    -4144(%ebp), %eax
>     400 .L44:
>     401     movl    $0, (%eax)
>     402     addl    $4, %eax
>     403     cmpl    %eax, -4160(%ebp)
>     404     ja  .L44
>     405 .L42:
>     406     incl    %edx
>     407     cmpl    $64000000, %edx
>     408     jne .L41
>     409     subl    $8, %esp
>     410     pushl   $0
>     411     leal    -28(%ebp), %edx
>     412     pushl   %edx
>     413     call    gettimeofday
>     414     leal    -28(%ebp), %eax
>     415     movl    %eax, (%esp)
>     416     leal    -20(%ebp), %ecx
>     417     movl    $64, %edx
>     418     movl    $.LC5, %eax
>     419     call    print_time
>     420     popl    %eax
>     421     popl    %edx
>     422     pushl   $0
>     423     leal    -20(%ebp), %edx
>     424     pushl   %edx
>     425     call    gettimeofday
>     426     xorl    %edi, %edi
>     427     addl    $16, %esp
>     428 .L46:
>     429     pushl   %eax
>     430     pushl   $64
>     431     pushl   $0
>     432     movl    -4144(%ebp), %ecx
>     433     pushl   %ecx
>     434     call    memset
>     435     incl    %edi
>     436     addl    $16, %esp
>     437     cmpl    $64000000, %edi
>     438     jne .L46
>     439     subl    $8, %esp
>     440     pushl   $0
>     441     leal    -28(%ebp), %eax
>     442     pushl   %eax
>     443     call    gettimeofday
>     444     leal    -28(%ebp), %edx
>     445     movl    %edx, (%esp)
>     446     leal    -20(%ebp), %ecx
>     447     movl    $64, %edx
>     448     movl    $.LC6, %eax
>     449     call    print_time
>     450     popl    %eax
>     451     popl    %edx
>     452     pushl   $0
>     453     leal    -20(%ebp), %eax
>     454     pushl   %eax
>     455     call    gettimeofday
>     456     xorl    %edx, %edx
>     457     addl    $16, %esp
>     458 .L48:
>     459     movl    -4160(%ebp), %eax
>     460     cmpl    %eax, -4144(%ebp)
>     461     jae .L49
>     462     movl    -4144(%ebp), %eax
>     463 .L51:
>     464     movl    $0, (%eax)
>     465     addl    $4, %eax
>     466     cmpl    -4160(%ebp), %eax
>     467     jb  .L51
>     468 .L49:
>     469     incl    %edx
>     470     cmpl    $64000000, %edx
>     471     jne .L48
>     472     subl    $8, %esp
>     473     pushl   $0
>     474     leal    -28(%ebp), %edx
>     475     pushl   %edx
>     476     call    gettimeofday
>     477     leal    -28(%ebp), %eax
>     478     movl    %eax, (%esp)
>     479     leal    -20(%ebp), %ecx
>     480     movl    $64, %edx
>     481     movl    $.LC7, %eax
>     482     call    print_time
> 
> -- 
> Seneca Cunningham
> scunning@ca.afilias.info
> 
> ---------------------------(end of broadcast)---------------------------
> TIP 5: don't forget to increase your free space map settings
> 

--
  Bruce Momjian                        |  http://candle.pha.pa.us
  pgman@candle.pha.pa.us               |  (610) 359-1001
  +  If your life is a hard drive,     |  13 Roberts Road
  +  Christ can be your backup.        |  Newtown Square, Pennsylvania 19073
 


Re: Some platform-specific MemSet research

From
"Rocco Altier"
Date:
I wanted to chime in that I also see this speedup from using XLC 6.0
(IBM's cc), even in 32bit mode.  I have tested on AIX 5.2 and 5.1.

I think this would be good to include in the regular release.

Not sure how many people are running older versions of AIX that would
want a new version of postgres.
-rocco



> -----Original Message-----
> From: pgsql-hackers-owner@postgresql.org
> [mailto:pgsql-hackers-owner@postgresql.org] On Behalf Of Bruce Momjian
> Sent: Wednesday, February 01, 2006 12:11 PM
> To: Seneca Cunningham
> Cc: Martijn van Oosterhout; pgsql-hackers@postgresql.org
> Subject: Re: [HACKERS] Some platform-specific MemSet research
>
>
>
> My guess is that there is some really fast assembler for
> memory copy on
> AIX, and only libc memset() has it.  If you want, we can make
> MEMSET_LOOP_LIMIT in c.h a configure value, and allow template/aix to
> set it to zero, causing memset() to be always used.
>
> Are you prepared to make this optimization decision for all AIX users
> using gcc, or only for certain versions?
>
> --------------------------------------------------------------
> -------------
>
> Seneca Cunningham wrote:
> > Martijn van Oosterhout wrote:
> > > On Tue, Jan 24, 2006 at 05:24:28PM -0500, Seneca Cunningham wrote:
> > >
> > >>After reading the post on -patches proposing that MemSet
> be changed to
> > >>use long instead of int32 on the grounds that a pair of
> x86-64 linux
> > >>boxes took less time to execute the long code 64*10^6
> times[1], I took a
> > >>look at how the testcode performed on AIX with gcc.
> While the switch to
> > >>long did result in a minor performance improvement, dropping the
> > >>MemSetLoop in favour of the native memset resulted in the
> tests taking
> > >>~25% the time as the MemSetLoop-like int loop. The 32-bit
> linux system I
> > >>ran the expanded tests on showed that for the buffer size
> range that
> > >>postgres can use the looping MemSet instead of memset
> (size <= 1024
> > >>bytes), MemSet generally had better performance.
> > >
> > >
> > > Could you please check the asm output to see what's going
> on. We've had
> > > tests like these produce odd results in the past because
> the compiler
> > > optimised away stuff that didn't have any effect. Since
> every memset
> > > after the first is a no-op, you want to make sure it's
> still actually
> > > doing the work...
> >
> > Well, on both linux and AIX, all 30 of the 64000000 iterations loops
> > from the source exist (10 int, 10 long, 10 memset).  According to my
> > understanding of the assembler, memset itself is only
> called for values
> > >= 64 bytes on both platforms and the memset is called in
> each iteration.
> >
> > The assembler for the 64 byte loops, with prepended line
> number, first
> > loop MemSetLoop int-variant, second loop memset, third loop
> MemSetLoop
> > long-variant:
> >
> > 64-bit AIX:
> >
> >     419     addi 3,1,112
> >     420     li 4,0
> >     421     bl .gettimeofday
> >     422     nop
> >     423     lis 10,0x3d0
> >     424     cmpld 6,26,16
> >     425     li 11,0
> >     426     ori 10,10,36864
> >     427 L..41:
> >     428     bge 6,L..42
> >     429     mr 9,26
> >     430     li 0,0
> >     431 L..44:
> >     432     stw 0,0(9)
> >     433     addi 9,9,4
> >     434     cmpld 7,16,9
> >     435     bgt 7,L..44
> >     436 L..42:
> >     437     addi 0,11,1
> >     438     extsw 11,0
> >     439     cmpw 7,11,10
> >     440     bne+ 7,L..41
> >     441     li 4,0
> >     442     mr 3,22
> >     443     lis 25,0x3d0
> >     444     li 28,0
> >     445     bl .gettimeofday
> >     446     nop
> >     447     li 4,64
> >     448     addi 5,1,112
> >     449     ld 3,LC..9(2)
> >     450     mr 6,22
> >     451     ori 25,25,36864
> >     452     bl .print_time
> >     453     addi 3,1,112
> >     454     li 4,0
> >     455     bl .gettimeofday
> >     456     nop
> >     457 L..46:
> >     458     mr 3,26
> >     459     li 4,0
> >     460     li 5,64
> >     461     bl .memset
> >     462     nop
> >     463     addi 0,28,1
> >     464     extsw 28,0
> >     465     cmpw 7,28,25
> >     466     bne+ 7,L..46
> >     467     li 4,0
> >     468     mr 3,22
> >     469     bl .gettimeofday
> >     470     nop
> >     471     li 4,64
> >     472     addi 5,1,112
> >     473     ld 3,LC..11(2)
> >     474     mr 6,22
> >     475     bl .print_time
> >     476     addi 3,1,112
> >     477     li 4,0
> >     478     bl .gettimeofday
> >     479     nop
> >     480     lis 10,0x3d0
> >     481     cmpld 6,26,16
> >     482     li 11,0
> >     483     ori 10,10,36864
> >     484 L..48:
> >     485     bge 6,L..49
> >     486     mr 9,26
> >     487     li 0,0
> >     488 L..51:
> >     489     std 0,0(9)
> >     490     addi 9,9,8
> >     491     cmpld 7,9,16
> >     492     blt 7,L..51
> >     493 L..49:
> >     494     addi 0,11,1
> >     495     extsw 11,0
> >     496     cmpw 7,11,10
> >     497     bne+ 7,L..48
> >     498     li 4,0
> >     499     mr 3,22
> >     500     bl .gettimeofday
> >     501     nop
> >     502     li 4,64
> >     503     addi 5,1,112
> >     504     ld 3,LC..13(2)
> >     505     mr 6,22
> >     506     bl .print_time
> >
> >
> > 32-bit Linux:
> >
> >     387     popl    %ecx
> >     388     popl    %edi
> >     389     pushl   $0
> >     390     leal    -20(%ebp), %edx
> >     391     pushl   %edx
> >     392     call    gettimeofday
> >     393     xorl    %edx, %edx
> >     394     addl    $16, %esp
> >     395 .L41:
> >     396     movl    -4160(%ebp), %eax
> >     397     cmpl    %eax, -4144(%ebp)
> >     398     jae .L42
> >     399     movl    -4144(%ebp), %eax
> >     400 .L44:
> >     401     movl    $0, (%eax)
> >     402     addl    $4, %eax
> >     403     cmpl    %eax, -4160(%ebp)
> >     404     ja  .L44
> >     405 .L42:
> >     406     incl    %edx
> >     407     cmpl    $64000000, %edx
> >     408     jne .L41
> >     409     subl    $8, %esp
> >     410     pushl   $0
> >     411     leal    -28(%ebp), %edx
> >     412     pushl   %edx
> >     413     call    gettimeofday
> >     414     leal    -28(%ebp), %eax
> >     415     movl    %eax, (%esp)
> >     416     leal    -20(%ebp), %ecx
> >     417     movl    $64, %edx
> >     418     movl    $.LC5, %eax
> >     419     call    print_time
> >     420     popl    %eax
> >     421     popl    %edx
> >     422     pushl   $0
> >     423     leal    -20(%ebp), %edx
> >     424     pushl   %edx
> >     425     call    gettimeofday
> >     426     xorl    %edi, %edi
> >     427     addl    $16, %esp
> >     428 .L46:
> >     429     pushl   %eax
> >     430     pushl   $64
> >     431     pushl   $0
> >     432     movl    -4144(%ebp), %ecx
> >     433     pushl   %ecx
> >     434     call    memset
> >     435     incl    %edi
> >     436     addl    $16, %esp
> >     437     cmpl    $64000000, %edi
> >     438     jne .L46
> >     439     subl    $8, %esp
> >     440     pushl   $0
> >     441     leal    -28(%ebp), %eax
> >     442     pushl   %eax
> >     443     call    gettimeofday
> >     444     leal    -28(%ebp), %edx
> >     445     movl    %edx, (%esp)
> >     446     leal    -20(%ebp), %ecx
> >     447     movl    $64, %edx
> >     448     movl    $.LC6, %eax
> >     449     call    print_time
> >     450     popl    %eax
> >     451     popl    %edx
> >     452     pushl   $0
> >     453     leal    -20(%ebp), %eax
> >     454     pushl   %eax
> >     455     call    gettimeofday
> >     456     xorl    %edx, %edx
> >     457     addl    $16, %esp
> >     458 .L48:
> >     459     movl    -4160(%ebp), %eax
> >     460     cmpl    %eax, -4144(%ebp)
> >     461     jae .L49
> >     462     movl    -4144(%ebp), %eax
> >     463 .L51:
> >     464     movl    $0, (%eax)
> >     465     addl    $4, %eax
> >     466     cmpl    -4160(%ebp), %eax
> >     467     jb  .L51
> >     468 .L49:
> >     469     incl    %edx
> >     470     cmpl    $64000000, %edx
> >     471     jne .L48
> >     472     subl    $8, %esp
> >     473     pushl   $0
> >     474     leal    -28(%ebp), %edx
> >     475     pushl   %edx
> >     476     call    gettimeofday
> >     477     leal    -28(%ebp), %eax
> >     478     movl    %eax, (%esp)
> >     479     leal    -20(%ebp), %ecx
> >     480     movl    $64, %edx
> >     481     movl    $.LC7, %eax
> >     482     call    print_time
> >
> > --
> > Seneca Cunningham
> > scunning@ca.afilias.info
> >
> > ---------------------------(end of
> broadcast)---------------------------
> > TIP 5: don't forget to increase your free space map settings
> >
>
> --
>   Bruce Momjian                        |  http://candle.pha.pa.us
>   pgman@candle.pha.pa.us               |  (610) 359-1001
>   +  If your life is a hard drive,     |  13 Roberts Road
>   +  Christ can be your backup.        |  Newtown Square,
> Pennsylvania 19073
>
> ---------------------------(end of
> broadcast)---------------------------
> TIP 9: In versions below 8.0, the planner will ignore your desire to
>        choose an index scan if your joining column's datatypes do not
>        match
>


Re: Some platform-specific MemSet research

From
Bruce Momjian
Date:
Rocco Altier wrote:
> I wanted to chime in that I also see this speedup from using XLC 6.0
> (IBM's cc), even in 32bit mode.  I have tested on AIX 5.2 and 5.1.
> 
> I think this would be good to include in the regular release.  
> 
> Not sure how many people are running older versions of AIX that would
> want a new version of postgres.
> 

OK, perfect.  I will work on making it platform-specific and report
back.

---------------------------------------------------------------------------


>     -rocco
> 
> 
> 
> > -----Original Message-----
> > From: pgsql-hackers-owner@postgresql.org 
> > [mailto:pgsql-hackers-owner@postgresql.org] On Behalf Of Bruce Momjian
> > Sent: Wednesday, February 01, 2006 12:11 PM
> > To: Seneca Cunningham
> > Cc: Martijn van Oosterhout; pgsql-hackers@postgresql.org
> > Subject: Re: [HACKERS] Some platform-specific MemSet research
> > 
> > 
> > 
> > My guess is that there is some really fast assembler for 
> > memory copy on
> > AIX, and only libc memset() has it.  If you want, we can make
> > MEMSET_LOOP_LIMIT in c.h a configure value, and allow template/aix to
> > set it to zero, causing memset() to be always used.
> > 
> > Are you prepared to make this optimization decision for all AIX users
> > using gcc, or only for certain versions?
> > 
> > --------------------------------------------------------------
> > -------------
> > 
> > Seneca Cunningham wrote:
> > > Martijn van Oosterhout wrote:
> > > > On Tue, Jan 24, 2006 at 05:24:28PM -0500, Seneca Cunningham wrote:
> > > > 
> > > >>After reading the post on -patches proposing that MemSet 
> > be changed to
> > > >>use long instead of int32 on the grounds that a pair of 
> > x86-64 linux
> > > >>boxes took less time to execute the long code 64*10^6 
> > times[1], I took a
> > > >>look at how the testcode performed on AIX with gcc.  
> > While the switch to
> > > >>long did result in a minor performance improvement, dropping the
> > > >>MemSetLoop in favour of the native memset resulted in the 
> > tests taking
> > > >>~25% the time as the MemSetLoop-like int loop. The 32-bit 
> > linux system I
> > > >>ran the expanded tests on showed that for the buffer size 
> > range that
> > > >>postgres can use the looping MemSet instead of memset 
> > (size <= 1024
> > > >>bytes), MemSet generally had better performance.
> > > > 
> > > > 
> > > > Could you please check the asm output to see what's going 
> > on. We've had
> > > > tests like these produce odd results in the past because 
> > the compiler
> > > > optimised away stuff that didn't have any effect. Since 
> > every memset
> > > > after the first is a no-op, you want to make sure it's 
> > still actually
> > > > doing the work...
> > > 
> > > Well, on both linux and AIX, all 30 of the 64000000 iterations loops
> > > from the source exist (10 int, 10 long, 10 memset).  According to my
> > > understanding of the assembler, memset itself is only 
> > called for values
> > > >= 64 bytes on both platforms and the memset is called in 
> > each iteration.
> > > 
> > > The assembler for the 64 byte loops, with prepended line 
> > number, first
> > > loop MemSetLoop int-variant, second loop memset, third loop 
> > MemSetLoop
> > > long-variant:
> > > 
> > > 64-bit AIX:
> > > 
> > >     419     addi 3,1,112
> > >     420     li 4,0
> > >     421     bl .gettimeofday
> > >     422     nop
> > >     423     lis 10,0x3d0
> > >     424     cmpld 6,26,16
> > >     425     li 11,0
> > >     426     ori 10,10,36864
> > >     427 L..41:
> > >     428     bge 6,L..42
> > >     429     mr 9,26
> > >     430     li 0,0
> > >     431 L..44:
> > >     432     stw 0,0(9)
> > >     433     addi 9,9,4
> > >     434     cmpld 7,16,9
> > >     435     bgt 7,L..44
> > >     436 L..42:
> > >     437     addi 0,11,1
> > >     438     extsw 11,0
> > >     439     cmpw 7,11,10
> > >     440     bne+ 7,L..41
> > >     441     li 4,0
> > >     442     mr 3,22
> > >     443     lis 25,0x3d0
> > >     444     li 28,0
> > >     445     bl .gettimeofday
> > >     446     nop
> > >     447     li 4,64
> > >     448     addi 5,1,112
> > >     449     ld 3,LC..9(2)
> > >     450     mr 6,22
> > >     451     ori 25,25,36864
> > >     452     bl .print_time
> > >     453     addi 3,1,112
> > >     454     li 4,0
> > >     455     bl .gettimeofday
> > >     456     nop
> > >     457 L..46:
> > >     458     mr 3,26
> > >     459     li 4,0
> > >     460     li 5,64
> > >     461     bl .memset
> > >     462     nop
> > >     463     addi 0,28,1
> > >     464     extsw 28,0
> > >     465     cmpw 7,28,25
> > >     466     bne+ 7,L..46
> > >     467     li 4,0
> > >     468     mr 3,22
> > >     469     bl .gettimeofday
> > >     470     nop
> > >     471     li 4,64
> > >     472     addi 5,1,112
> > >     473     ld 3,LC..11(2)
> > >     474     mr 6,22
> > >     475     bl .print_time
> > >     476     addi 3,1,112
> > >     477     li 4,0
> > >     478     bl .gettimeofday
> > >     479     nop
> > >     480     lis 10,0x3d0
> > >     481     cmpld 6,26,16
> > >     482     li 11,0
> > >     483     ori 10,10,36864
> > >     484 L..48:
> > >     485     bge 6,L..49
> > >     486     mr 9,26
> > >     487     li 0,0
> > >     488 L..51:
> > >     489     std 0,0(9)
> > >     490     addi 9,9,8
> > >     491     cmpld 7,9,16
> > >     492     blt 7,L..51
> > >     493 L..49:
> > >     494     addi 0,11,1
> > >     495     extsw 11,0
> > >     496     cmpw 7,11,10
> > >     497     bne+ 7,L..48
> > >     498     li 4,0
> > >     499     mr 3,22
> > >     500     bl .gettimeofday
> > >     501     nop
> > >     502     li 4,64
> > >     503     addi 5,1,112
> > >     504     ld 3,LC..13(2)
> > >     505     mr 6,22
> > >     506     bl .print_time
> > > 
> > > 
> > > 32-bit Linux:
> > > 
> > >     387     popl    %ecx
> > >     388     popl    %edi
> > >     389     pushl   $0
> > >     390     leal    -20(%ebp), %edx
> > >     391     pushl   %edx
> > >     392     call    gettimeofday
> > >     393     xorl    %edx, %edx
> > >     394     addl    $16, %esp
> > >     395 .L41:
> > >     396     movl    -4160(%ebp), %eax
> > >     397     cmpl    %eax, -4144(%ebp)
> > >     398     jae .L42
> > >     399     movl    -4144(%ebp), %eax
> > >     400 .L44:
> > >     401     movl    $0, (%eax)
> > >     402     addl    $4, %eax
> > >     403     cmpl    %eax, -4160(%ebp)
> > >     404     ja  .L44
> > >     405 .L42:
> > >     406     incl    %edx
> > >     407     cmpl    $64000000, %edx
> > >     408     jne .L41
> > >     409     subl    $8, %esp
> > >     410     pushl   $0
> > >     411     leal    -28(%ebp), %edx
> > >     412     pushl   %edx
> > >     413     call    gettimeofday
> > >     414     leal    -28(%ebp), %eax
> > >     415     movl    %eax, (%esp)
> > >     416     leal    -20(%ebp), %ecx
> > >     417     movl    $64, %edx
> > >     418     movl    $.LC5, %eax
> > >     419     call    print_time
> > >     420     popl    %eax
> > >     421     popl    %edx
> > >     422     pushl   $0
> > >     423     leal    -20(%ebp), %edx
> > >     424     pushl   %edx
> > >     425     call    gettimeofday
> > >     426     xorl    %edi, %edi
> > >     427     addl    $16, %esp
> > >     428 .L46:
> > >     429     pushl   %eax
> > >     430     pushl   $64
> > >     431     pushl   $0
> > >     432     movl    -4144(%ebp), %ecx
> > >     433     pushl   %ecx
> > >     434     call    memset
> > >     435     incl    %edi
> > >     436     addl    $16, %esp
> > >     437     cmpl    $64000000, %edi
> > >     438     jne .L46
> > >     439     subl    $8, %esp
> > >     440     pushl   $0
> > >     441     leal    -28(%ebp), %eax
> > >     442     pushl   %eax
> > >     443     call    gettimeofday
> > >     444     leal    -28(%ebp), %edx
> > >     445     movl    %edx, (%esp)
> > >     446     leal    -20(%ebp), %ecx
> > >     447     movl    $64, %edx
> > >     448     movl    $.LC6, %eax
> > >     449     call    print_time
> > >     450     popl    %eax
> > >     451     popl    %edx
> > >     452     pushl   $0
> > >     453     leal    -20(%ebp), %eax
> > >     454     pushl   %eax
> > >     455     call    gettimeofday
> > >     456     xorl    %edx, %edx
> > >     457     addl    $16, %esp
> > >     458 .L48:
> > >     459     movl    -4160(%ebp), %eax
> > >     460     cmpl    %eax, -4144(%ebp)
> > >     461     jae .L49
> > >     462     movl    -4144(%ebp), %eax
> > >     463 .L51:
> > >     464     movl    $0, (%eax)
> > >     465     addl    $4, %eax
> > >     466     cmpl    -4160(%ebp), %eax
> > >     467     jb  .L51
> > >     468 .L49:
> > >     469     incl    %edx
> > >     470     cmpl    $64000000, %edx
> > >     471     jne .L48
> > >     472     subl    $8, %esp
> > >     473     pushl   $0
> > >     474     leal    -28(%ebp), %edx
> > >     475     pushl   %edx
> > >     476     call    gettimeofday
> > >     477     leal    -28(%ebp), %eax
> > >     478     movl    %eax, (%esp)
> > >     479     leal    -20(%ebp), %ecx
> > >     480     movl    $64, %edx
> > >     481     movl    $.LC7, %eax
> > >     482     call    print_time
> > > 
> > > -- 
> > > Seneca Cunningham
> > > scunning@ca.afilias.info
> > > 
> > > ---------------------------(end of 
> > broadcast)---------------------------
> > > TIP 5: don't forget to increase your free space map settings
> > > 
> > 
> > -- 
> >   Bruce Momjian                        |  http://candle.pha.pa.us
> >   pgman@candle.pha.pa.us               |  (610) 359-1001
> >   +  If your life is a hard drive,     |  13 Roberts Road
> >   +  Christ can be your backup.        |  Newtown Square, 
> > Pennsylvania 19073
> > 
> > ---------------------------(end of 
> > broadcast)---------------------------
> > TIP 9: In versions below 8.0, the planner will ignore your desire to
> >        choose an index scan if your joining column's datatypes do not
> >        match
> > 
> 
> ---------------------------(end of broadcast)---------------------------
> TIP 6: explain analyze is your friend
> 

-- 
  Bruce Momjian                        |  http://candle.pha.pa.us
  pgman@candle.pha.pa.us               |  (610) 359-1001
  +  If your life is a hard drive,     |  13 Roberts Road
  +  Christ can be your backup.        |  Newtown Square, Pennsylvania 19073