Thread: Some platform-specific MemSet research
After reading the post on -patches proposing that MemSet be changed to use long instead of int32 on the grounds that a pair of x86-64 linux boxes took less time to execute the long code 64*10^6 times[1], I took a look at how the testcode performed on AIX with gcc. While the switch to long did result in a minor performance improvement, dropping the MemSetLoop in favour of the native memset resulted in the tests taking ~25% of the time taken by the MemSetLoop-like int loop. The 32-bit linux system I ran the expanded tests on showed that, for the buffer size range in which postgres can use the looping MemSet instead of memset (size <= 1024 bytes), MemSet generally had better performance. Test results, reformatted for space: * AIX 5.3 ML3 gcc version 4.0.1 OBJECT_MODE=64 gcc -maix64 -O2 sizeof(int) = 4 sizeof(long) = 8 int size=8 1.876096 1.875817 1.875998 long size=8 0.215347 0.215389 0.215367 memset size=8 0.127711 0.127726 0.127706 int size=16 0.617316 0.617346 0.617300 long size=16 0.408607 0.408294 0.408263 memset size=16 0.212843 0.176918 0.212854 int size=32 2.983032 2.982887 2.982724 long size=32 2.172499 2.172440 2.172549 memset size=32 0.255465 0.255449 0.255422 int size=64 3.560825 3.559743 3.559785 long size=64 2.974126 2.999054 2.942597 memset size=64 1.021843 1.021709 1.021704 int size=128 4.983803 4.983515 4.983236 long size=128 3.515213 3.514761 3.514733 memset size=128 1.319846 1.319699 1.319671 int size=256 9.071160 9.070497 9.070350 long size=256 7.428318 7.001997 6.990831 memset size=256 1.830684 1.830558 1.830533 int size=512 17.330519 17.329175 17.328520 long size=512 14.903931 14.902345 14.902329 memset size=512 3.512420 3.512139 3.512111 int size=1024 34.593734 34.592775 34.591700 long size=1024 23.804386 23.652192 24.043249 memset size=1024 6.010309 6.049034 6.052664 int size=2048 66.380036 66.374455 66.375010 long size=2048 45.094202 45.087909 45.087128 memset size=2048 11.638963 11.662794 11.664649 int size=4096 131.777427 131.764230 131.764542 long 
size=4096 88.906880 88.840758 88.887926 memset size=4096 22.882468 22.921160 22.920992 * Pentium 4 2.80GHz Ubuntu 5.10 2.6.12-10-686 #1 gcc version 4.0.2 20050808 (prerelease) (Ubuntu 4.0.1-4ubuntu9) gcc -O2 sizeof(int) = 4 sizeof(long) = 4 int size=8 0.319620 0.270326 0.288407 long size=8 0.279157 0.278571 0.339791 memset size=8 0.186439 0.192561 0.194865 int size=16 0.455448 0.459051 0.519848 long size=16 0.455193 0.451253 0.565159 memset size=16 0.257428 0.256752 0.356195 int size=32 0.732009 0.730730 0.750304 long size=32 0.731353 0.734311 0.743041 memset size=32 1.386004 1.404297 1.378161 int size=64 1.289708 1.397941 1.288536 long size=64 1.302256 1.380754 1.294904 memset size=64 2.965440 3.197489 2.958864 int size=128 3.162121 3.548065 3.158412 long size=128 3.150525 3.161121 3.153037 memset size=128 3.705133 3.739082 3.704949 int size=256 5.393701 5.415562 5.583510 long size=256 5.420254 5.367381 5.362041 memset size=256 9.246601 8.983931 9.040215 int size=512 10.219667 9.854537 9.851564 long size=512 9.906317 9.878196 10.202070 memset size=512 11.290588 11.050312 11.789231 int size=1024 19.777706 20.752631 19.846717 long size=1024 18.934663 18.870325 19.854066 memset size=1024 15.349694 15.487714 15.999638 int size=2048 28.783087 28.214086 26.228851 long size=2048 26.628890 30.611856 26.245331 memset size=2048 24.434751 24.095879 23.435490 int size=4096 53.817698 57.266583 51.547177 long size=4096 55.868670 53.012144 51.564656 memset size=4096 45.772710 40.651142 39.702063 [1] http://archives.postgresql.org/pgsql-patches/2006-01/msg00211.php -- Seneca Cunningham scunning@ca.afilias.info #include <stdio.h> #include <sys/time.h> #include <string.h> #define TYPEALIGN(ALIGNVAL,LEN) \ (((long) (LEN) + ((ALIGNVAL) - 1)) & ~((long) ((ALIGNVAL) - 1))) #define MemSetLoop(type, start, val, len) \ do \ { \ type * _start = (type *) (start); \ type * _stop = (type *) ((char *) _start + (size_t) (len)); \ \ while (_start < _stop) \ *_start++ = 0; \ } while (0) #define 
MAXALIGN 8 #define MAXSIZE 4096 #define LOOP (1000*1000*64) static void print_time(const char* msg, int size, const struct timeval *start, const struct timeval *end) { double t; t = (end->tv_sec - start->tv_sec) + (end->tv_usec - start->tv_usec) / 1000000.0; printf("%s (size=%d) : %f\n", msg, size, t); } #define TEST(type, size) \ do { \ int i; \ gettimeofday(&start, NULL); \ for(i = 0; i < LOOP; i++) \ { \ MemSetLoop(type, buffer, 0, size); \ } \ gettimeofday(&end, NULL); \ print_time("Loop by " #type, size, &start, &end); \ } while (0) #define TESTNATIVE(type, size) \ do { \ int i; \ gettimeofday(&start, NULL); \ for(i = 0; i < LOOP; i++) \ { \ memset(buffer, 0, size); \ } \ gettimeofday(&end, NULL); \ print_time("memset by " #type, size, &start, &end); \ } while (0) int main() { int j; struct timeval start, end; char buffer0[MAXSIZE + MAXALIGN]; char* buffer = (char*) TYPEALIGN(MAXALIGN, buffer0); printf("sizeof(int) = %d\n", sizeof(int)); printf("sizeof(long) = %d\n", sizeof(long)); for(j = 0; j < 3; j++) { TEST(int , 8); TESTNATIVE(int , 8); TEST(long, 8); TEST(int , 16); TESTNATIVE(int , 16); TEST(long, 16); TEST(int , 32); TESTNATIVE(int , 32); TEST(long, 32); TEST(int , 64); TESTNATIVE(int , 64); TEST(long, 64); TEST(int , 128); TESTNATIVE(int , 128); TEST(long, 128); TEST(int , 256); TESTNATIVE(int , 256); TEST(long, 256); TEST(int , 512); TESTNATIVE(int , 512); TEST(long, 512); TEST(int , 1024); TESTNATIVE(int , 1024); TEST(long, 1024); TEST(int , 2048); TESTNATIVE(int , 2048); TEST(long, 2048); TEST(int , 4096); TESTNATIVE(int , 4096); TEST(long, 4096); } return 0; }
On Tue, Jan 24, 2006 at 05:24:28PM -0500, Seneca Cunningham wrote: > After reading the post on -patches proposing that MemSet be changed to > use long instead of int32 on the grounds that a pair of x86-64 linux > boxes took less time to execute the long code 64*10^6 times[1], I took a > look at how the testcode performed on AIX with gcc. While the switch to > long did result in a minor performance improvement, dropping the > MemSetLoop in favour of the native memset resulted in the tests taking > ~25% the time as the MemSetLoop-like int loop. The 32-bit linux system I > ran the expanded tests on showed that for the buffer size range that > postgres can use the looping MemSet instead of memset (size <= 1024 > bytes), MemSet generally had better performance. Could you please check the asm output to see what's going on? We've had tests like these produce odd results in the past because the compiler optimised away stuff that didn't have any effect. Since every memset after the first is a no-op (the buffer is already all zeroes), you want to make sure it's still actually doing the work... Have a nice day, -- Martijn van Oosterhout <kleptog@svana.org> http://svana.org/kleptog/ > Patent. n. Genius is 5% inspiration and 95% perspiration. A patent is a > tool for doing 5% of the work and then sitting around waiting for someone > else to do the other 95% so you can sue them.
Martijn van Oosterhout wrote: > On Tue, Jan 24, 2006 at 05:24:28PM -0500, Seneca Cunningham wrote: > >>After reading the post on -patches proposing that MemSet be changed to >>use long instead of int32 on the grounds that a pair of x86-64 linux >>boxes took less time to execute the long code 64*10^6 times[1], I took a >>look at how the testcode performed on AIX with gcc. While the switch to >>long did result in a minor performance improvement, dropping the >>MemSetLoop in favour of the native memset resulted in the tests taking >>~25% the time as the MemSetLoop-like int loop. The 32-bit linux system I >>ran the expanded tests on showed that for the buffer size range that >>postgres can use the looping MemSet instead of memset (size <= 1024 >>bytes), MemSet generally had better performance. > > > Could you please check the asm output to see what's going on. We've had > tests like these produce odd results in the past because the compiler > optimised away stuff that didn't have any effect. Since every memset > after the first is a no-op, you want to make sure it's still actually > doing the work... Well, on both linux and AIX, all 30 of the 64000000-iteration loops from the source exist (10 int, 10 long, 10 memset). According to my understanding of the assembler, memset itself is only called for sizes >= 64 bytes on both platforms, and for those sizes it is called in each iteration. 
The assembler for the 64-byte loops, with line numbers prepended; the first loop is the MemSetLoop int-variant, the second loop memset, the third loop the MemSetLoop long-variant: 64-bit AIX: 419 addi 3,1,112 420 li 4,0 421 bl .gettimeofday 422 nop 423 lis 10,0x3d0 424 cmpld 6,26,16 425 li 11,0 426 ori 10,10,36864 427 L..41: 428 bge 6,L..42 429 mr 9,26 430 li 0,0 431 L..44: 432 stw 0,0(9) 433 addi 9,9,4 434 cmpld 7,16,9 435 bgt 7,L..44 436 L..42: 437 addi 0,11,1 438 extsw 11,0 439 cmpw 7,11,10 440 bne+ 7,L..41 441 li 4,0 442 mr 3,22 443 lis 25,0x3d0 444 li 28,0 445 bl .gettimeofday 446 nop 447 li 4,64 448 addi 5,1,112 449 ld 3,LC..9(2) 450 mr 6,22 451 ori 25,25,36864 452 bl .print_time 453 addi 3,1,112 454 li 4,0 455 bl .gettimeofday 456 nop 457 L..46: 458 mr 3,26 459 li 4,0 460 li 5,64 461 bl .memset 462 nop 463 addi 0,28,1 464 extsw 28,0 465 cmpw 7,28,25 466 bne+ 7,L..46 467 li 4,0 468 mr 3,22 469 bl .gettimeofday 470 nop 471 li 4,64 472 addi 5,1,112 473 ld 3,LC..11(2) 474 mr 6,22 475 bl .print_time 476 addi 3,1,112 477 li 4,0 478 bl .gettimeofday 479 nop 480 lis 10,0x3d0 481 cmpld 6,26,16 482 li 11,0 483 ori 10,10,36864 484 L..48: 485 bge 6,L..49 486 mr 9,26 487 li 0,0 488 L..51: 489 std 0,0(9) 490 addi 9,9,8 491 cmpld 7,9,16 492 blt 7,L..51 493 L..49: 494 addi 0,11,1 495 extsw 11,0 496 cmpw 7,11,10 497 bne+ 7,L..48 498 li 4,0 499 mr 3,22 500 bl .gettimeofday 501 nop 502 li 4,64 503 addi 5,1,112 504 ld 3,LC..13(2) 505 mr 6,22 506 bl .print_time 32-bit Linux: 387 popl %ecx 388 popl %edi 389 pushl $0 390 leal -20(%ebp), %edx 391 pushl %edx 392 call gettimeofday 393 xorl %edx, %edx 394 addl $16, %esp 395 .L41: 396 movl -4160(%ebp), %eax 397 cmpl %eax, -4144(%ebp) 398 jae .L42 399 movl -4144(%ebp), %eax 400 .L44: 401 movl $0, (%eax) 402 addl $4, %eax 403 cmpl %eax, -4160(%ebp) 404 ja .L44 405 .L42: 406 incl %edx 407 cmpl $64000000, %edx 408 jne .L41 409 subl $8, %esp 410 pushl $0 411 leal -28(%ebp), %edx 412 pushl %edx 413 call gettimeofday 414 leal -28(%ebp), %eax 415 movl %eax, (%esp) 416 
leal -20(%ebp), %ecx 417 movl $64, %edx 418 movl $.LC5, %eax 419 call print_time 420 popl %eax 421 popl %edx 422 pushl $0 423 leal -20(%ebp), %edx 424 pushl %edx 425 call gettimeofday 426 xorl %edi, %edi 427 addl $16, %esp 428 .L46: 429 pushl %eax 430 pushl $64 431 pushl $0 432 movl -4144(%ebp),%ecx 433 pushl %ecx 434 call memset 435 incl %edi 436 addl $16, %esp 437 cmpl $64000000, %edi 438 jne .L46 439 subl $8, %esp 440 pushl $0 441 leal -28(%ebp),%eax 442 pushl %eax 443 call gettimeofday 444 leal -28(%ebp), %edx 445 movl %edx, (%esp) 446 leal -20(%ebp), %ecx 447 movl $64, %edx 448 movl $.LC6, %eax 449 call print_time 450 popl %eax 451 popl %edx 452 pushl $0 453 leal -20(%ebp), %eax 454 pushl %eax 455 call gettimeofday 456 xorl %edx, %edx 457 addl $16, %esp 458 .L48: 459 movl -4160(%ebp), %eax 460 cmpl %eax, -4144(%ebp) 461 jae .L49 462 movl -4144(%ebp), %eax 463 .L51: 464 movl $0, (%eax) 465 addl $4, %eax 466 cmpl -4160(%ebp), %eax 467 jb .L51 468 .L49: 469 incl %edx 470 cmpl $64000000, %edx 471 jne .L48 472 subl $8, %esp 473 pushl $0 474 leal -28(%ebp), %edx 475 pushl %edx 476 call gettimeofday 477 leal -28(%ebp), %eax 478 movl %eax, (%esp) 479 leal -20(%ebp), %ecx 480 movl $64, %edx 481 movl $.LC7, %eax 482 call print_time -- Seneca Cunningham scunning@ca.afilias.info
My guess is that there is some really fast assembler for memory copy on AIX, and only libc memset() has it. If you want, we can make MEMSET_LOOP_LIMIT in c.h a configure value, and allow template/aix to set it to zero, causing memset() to be always used. Are you prepared to make this optimization decision for all AIX users using gcc, or only for certain versions? --------------------------------------------------------------------------- Seneca Cunningham wrote: > Martijn van Oosterhout wrote: > > On Tue, Jan 24, 2006 at 05:24:28PM -0500, Seneca Cunningham wrote: > > > >>After reading the post on -patches proposing that MemSet be changed to > >>use long instead of int32 on the grounds that a pair of x86-64 linux > >>boxes took less time to execute the long code 64*10^6 times[1], I took a > >>look at how the testcode performed on AIX with gcc. While the switch to > >>long did result in a minor performance improvement, dropping the > >>MemSetLoop in favour of the native memset resulted in the tests taking > >>~25% the time as the MemSetLoop-like int loop. The 32-bit linux system I > >>ran the expanded tests on showed that for the buffer size range that > >>postgres can use the looping MemSet instead of memset (size <= 1024 > >>bytes), MemSet generally had better performance. > > > > > > Could you please check the asm output to see what's going on. We've had > > tests like these produce odd results in the past because the compiler > > optimised away stuff that didn't have any effect. Since every memset > > after the first is a no-op, you want to make sure it's still actually > > doing the work... > > Well, on both linux and AIX, all 30 of the 64000000 iterations loops > from the source exist (10 int, 10 long, 10 memset). According to my > understanding of the assembler, memset itself is only called for values > >= 64 bytes on both platforms and the memset is called in each iteration. 
> > The assembler for the 64 byte loops, with prepended line number, first > loop MemSetLoop int-variant, second loop memset, third loop MemSetLoop > long-variant: > > 64-bit AIX: > > 419 addi 3,1,112 > 420 li 4,0 > 421 bl .gettimeofday > 422 nop > 423 lis 10,0x3d0 > 424 cmpld 6,26,16 > 425 li 11,0 > 426 ori 10,10,36864 > 427 L..41: > 428 bge 6,L..42 > 429 mr 9,26 > 430 li 0,0 > 431 L..44: > 432 stw 0,0(9) > 433 addi 9,9,4 > 434 cmpld 7,16,9 > 435 bgt 7,L..44 > 436 L..42: > 437 addi 0,11,1 > 438 extsw 11,0 > 439 cmpw 7,11,10 > 440 bne+ 7,L..41 > 441 li 4,0 > 442 mr 3,22 > 443 lis 25,0x3d0 > 444 li 28,0 > 445 bl .gettimeofday > 446 nop > 447 li 4,64 > 448 addi 5,1,112 > 449 ld 3,LC..9(2) > 450 mr 6,22 > 451 ori 25,25,36864 > 452 bl .print_time > 453 addi 3,1,112 > 454 li 4,0 > 455 bl .gettimeofday > 456 nop > 457 L..46: > 458 mr 3,26 > 459 li 4,0 > 460 li 5,64 > 461 bl .memset > 462 nop > 463 addi 0,28,1 > 464 extsw 28,0 > 465 cmpw 7,28,25 > 466 bne+ 7,L..46 > 467 li 4,0 > 468 mr 3,22 > 469 bl .gettimeofday > 470 nop > 471 li 4,64 > 472 addi 5,1,112 > 473 ld 3,LC..11(2) > 474 mr 6,22 > 475 bl .print_time > 476 addi 3,1,112 > 477 li 4,0 > 478 bl .gettimeofday > 479 nop > 480 lis 10,0x3d0 > 481 cmpld 6,26,16 > 482 li 11,0 > 483 ori 10,10,36864 > 484 L..48: > 485 bge 6,L..49 > 486 mr 9,26 > 487 li 0,0 > 488 L..51: > 489 std 0,0(9) > 490 addi 9,9,8 > 491 cmpld 7,9,16 > 492 blt 7,L..51 > 493 L..49: > 494 addi 0,11,1 > 495 extsw 11,0 > 496 cmpw 7,11,10 > 497 bne+ 7,L..48 > 498 li 4,0 > 499 mr 3,22 > 500 bl .gettimeofday > 501 nop > 502 li 4,64 > 503 addi 5,1,112 > 504 ld 3,LC..13(2) > 505 mr 6,22 > 506 bl .print_time > > > 32-bit Linux: > > 387 popl %ecx > 388 popl %edi > 389 pushl $0 > 390 leal -20(%ebp), %edx > 391 pushl %edx > 392 call gettimeofday > 393 xorl %edx, %edx > 394 addl $16, %esp > 395 .L41: > 396 movl -4160(%ebp), %eax > 397 cmpl %eax, -4144(%ebp) > 398 jae .L42 > 399 movl -4144(%ebp), %eax > 400 .L44: > 401 movl $0, (%eax) > 402 addl $4, %eax > 403 cmpl 
%eax, -4160(%ebp) > 404 ja .L44 > 405 .L42: > 406 incl %edx > 407 cmpl $64000000, %edx > 408 jne .L41 > 409 subl $8, %esp > 410 pushl $0 > 411 leal -28(%ebp), %edx > 412 pushl %edx > 413 call gettimeofday > 414 leal -28(%ebp), %eax > 415 movl %eax, (%esp) > 416 leal -20(%ebp), %ecx > 417 movl $64, %edx > 418 movl $.LC5, %eax > 419 call print_time > 420 popl %eax > 421 popl %edx > 422 pushl $0 > 423 leal -20(%ebp), %edx > 424 pushl %edx > 425 call gettimeofday > 426 xorl %edi, %edi > 427 addl $16, %esp > 428 .L46: > 429 pushl %eax > 430 pushl $64 > 431 pushl $0 > 432 movl -4144(%ebp), %ecx > 433 pushl %ecx > 434 call memset > 435 incl %edi > 436 addl $16, %esp > 437 cmpl $64000000, %edi > 438 jne .L46 > 439 subl $8, %esp > 440 pushl $0 > 441 leal -28(%ebp), %eax > 442 pushl %eax > 443 call gettimeofday > 444 leal -28(%ebp), %edx > 445 movl %edx, (%esp) > 446 leal -20(%ebp), %ecx > 447 movl $64, %edx > 448 movl $.LC6, %eax > 449 call print_time > 450 popl %eax > 451 popl %edx > 452 pushl $0 > 453 leal -20(%ebp), %eax > 454 pushl %eax > 455 call gettimeofday > 456 xorl %edx, %edx > 457 addl $16, %esp > 458 .L48: > 459 movl -4160(%ebp), %eax > 460 cmpl %eax, -4144(%ebp) > 461 jae .L49 > 462 movl -4144(%ebp), %eax > 463 .L51: > 464 movl $0, (%eax) > 465 addl $4, %eax > 466 cmpl -4160(%ebp), %eax > 467 jb .L51 > 468 .L49: > 469 incl %edx > 470 cmpl $64000000, %edx > 471 jne .L48 > 472 subl $8, %esp > 473 pushl $0 > 474 leal -28(%ebp), %edx > 475 pushl %edx > 476 call gettimeofday > 477 leal -28(%ebp), %eax > 478 movl %eax, (%esp) > 479 leal -20(%ebp), %ecx > 480 movl $64, %edx > 481 movl $.LC7, %eax > 482 call print_time > > -- > Seneca Cunningham > scunning@ca.afilias.info > > ---------------------------(end of broadcast)--------------------------- > TIP 5: don't forget to increase your free space map settings > -- Bruce Momjian | http://candle.pha.pa.us pgman@candle.pha.pa.us | (610) 359-1001+ If your life is a hard drive, | 13 Roberts Road + Christ can be your backup. 
| Newtown Square, Pennsylvania 19073
I wanted to chime in that I also see this speedup from using XLC 6.0 (IBM's cc), even in 32bit mode. I have tested on AIX 5.2 and 5.1. I think this would be good to include in the regular release. Not sure how many people are running older versions of AIX that would want a new version of postgres. -rocco > -----Original Message----- > From: pgsql-hackers-owner@postgresql.org > [mailto:pgsql-hackers-owner@postgresql.org] On Behalf Of Bruce Momjian > Sent: Wednesday, February 01, 2006 12:11 PM > To: Seneca Cunningham > Cc: Martijn van Oosterhout; pgsql-hackers@postgresql.org > Subject: Re: [HACKERS] Some platform-specific MemSet research > > > > My guess is that there is some really fast assembler for > memory copy on > AIX, and only libc memset() has it. If you want, we can make > MEMSET_LOOP_LIMIT in c.h a configure value, and allow template/aix to > set it to zero, causing memset() to be always used. > > Are you prepared to make this optimization decision for all AIX users > using gcc, or only for certain versions? > > -------------------------------------------------------------- > ------------- > > Seneca Cunningham wrote: > > Martijn van Oosterhout wrote: > > > On Tue, Jan 24, 2006 at 05:24:28PM -0500, Seneca Cunningham wrote: > > > > > >>After reading the post on -patches proposing that MemSet > be changed to > > >>use long instead of int32 on the grounds that a pair of > x86-64 linux > > >>boxes took less time to execute the long code 64*10^6 > times[1], I took a > > >>look at how the testcode performed on AIX with gcc. > While the switch to > > >>long did result in a minor performance improvement, dropping the > > >>MemSetLoop in favour of the native memset resulted in the > tests taking > > >>~25% the time as the MemSetLoop-like int loop. 
The 32-bit > linux system I > > >>ran the expanded tests on showed that for the buffer size > range that > > >>postgres can use the looping MemSet instead of memset > (size <= 1024 > > >>bytes), MemSet generally had better performance. > > > > > > > > > Could you please check the asm output to see what's going > on. We've had > > > tests like these produce odd results in the past because > the compiler > > > optimised away stuff that didn't have any effect. Since > every memset > > > after the first is a no-op, you want to make sure it's > still actually > > > doing the work... > > > > Well, on both linux and AIX, all 30 of the 64000000 iterations loops > > from the source exist (10 int, 10 long, 10 memset). According to my > > understanding of the assembler, memset itself is only > called for values > > >= 64 bytes on both platforms and the memset is called in > each iteration. > > > > The assembler for the 64 byte loops, with prepended line > number, first > > loop MemSetLoop int-variant, second loop memset, third loop > MemSetLoop > > long-variant: > > > > 64-bit AIX: > > > > 419 addi 3,1,112 > > 420 li 4,0 > > 421 bl .gettimeofday > > 422 nop > > 423 lis 10,0x3d0 > > 424 cmpld 6,26,16 > > 425 li 11,0 > > 426 ori 10,10,36864 > > 427 L..41: > > 428 bge 6,L..42 > > 429 mr 9,26 > > 430 li 0,0 > > 431 L..44: > > 432 stw 0,0(9) > > 433 addi 9,9,4 > > 434 cmpld 7,16,9 > > 435 bgt 7,L..44 > > 436 L..42: > > 437 addi 0,11,1 > > 438 extsw 11,0 > > 439 cmpw 7,11,10 > > 440 bne+ 7,L..41 > > 441 li 4,0 > > 442 mr 3,22 > > 443 lis 25,0x3d0 > > 444 li 28,0 > > 445 bl .gettimeofday > > 446 nop > > 447 li 4,64 > > 448 addi 5,1,112 > > 449 ld 3,LC..9(2) > > 450 mr 6,22 > > 451 ori 25,25,36864 > > 452 bl .print_time > > 453 addi 3,1,112 > > 454 li 4,0 > > 455 bl .gettimeofday > > 456 nop > > 457 L..46: > > 458 mr 3,26 > > 459 li 4,0 > > 460 li 5,64 > > 461 bl .memset > > 462 nop > > 463 addi 0,28,1 > > 464 extsw 28,0 > > 465 cmpw 7,28,25 > > 466 bne+ 7,L..46 > > 467 li 4,0 > > 
468 mr 3,22 > > 469 bl .gettimeofday > > 470 nop > > 471 li 4,64 > > 472 addi 5,1,112 > > 473 ld 3,LC..11(2) > > 474 mr 6,22 > > 475 bl .print_time > > 476 addi 3,1,112 > > 477 li 4,0 > > 478 bl .gettimeofday > > 479 nop > > 480 lis 10,0x3d0 > > 481 cmpld 6,26,16 > > 482 li 11,0 > > 483 ori 10,10,36864 > > 484 L..48: > > 485 bge 6,L..49 > > 486 mr 9,26 > > 487 li 0,0 > > 488 L..51: > > 489 std 0,0(9) > > 490 addi 9,9,8 > > 491 cmpld 7,9,16 > > 492 blt 7,L..51 > > 493 L..49: > > 494 addi 0,11,1 > > 495 extsw 11,0 > > 496 cmpw 7,11,10 > > 497 bne+ 7,L..48 > > 498 li 4,0 > > 499 mr 3,22 > > 500 bl .gettimeofday > > 501 nop > > 502 li 4,64 > > 503 addi 5,1,112 > > 504 ld 3,LC..13(2) > > 505 mr 6,22 > > 506 bl .print_time > > > > > > 32-bit Linux: > > > > 387 popl %ecx > > 388 popl %edi > > 389 pushl $0 > > 390 leal -20(%ebp), %edx > > 391 pushl %edx > > 392 call gettimeofday > > 393 xorl %edx, %edx > > 394 addl $16, %esp > > 395 .L41: > > 396 movl -4160(%ebp), %eax > > 397 cmpl %eax, -4144(%ebp) > > 398 jae .L42 > > 399 movl -4144(%ebp), %eax > > 400 .L44: > > 401 movl $0, (%eax) > > 402 addl $4, %eax > > 403 cmpl %eax, -4160(%ebp) > > 404 ja .L44 > > 405 .L42: > > 406 incl %edx > > 407 cmpl $64000000, %edx > > 408 jne .L41 > > 409 subl $8, %esp > > 410 pushl $0 > > 411 leal -28(%ebp), %edx > > 412 pushl %edx > > 413 call gettimeofday > > 414 leal -28(%ebp), %eax > > 415 movl %eax, (%esp) > > 416 leal -20(%ebp), %ecx > > 417 movl $64, %edx > > 418 movl $.LC5, %eax > > 419 call print_time > > 420 popl %eax > > 421 popl %edx > > 422 pushl $0 > > 423 leal -20(%ebp), %edx > > 424 pushl %edx > > 425 call gettimeofday > > 426 xorl %edi, %edi > > 427 addl $16, %esp > > 428 .L46: > > 429 pushl %eax > > 430 pushl $64 > > 431 pushl $0 > > 432 movl -4144(%ebp), %ecx > > 433 pushl %ecx > > 434 call memset > > 435 incl %edi > > 436 addl $16, %esp > > 437 cmpl $64000000, %edi > > 438 jne .L46 > > 439 subl $8, %esp > > 440 pushl $0 > > 441 leal -28(%ebp), %eax > > 442 pushl %eax > > 
443 call gettimeofday > > 444 leal -28(%ebp), %edx > > 445 movl %edx, (%esp) > > 446 leal -20(%ebp), %ecx > > 447 movl $64, %edx > > 448 movl $.LC6, %eax > > 449 call print_time > > 450 popl %eax > > 451 popl %edx > > 452 pushl $0 > > 453 leal -20(%ebp), %eax > > 454 pushl %eax > > 455 call gettimeofday > > 456 xorl %edx, %edx > > 457 addl $16, %esp > > 458 .L48: > > 459 movl -4160(%ebp), %eax > > 460 cmpl %eax, -4144(%ebp) > > 461 jae .L49 > > 462 movl -4144(%ebp), %eax > > 463 .L51: > > 464 movl $0, (%eax) > > 465 addl $4, %eax > > 466 cmpl -4160(%ebp), %eax > > 467 jb .L51 > > 468 .L49: > > 469 incl %edx > > 470 cmpl $64000000, %edx > > 471 jne .L48 > > 472 subl $8, %esp > > 473 pushl $0 > > 474 leal -28(%ebp), %edx > > 475 pushl %edx > > 476 call gettimeofday > > 477 leal -28(%ebp), %eax > > 478 movl %eax, (%esp) > > 479 leal -20(%ebp), %ecx > > 480 movl $64, %edx > > 481 movl $.LC7, %eax > > 482 call print_time > > > > -- > > Seneca Cunningham > > scunning@ca.afilias.info > > > > ---------------------------(end of > broadcast)--------------------------- > > TIP 5: don't forget to increase your free space map settings > > > > -- > Bruce Momjian | http://candle.pha.pa.us > pgman@candle.pha.pa.us | (610) 359-1001 > + If your life is a hard drive, | 13 Roberts Road > + Christ can be your backup. | Newtown Square, > Pennsylvania 19073 > > ---------------------------(end of > broadcast)--------------------------- > TIP 9: In versions below 8.0, the planner will ignore your desire to > choose an index scan if your joining column's datatypes do not > match >
Rocco Altier wrote: > I wanted to chime in that I also see this speedup from using XLC 6.0 > (IBM's cc), even in 32bit mode. I have tested on AIX 5.2 and 5.1. > > I think this would be good to include in the regular release. > > Not sure how many people are running older versions of AIX that would > want a new version of postgres. > OK, perfect. I will work on making it platform-specific and report back. --------------------------------------------------------------------------- > -rocco > > > > > -----Original Message----- > > From: pgsql-hackers-owner@postgresql.org > > [mailto:pgsql-hackers-owner@postgresql.org] On Behalf Of Bruce Momjian > > Sent: Wednesday, February 01, 2006 12:11 PM > > To: Seneca Cunningham > > Cc: Martijn van Oosterhout; pgsql-hackers@postgresql.org > > Subject: Re: [HACKERS] Some platform-specific MemSet research > > > > > > > > My guess is that there is some really fast assembler for > > memory copy on > > AIX, and only libc memset() has it. If you want, we can make > > MEMSET_LOOP_LIMIT in c.h a configure value, and allow template/aix to > > set it to zero, causing memset() to be always used. > > > > Are you prepared to make this optimization decision for all AIX users > > using gcc, or only for certain versions? > > > > -------------------------------------------------------------- > > ------------- > > > > Seneca Cunningham wrote: > > > Martijn van Oosterhout wrote: > > > > On Tue, Jan 24, 2006 at 05:24:28PM -0500, Seneca Cunningham wrote: > > > > > > > >>After reading the post on -patches proposing that MemSet > > be changed to > > > >>use long instead of int32 on the grounds that a pair of > > x86-64 linux > > > >>boxes took less time to execute the long code 64*10^6 > > times[1], I took a > > > >>look at how the testcode performed on AIX with gcc. 
> > While the switch to > > > >>long did result in a minor performance improvement, dropping the > > > >>MemSetLoop in favour of the native memset resulted in the > > tests taking > > > >>~25% the time as the MemSetLoop-like int loop. The 32-bit > > linux system I > > > >>ran the expanded tests on showed that for the buffer size > > range that > > > >>postgres can use the looping MemSet instead of memset > > (size <= 1024 > > > >>bytes), MemSet generally had better performance. > > > > > > > > > > > > Could you please check the asm output to see what's going > > on. We've had > > > > tests like these produce odd results in the past because > > the compiler > > > > optimised away stuff that didn't have any effect. Since > > every memset > > > > after the first is a no-op, you want to make sure it's > > still actually > > > > doing the work... > > > > > > Well, on both linux and AIX, all 30 of the 64000000 iterations loops > > > from the source exist (10 int, 10 long, 10 memset). According to my > > > understanding of the assembler, memset itself is only > > called for values > > > >= 64 bytes on both platforms and the memset is called in > > each iteration. 
> > > > > > The assembler for the 64 byte loops, with prepended line > > number, first > > > loop MemSetLoop int-variant, second loop memset, third loop > > MemSetLoop > > > long-variant: > > > > > > 64-bit AIX: > > > > > > 419 addi 3,1,112 > > > 420 li 4,0 > > > 421 bl .gettimeofday > > > 422 nop > > > 423 lis 10,0x3d0 > > > 424 cmpld 6,26,16 > > > 425 li 11,0 > > > 426 ori 10,10,36864 > > > 427 L..41: > > > 428 bge 6,L..42 > > > 429 mr 9,26 > > > 430 li 0,0 > > > 431 L..44: > > > 432 stw 0,0(9) > > > 433 addi 9,9,4 > > > 434 cmpld 7,16,9 > > > 435 bgt 7,L..44 > > > 436 L..42: > > > 437 addi 0,11,1 > > > 438 extsw 11,0 > > > 439 cmpw 7,11,10 > > > 440 bne+ 7,L..41 > > > 441 li 4,0 > > > 442 mr 3,22 > > > 443 lis 25,0x3d0 > > > 444 li 28,0 > > > 445 bl .gettimeofday > > > 446 nop > > > 447 li 4,64 > > > 448 addi 5,1,112 > > > 449 ld 3,LC..9(2) > > > 450 mr 6,22 > > > 451 ori 25,25,36864 > > > 452 bl .print_time > > > 453 addi 3,1,112 > > > 454 li 4,0 > > > 455 bl .gettimeofday > > > 456 nop > > > 457 L..46: > > > 458 mr 3,26 > > > 459 li 4,0 > > > 460 li 5,64 > > > 461 bl .memset > > > 462 nop > > > 463 addi 0,28,1 > > > 464 extsw 28,0 > > > 465 cmpw 7,28,25 > > > 466 bne+ 7,L..46 > > > 467 li 4,0 > > > 468 mr 3,22 > > > 469 bl .gettimeofday > > > 470 nop > > > 471 li 4,64 > > > 472 addi 5,1,112 > > > 473 ld 3,LC..11(2) > > > 474 mr 6,22 > > > 475 bl .print_time > > > 476 addi 3,1,112 > > > 477 li 4,0 > > > 478 bl .gettimeofday > > > 479 nop > > > 480 lis 10,0x3d0 > > > 481 cmpld 6,26,16 > > > 482 li 11,0 > > > 483 ori 10,10,36864 > > > 484 L..48: > > > 485 bge 6,L..49 > > > 486 mr 9,26 > > > 487 li 0,0 > > > 488 L..51: > > > 489 std 0,0(9) > > > 490 addi 9,9,8 > > > 491 cmpld 7,9,16 > > > 492 blt 7,L..51 > > > 493 L..49: > > > 494 addi 0,11,1 > > > 495 extsw 11,0 > > > 496 cmpw 7,11,10 > > > 497 bne+ 7,L..48 > > > 498 li 4,0 > > > 499 mr 3,22 > > > 500 bl .gettimeofday > > > 501 nop > > > 502 li 4,64 > > > 503 addi 5,1,112 > > > 504 ld 3,LC..13(2) > > > 505 mr 
6,22 > > > 506 bl .print_time > > > > > > > > > 32-bit Linux: > > > > > > 387 popl %ecx > > > 388 popl %edi > > > 389 pushl $0 > > > 390 leal -20(%ebp), %edx > > > 391 pushl %edx > > > 392 call gettimeofday > > > 393 xorl %edx, %edx > > > 394 addl $16, %esp > > > 395 .L41: > > > 396 movl -4160(%ebp), %eax > > > 397 cmpl %eax, -4144(%ebp) > > > 398 jae .L42 > > > 399 movl -4144(%ebp), %eax > > > 400 .L44: > > > 401 movl $0, (%eax) > > > 402 addl $4, %eax > > > 403 cmpl %eax, -4160(%ebp) > > > 404 ja .L44 > > > 405 .L42: > > > 406 incl %edx > > > 407 cmpl $64000000, %edx > > > 408 jne .L41 > > > 409 subl $8, %esp > > > 410 pushl $0 > > > 411 leal -28(%ebp), %edx > > > 412 pushl %edx > > > 413 call gettimeofday > > > 414 leal -28(%ebp), %eax > > > 415 movl %eax, (%esp) > > > 416 leal -20(%ebp), %ecx > > > 417 movl $64, %edx > > > 418 movl $.LC5, %eax > > > 419 call print_time > > > 420 popl %eax > > > 421 popl %edx > > > 422 pushl $0 > > > 423 leal -20(%ebp), %edx > > > 424 pushl %edx > > > 425 call gettimeofday > > > 426 xorl %edi, %edi > > > 427 addl $16, %esp > > > 428 .L46: > > > 429 pushl %eax > > > 430 pushl $64 > > > 431 pushl $0 > > > 432 movl -4144(%ebp), %ecx > > > 433 pushl %ecx > > > 434 call memset > > > 435 incl %edi > > > 436 addl $16, %esp > > > 437 cmpl $64000000, %edi > > > 438 jne .L46 > > > 439 subl $8, %esp > > > 440 pushl $0 > > > 441 leal -28(%ebp), %eax > > > 442 pushl %eax > > > 443 call gettimeofday > > > 444 leal -28(%ebp), %edx > > > 445 movl %edx, (%esp) > > > 446 leal -20(%ebp), %ecx > > > 447 movl $64, %edx > > > 448 movl $.LC6, %eax > > > 449 call print_time > > > 450 popl %eax > > > 451 popl %edx > > > 452 pushl $0 > > > 453 leal -20(%ebp), %eax > > > 454 pushl %eax > > > 455 call gettimeofday > > > 456 xorl %edx, %edx > > > 457 addl $16, %esp > > > 458 .L48: > > > 459 movl -4160(%ebp), %eax > > > 460 cmpl %eax, -4144(%ebp) > > > 461 jae .L49 > > > 462 movl -4144(%ebp), %eax > > > 463 .L51: > > > 464 movl $0, (%eax) > > > 465 addl $4, 
%eax > > > 466 cmpl -4160(%ebp), %eax > > > 467 jb .L51 > > > 468 .L49: > > > 469 incl %edx > > > 470 cmpl $64000000, %edx > > > 471 jne .L48 > > > 472 subl $8, %esp > > > 473 pushl $0 > > > 474 leal -28(%ebp), %edx > > > 475 pushl %edx > > > 476 call gettimeofday > > > 477 leal -28(%ebp), %eax > > > 478 movl %eax, (%esp) > > > 479 leal -20(%ebp), %ecx > > > 480 movl $64, %edx > > > 481 movl $.LC7, %eax > > > 482 call print_time > > > > > > -- > > > Seneca Cunningham > > > scunning@ca.afilias.info > > > > > > ---------------------------(end of > > broadcast)--------------------------- > > > TIP 5: don't forget to increase your free space map settings > > > > > > > -- > > Bruce Momjian | http://candle.pha.pa.us > > pgman@candle.pha.pa.us | (610) 359-1001 > > + If your life is a hard drive, | 13 Roberts Road > > + Christ can be your backup. | Newtown Square, > > Pennsylvania 19073 > > > > ---------------------------(end of > > broadcast)--------------------------- > > TIP 9: In versions below 8.0, the planner will ignore your desire to > > choose an index scan if your joining column's datatypes do not > > match > > > > ---------------------------(end of broadcast)--------------------------- > TIP 6: explain analyze is your friend > -- Bruce Momjian | http://candle.pha.pa.us pgman@candle.pha.pa.us | (610) 359-1001+ If your life is a hard drive, | 13 Roberts Road + Christ can be your backup. | Newtown Square, Pennsylvania19073