*** a/src/backend/storage/page/bufpage.c --- b/src/backend/storage/page/bufpage.c *************** *** 944,980 **** PageSetChecksumInplace(Page page, BlockNumber blkno) * Note that if the checksum validation fails we cannot tell the difference * between a transposed block and failure from direct on-block corruption, * though that is better than just ignoring transposed blocks altogether. */ static uint16 PageCalcChecksum16(Page page, BlockNumber blkno) { ! pg_crc32 crc; ! PageHeader p = (PageHeader) page; /* only calculate the checksum for properly-initialized pages */ Assert(!PageIsNew(page)); ! INIT_CRC32(crc); ! /* ! * Initialize the checksum calculation with the block number. This helps ! * catch corruption from whole blocks being transposed with other whole ! * blocks. ! */ ! COMP_CRC32(crc, &blkno, sizeof(blkno)); ! /* ! * Now add in the LSN, which is always the first field on the page. ! */ ! COMP_CRC32(crc, page, sizeof(p->pd_lsn)); ! /* ! * Now add the rest of the page, skipping the pd_checksum field. ! */ ! COMP_CRC32(crc, page + sizeof(p->pd_lsn) + sizeof(p->pd_checksum), ! BLCKSZ - sizeof(p->pd_lsn) - sizeof(p->pd_checksum)); ! FIN_CRC32(crc); ! return (uint16) crc; } --- 944,1211 ---- * Note that if the checksum validation fails we cannot tell the difference * between a transposed block and failure from direct on-block corruption, * though that is better than just ignoring transposed blocks altogether. + * + * The checksum algorithm is designed to be parallelizable on vector capable + * CPU's. The checksum is calculated in 3 phases. 
First phase aggregates 64 + * 16bit sums with the evolution function: + * + * partial_sum(-1,i) = 0 + * partial_sum(n,i) = partial_sum(n-1,i) * prime1 + ptr16Page[i+64*n] + * + * Second phase aggregates the partial sums together using a similar evolution + * function: + * + * parallel_sum(-1) = 0 + * parallel_sum(i) = parallel_sum(i-1) * prime2 + partial_sum(last,i) + * + * Third phase mixes together the parallel sum and block number and squeezes + * the output range by a modulo to avoid 0 values. The final checksum is + * calculated according to the formula: + * + * checksum = (parallel_sum * prime1 + blkno * prime2) mod trunc + 1 + * + * The values of the primes are empirically chosen; the exact value of prime 1 + * does not matter much, but prime 2 needs to be large to ensure fast mixing. */ + + #define N_SUMS 64 + #define CSUM_PRIME1 0x49 + #define CSUM_PRIME2 0x986b + #define CSUM_TRUNC 65521 + + #if defined(__GNUC__) || defined(__INTEL_COMPILER) + #if defined(__x86_64__) + /* + * For x86-64 we use vectorized assembly code to speed up the algorithm. The + * sums are calculated in parallel using vectors of 8 16bit values. Inner + * loop is fully unrolled and the sums are held in vector registers to + * pipeline multiplication latency and eliminate load-store overhead. The + * aggregation phase reorganizes computations, first multiplying each value + * by its corresponding power of prime2 and then adding up the vector + * registers in a tree configuration. Only SSE2 instructions are used so we + * don't need to check for processor capabilities. + */ + #define HAS_PLATFORM_CHECKSUM + + /* + * Initialize helper vectors. The array contains four 8x16bit vectors: + * 1. Prime 1 broadcast to a full vector + * 2. Prime 2 powers from 7..0 + * 3. Prime 2 powers from 39..32 + * 4. Prime 2 power 8 broadcast to a full vector + * Aligned to 64 bytes because we want the whole array to be on a single + * cache line. 
+ */ + #define CSUM_MUL(a,b) ((uint16) ((uint64)(a) * (uint64)(b))) + #define CSUM_PRIME2_POW2 CSUM_MUL(CSUM_PRIME2, CSUM_PRIME2) + #define CSUM_PRIME2_POW3 CSUM_MUL(CSUM_PRIME2_POW2, CSUM_PRIME2) + #define CSUM_PRIME2_POW4 CSUM_MUL(CSUM_PRIME2_POW2, CSUM_PRIME2_POW2) + #define CSUM_PRIME2_POW5 CSUM_MUL(CSUM_PRIME2_POW4, CSUM_PRIME2) + #define CSUM_PRIME2_POW6 CSUM_MUL(CSUM_PRIME2_POW4, CSUM_PRIME2_POW2) + #define CSUM_PRIME2_POW7 CSUM_MUL(CSUM_PRIME2_POW4, CSUM_PRIME2_POW3) + #define CSUM_PRIME2_POW8 CSUM_MUL(CSUM_PRIME2_POW4, CSUM_PRIME2_POW4) + #define CSUM_PRIME2_POW8 CSUM_MUL(CSUM_PRIME2_POW4, CSUM_PRIME2_POW4) /* NOTE(review): exact duplicate of the previous definition; redundant */ + #define CSUM_PRIME2_POW32 CSUM_MUL(CSUM_PRIME2_POW8, \ + CSUM_MUL(CSUM_PRIME2_POW8, \ + CSUM_MUL(CSUM_PRIME2_POW8, \ + CSUM_PRIME2_POW8))) + + static uint16 primeVectors[32] __attribute__ ((aligned (64))) = + { + CSUM_PRIME1, CSUM_PRIME1, CSUM_PRIME1, CSUM_PRIME1, + CSUM_PRIME1, CSUM_PRIME1, CSUM_PRIME1, CSUM_PRIME1, + + CSUM_PRIME2_POW7, CSUM_PRIME2_POW6, CSUM_PRIME2_POW5, CSUM_PRIME2_POW4, + CSUM_PRIME2_POW3, CSUM_PRIME2_POW2, CSUM_PRIME2, 1, + + CSUM_MUL(CSUM_PRIME2_POW7, CSUM_PRIME2_POW32), + CSUM_MUL(CSUM_PRIME2_POW6, CSUM_PRIME2_POW32), + CSUM_MUL(CSUM_PRIME2_POW5, CSUM_PRIME2_POW32), + CSUM_MUL(CSUM_PRIME2_POW4, CSUM_PRIME2_POW32), + CSUM_MUL(CSUM_PRIME2_POW3, CSUM_PRIME2_POW32), + CSUM_MUL(CSUM_PRIME2_POW2, CSUM_PRIME2_POW32), + CSUM_MUL(CSUM_PRIME2, CSUM_PRIME2_POW32), + CSUM_PRIME2_POW32, + + CSUM_PRIME2_POW8, CSUM_PRIME2_POW8, CSUM_PRIME2_POW8, CSUM_PRIME2_POW8, + CSUM_PRIME2_POW8, CSUM_PRIME2_POW8, CSUM_PRIME2_POW8, CSUM_PRIME2_POW8 + }; + static uint16 PageCalcChecksum16(Page page, BlockNumber blkno) { ! /* Parallel sum is 32bit because we can't copy out only 16 bits from xmm0 */ ! uint32 parallel_sum; ! 
uint16 checksum; /* only calculate the checksum for properly-initialized pages */ Assert(!PageIsNew(page)); + /* assembly code assumes that the checksum is at offset 8 */ + Assert(offsetof(PageHeaderData, pd_checksum) == 8); + /* assembly code assumes we aggregate 64 sums in parallel */ + Assert(N_SUMS == 64); ! __asm__ __volatile__( ! /* rdx is the iteration step, we aggregate 128bytes in loop */ ! " mov $0x80, %%rdx \n" ! /* rcx is the offset on the page */ ! " xor %%rcx, %%rcx \n" ! /* ! * Registers xmm0..7 keep the intermediate parallel checksums. We ! * initialize them with data from the page, zeroing out the checksum. ! */ ! " movdqu (%1,%%rcx,1), %%xmm0 \n" ! " pinsrw $0x4, %%ecx, %%xmm0 \n" ! " movdqu 0x10(%1,%%rcx,1), %%xmm1 \n" ! " movdqu 0x20(%1,%%rcx,1), %%xmm2 \n" ! " movdqu 0x30(%1,%%rcx,1), %%xmm3 \n" ! " movdqu 0x40(%1,%%rcx,1), %%xmm4 \n" ! " movdqu 0x50(%1,%%rcx,1), %%xmm5 \n" ! " movdqu 0x60(%1,%%rcx,1), %%xmm6 \n" ! " movdqu 0x70(%1,%%rcx,1), %%xmm7 \n" ! /* ! * Update the offset value. We use 32bit registers here for a shorter ! * instruction so the setup code length aligns with 16 bytes and the ! * loop alignment below doesn't cause too much space overhead. ! */ ! " mov %%edx, %%ecx \n" ! /* xmm9 contains prime 1 broadcasted to all positions */ ! " movdqa (%2), %%xmm9 \n" ! /* ! * Main loop, calculate hash codes in parallel, each iteration ! * multiplies the state with prime 1 and adds in 128 bytes from the ! * page. ! */ ! ".p2align 4 \n" /* align first, then define the label, so the back branch does not re-execute padding; .p2align is unambiguous across assemblers */ ! "1: \n" ! " movdqu (%1,%%rcx,1), %%xmm8 \n" ! " pmullw %%xmm9, %%xmm0 \n" ! " paddw %%xmm8, %%xmm0 \n" ! " movdqu 0x10(%1,%%rcx,1), %%xmm8\n" ! " pmullw %%xmm9, %%xmm1 \n" ! " paddw %%xmm8, %%xmm1 \n" ! " movdqu 0x20(%1,%%rcx,1), %%xmm8\n" ! " pmullw %%xmm9, %%xmm2 \n" ! " paddw %%xmm8, %%xmm2 \n" ! " movdqu 0x30(%1,%%rcx,1), %%xmm8\n" ! " pmullw %%xmm9, %%xmm3 \n" ! " paddw %%xmm8, %%xmm3 \n" ! 
" movdqu 0x40(%1,%%rcx,1), %%xmm8\n" ! " pmullw %%xmm9, %%xmm4 \n" ! " paddw %%xmm8, %%xmm4 \n" ! 
" movdqu 0x50(%1,%%rcx,1), %%xmm8\n" ! " pmullw %%xmm9, %%xmm5 \n" ! " paddw %%xmm8, %%xmm5 \n" ! " movdqu 0x60(%1,%%rcx,1), %%xmm8\n" ! " pmullw %%xmm9, %%xmm6 \n" ! " paddw %%xmm8, %%xmm6 \n" ! " movdqu 0x70(%1,%%rcx,1), %%xmm8\n" ! " pmullw %%xmm9, %%xmm7 \n" ! " paddw %%xmm8, %%xmm7 \n" ! ! /* update offset and check if we have hit page size already */ ! " add %%rdx, %%rcx \n" ! " cmp %3, %%ecx \n" ! " jnz 1b \n" ! /* ! * Aggregation phase. We store prime 2 to the power of 7..0 in xmm10, ! * to the power of 39..32 in xmm11 and to the power of 8 in xmm8. We ! * change the order of operations so that we first multiply each ! * partial checksum with the power that it has in the final value ! * (powers go from 63..0) and then add them together. This code is ! * structured to minimize dependency graph depth. The critical chain ! * has 4 multiplies and 5 adds. The final value ends up in xmm0. ! */ ! " movdqa 0x10(%2), %%xmm10 \n" ! " movdqa 0x20(%2), %%xmm11 \n" ! " movdqa 0x30(%2), %%xmm8 \n" ! ! " pmullw %%xmm10, %%xmm7 \n" ! " pmullw %%xmm8, %%xmm10 \n" ! " pmullw %%xmm10, %%xmm6 \n" ! " paddw %%xmm7, %%xmm6 \n" ! " pmullw %%xmm8, %%xmm10 \n" ! " pmullw %%xmm10, %%xmm5 \n" ! " pmullw %%xmm8, %%xmm10 \n" ! " pmullw %%xmm10, %%xmm4 \n" ! " paddw %%xmm5, %%xmm4 \n" ! " pmullw %%xmm11, %%xmm3 \n" ! " pmullw %%xmm8, %%xmm11 \n" ! " pmullw %%xmm11, %%xmm2 \n" ! " paddw %%xmm3, %%xmm2 \n" ! " pmullw %%xmm8, %%xmm11 \n" ! " pmullw %%xmm11, %%xmm1 \n" ! " pmullw %%xmm8, %%xmm11 \n" ! " pmullw %%xmm11, %%xmm0 \n" ! " paddw %%xmm1, %%xmm0 \n" ! " paddw %%xmm6, %%xmm4 \n" ! " paddw %%xmm2, %%xmm0 \n" ! " paddw %%xmm4, %%xmm0 \n" ! " movdqa %%xmm0, %%xmm1 \n" ! " psrldq $0x8, %%xmm1 \n" ! " paddw %%xmm1, %%xmm0 \n" ! " movdqa %%xmm0, %%xmm1 \n" ! " psrldq $0x4, %%xmm1 \n" ! " paddw %%xmm1, %%xmm0 \n" ! " movdqa %%xmm0, %%xmm1 \n" ! " psrldq $0x2, %%xmm1 \n" ! " paddw %%xmm1, %%xmm0 \n" ! ! /* store the checksum in output register */ ! " movd %%xmm0, %0 \n" ! ! 
: "=r"(parallel_sum) ! : "r"(page), "r"(primeVectors), "r"(BLCKSZ) ! : "rcx","rdx","xmm0","xmm1","xmm2","xmm3","xmm4", ! "xmm5","xmm6","xmm7","xmm8", "xmm9","xmm10","xmm11"); ! ! /* mask out only the resulting sum */ ! parallel_sum &= 0xFFFF; ! checksum = ((parallel_sum*CSUM_PRIME1 + blkno*CSUM_PRIME2) % CSUM_TRUNC) + 1; ! ! return checksum; } + #endif /* __x86_64__ */ + #endif /* defined(__GNUC__) || defined(__INTEL_COMPILER) */ + + #ifndef HAS_PLATFORM_CHECKSUM + /* + * Generic implementation of the checksum algorithm. The code is structured + * so vectorizing compilers can recognize the aggregation pattern. For gcc + * -funroll-loops and -ftree-vectorize will cause the main loop to be + * vectorized. + */ + static uint16 + PageCalcChecksum16(Page page, BlockNumber blkno) + { + uint16 sums[N_SUMS]; + uint16 (*pageArr)[N_SUMS] = (uint16 (*)[N_SUMS]) page; + uint16 parallel_sum = 0; + uint16 checksum; + int i, j; + + /* only calculate the checksum for properly-initialized pages */ + Assert(!PageIsNew(page)); + + /* initialize sums */ + for (j = 0; j < N_SUMS; j++) + sums[j] = (j == offsetof(PageHeaderData, pd_checksum)/sizeof(int16)) ? 0 : pageArr[0][j]; + + for (i = 1; i < BLCKSZ/sizeof(uint16)/N_SUMS; i++) + for (j = 0; j < N_SUMS; j++) + sums[j] = sums[j]*CSUM_PRIME1 + pageArr[i][j]; + + + for (i = 0; i < N_SUMS; i++) + parallel_sum = parallel_sum*CSUM_PRIME2 + sums[i]; + + checksum = (((uint32) parallel_sum*CSUM_PRIME1 + blkno*CSUM_PRIME2) % CSUM_TRUNC) + 1; + return checksum; + } + + #endif /* !HAS_PLATFORM_CHECKSUM */