diff --git a/src/port/pg_popcount_avx512.c b/src/port/pg_popcount_avx512.c index f86558d1ee5..e1fbd98fa14 100644 --- a/src/port/pg_popcount_avx512.c +++ b/src/port/pg_popcount_avx512.c @@ -30,20 +30,44 @@ uint64 pg_popcount_avx512(const char *buf, int bytes) { - uint64 popcnt; + __m512i val, cnt; __m512i accum = _mm512_setzero_si512(); + const char *final; + int tail_idx; + __mmask64 mask = -1; - for (; bytes >= sizeof(__m512i); bytes -= sizeof(__m512i)) - { - const __m512i val = _mm512_loadu_si512((const __m512i *) buf); - const __m512i cnt = _mm512_popcnt_epi64(val); + /* + * Align buffer down to avoid double load overhead from unaligned access. + * Calculate a mask to ignore preceding bytes. Find start offset of final + * iteration and number of valid bytes making sure that final iteration + * is not empty. + */ + mask <<= ((uintptr_t) buf) % sizeof(__m512i); + tail_idx = (((uintptr_t) buf + bytes - 1) % sizeof(__m512i)) + 1; + final = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf + bytes - 1); + buf = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf); + /* + * Iterate through all but the final iteration. Starting from second + * iteration, the start index mask is ignored. + */ + for (; buf < final; buf += sizeof(__m512i)) + { + val = _mm512_maskz_loadu_epi8(mask, (const __m512i *) buf); + cnt = _mm512_popcnt_epi64(val); accum = _mm512_add_epi64(accum, cnt); - buf += sizeof(__m512i); + + mask = -1; } - popcnt = _mm512_reduce_add_epi64(accum); - return popcnt + pg_popcount_fast(buf, bytes); + /* Final iteration needs to ignore bytes that are not within the length */ + mask &= ((~0ULL) >> (64 - tail_idx)); + + val = _mm512_maskz_loadu_epi8(mask, (const __m512i *) buf); + cnt = _mm512_popcnt_epi64(val); + accum = _mm512_add_epi64(accum, cnt); + + return _mm512_reduce_add_epi64(accum); } #endif /* TRY_POPCNT_FAST */