From 9b1eea078af6916a0ab078b75fdc4bc48dc17093 Mon Sep 17 00:00:00 2001 From: Thomas Munro Date: Sat, 14 Dec 2019 11:36:28 +1300 Subject: [PATCH] Rotate instead of shifting hash join batch number. Our algorithm for choosing batch numbers turned out not to work effectively for multi-billion key hash tables. We would try to use more hash bits than we have, and effectively concentrate all tuples into a smaller number of batches than we intended. While ideally we should switch to wider hashes, for now, change the algorithm to one that effectively gives up bits from the bucket number when we don't have enough bits. That means we'll finish up with longer bucket chains than would be ideal, but that's better than having batches that don't fit in work_mem. Back-patch to all supported releases. Author: Thomas Munro Reported-by: James Coleman Discussion: https://postgr.es/m/16104-dc11ed911f1ab9df%40postgresql.org --- src/backend/executor/nodeHash.c | 8 +++++--- src/include/port/pg_bitutils.h | 11 +++++++++++ 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c index d6f4eda097..7deab4e2d6 100644 --- a/src/backend/executor/nodeHash.c +++ b/src/backend/executor/nodeHash.c @@ -37,6 +37,7 @@ #include "miscadmin.h" #include "pgstat.h" #include "port/atomics.h" +#include "port/pg_bitutils.h" #include "utils/dynahash.h" #include "utils/lsyscache.h" #include "utils/memutils.h" @@ -1877,7 +1878,7 @@ ExecHashGetHashValue(HashJoinTable hashtable, * chains), and must only cause the batch number to remain the same or * increase. Our algorithm is * bucketno = hashvalue MOD nbuckets - * batchno = (hashvalue DIV nbuckets) MOD nbatch + * batchno = ROR(hashvalue, log2_nbuckets) MOD nbatch * where nbuckets and nbatch are both expected to be powers of 2, so we can * do the computations by shifting and masking. 
(This assumes that all hash * functions are good about randomizing all their output bits, else we are @@ -1902,9 +1903,10 @@ ExecHashGetBucketAndBatch(HashJoinTable hashtable, if (nbatch > 1) { - /* we can do MOD by masking, DIV by shifting */ *bucketno = hashvalue & (nbuckets - 1); - *batchno = (hashvalue >> hashtable->log2_nbuckets) & (nbatch - 1); + *batchno = + pg_rotate_right32(hashvalue, + hashtable->log2_nbuckets) & (nbatch - 1); } else { diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h index 5197926696..7e5b31e513 100644 --- a/src/include/port/pg_bitutils.h +++ b/src/include/port/pg_bitutils.h @@ -13,6 +13,8 @@ #ifndef PG_BITUTILS_H #define PG_BITUTILS_H +#include <limits.h> + extern PGDLLIMPORT const uint8 pg_leftmost_one_pos[256]; extern PGDLLIMPORT const uint8 pg_rightmost_one_pos[256]; extern PGDLLIMPORT const uint8 pg_number_of_ones[256]; @@ -136,4 +138,13 @@ extern int (*pg_popcount64) (uint64 word); /* Count the number of one-bits in a byte array */ extern uint64 pg_popcount(const char *buf, int bytes); +/* + * Rotate the bits of "word" to the right by n bits. + */ +static inline uint32 +pg_rotate_right32(uint32 word, int n) +{ + return (word >> n) | (word << (sizeof(word) * CHAR_BIT - n)); +} + #endif /* PG_BITUTILS_H */ -- 2.23.0