From 516de50b0fe2075955dab2168a7e46f0f9222278 Mon Sep 17 00:00:00 2001 From: Melanie Plageman Date: Tue, 7 Oct 2025 13:21:14 -0400 Subject: [PATCH v5 1/3] Fix overflow risk in hashtable size calculation ci-os-only: --- src/backend/executor/nodeHash.c | 115 +++++++++++++++----------------- 1 file changed, 55 insertions(+), 60 deletions(-) diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c index a3415db4e20..0f815674c9f 100644 --- a/src/backend/executor/nodeHash.c +++ b/src/backend/executor/nodeHash.c @@ -850,90 +850,85 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew, /* * Optimize the total amount of memory consumed by the hash node. * - * The nbatch calculation above focuses on the size of the in-memory hash - * table, assuming no per-batch overhead. Now adjust the number of batches - * and the size of the hash table to minimize total memory consumed by the - * hash node. - * - * Each batch file has a BLCKSZ buffer, and we may need two files per - * batch (inner and outer side). So with enough batches this can be - * significantly more memory than the hashtable itself. + * The nbatch calculation above focuses on the in-memory hash table, + * assuming no per-batch overhead. But each batch may have two files, each + * with a BLCKSZ buffer. With enough batches these buffers may use + * significantly more memory than the hash table. * * The total memory usage may be expressed by this formula: * - * (inner_rel_bytes / nbatch) + (2 * nbatch * BLCKSZ) <= hash_table_bytes + * (inner_rel_bytes / nbatch) + (2 * nbatch * BLCKSZ) * * where (inner_rel_bytes / nbatch) is the size of the in-memory hash * table and (2 * nbatch * BLCKSZ) is the amount of memory used by file - * buffers. But for sufficiently large values of inner_rel_bytes value - * there may not be a nbatch value that would make both parts fit into - * hash_table_bytes. - * - * In this case we can't enforce the memory limit - we're going to exceed - * it. 
We can however minimize the impact and use as little memory as - * possible. (We haven't really enforced it before either, as we simply - * ignored the batch files.) - * - * The formula for total memory usage says that given an inner relation of - * size inner_rel_bytes, we may divide it into an arbitrary number of - * batches. This determines both the size of the in-memory hash table and - * the amount of memory needed for batch files. These two terms work in - * opposite ways - when one decreases, the other increases. + * buffers. * - * For low nbatch values, the hash table takes most of the memory, but at - * some point the batch files start to dominate. If you combine these two - * terms, the memory consumption (for a fixed size of the inner relation) - * has a u-shape, with a minimum at some nbatch value. + * For very large inner_rel_bytes, there may be no nbatch that keeps total + * memory usage under the budget (work_mem * hash_mem_multiplier). In that + * case, we choose nbatch to minimize total memory consumption across both + * the hashtable and file buffers. * - * Our goal is to find this nbatch value, minimizing the memory usage. We - * calculate the memory usage with half the batches (i.e. nbatch/2), and - * if it's lower than the current memory usage we know it's better to use - * fewer batches. We repeat this until reducing the number of batches does - * not reduce the memory usage - we found the optimum. We know the optimum - * exists, thanks to the u-shape. + * As we increase the size of the hashtable and decrease the number of + * batches, the total memory usage follows a U-shaped curve. We find the + * minimum nbatch by "walking back" -- checking if halving nbatch would + * lower total memory usage. We stop when it no longer helps. * - * We only want to do this when exceeding the memory limit, not every - * time. 
The goal is not to minimize memory usage in every case, but to - * minimize the memory usage when we can't stay within the memory limit. + * We only want to do this when we are already over the memory budget. We + * don't explore increasing nbatch since that could force batching when it + * is not needed. * - * For this reason we only consider reducing the number of batches. We - * could try the opposite direction too, but that would save memory only - * when most of the memory is used by the hash table. And the hash table - * was used for the initial sizing, so we shouldn't be exceeding the - * memory limit too much. We might save memory by using more batches, but - * it would result in spilling more batch files, which does not seem like - * a great trade off. + * While growing the hashtable, we also adjust the number of buckets to + * maintain a load factor of NTUP_PER_BUCKET while squeezing tuples back + * from batches into the hashtable. * - * While growing the hashtable, we also adjust the number of buckets, to - * not have more than one tuple per bucket (load factor 1). We can only do - * this during the initial sizing - once we start building the hash, - * nbucket is fixed. + * Note that we can only change nbuckets during initial hashtable sizing. + * Once we start building the hash, nbuckets is fixed. */ - while (nbatch > 0) + while (nbatch > 1) { - /* how much memory are we using with current nbatch value */ - size_t current_space = hash_table_bytes + (2 * nbatch * BLCKSZ); + /* + * Ensure that nbuckets can be allocated. nbuckets may have been + * increased above hash_table_bytes / sizeof(HashJoinTuple) if work_mem + * was very small, so checking if MaxAllocSize is enough for the + * buckets is the safest check. 
+ */ + if (nbuckets > MaxAllocSize / sizeof(HashJoinTuple) / 2) + break; - /* how much memory would we use with half the batches */ - size_t new_space = hash_table_bytes * 2 + (nbatch * BLCKSZ); + /* + * We check that space_allowed wouldn't overflow because it will be + * larger than hash_table_bytes if there is a skew hashtable. + */ + if ((*space_allowed) > SIZE_MAX / 2) + break; - /* If the memory usage would not decrease, we found the optimum. */ - if (current_space < new_space) + /* + * This is the same as: + * + * hash_table_bytes + 2 * nbatch * BLCKSZ < hash_table_bytes * 2 + + * nbatch * BLCKSZ + * + * avoiding intermediate overflow. + * + * which is to say: will halving the number of batches and doubling + * the size of the hashtable reduce overall memory usage? + */ + if (nbatch < hash_table_bytes / BLCKSZ) break; /* - * It's better to use half the batches, so do that and adjust the - * nbucket in the opposite direction, and double the allowance. + * MaxAllocSize is sufficiently small that we are not worried about + * overflowing nbuckets. */ - nbatch /= 2; nbuckets *= 2; - + hash_table_bytes *= 2; *space_allowed = (*space_allowed) * 2; + + nbatch /= 2; } Assert(nbuckets > 0); Assert(nbatch > 0); - *numbuckets = nbuckets; *numbatches = nbatch; } @@ -994,7 +989,7 @@ ExecHashIncreaseBatchSize(HashJoinTable hashtable) * How much additional memory would doubling nbatch use? Each batch may * require two buffered files (inner/outer), with a BLCKSZ buffer. */ - size_t batchSpace = (hashtable->nbatch * 2 * BLCKSZ); + size_t batchSpace = (hashtable->nbatch * 2 * (size_t) BLCKSZ); /* * Compare the new space needed for doubling nbatch and for enlarging the -- 2.43.0