commit 7448239668ca4fedd6c1f5cf3d1d434efcc3291f Author: mithun Date: Tue Feb 21 12:08:11 2017 +0530 Feature : expand hash table efficiently. diff --git a/src/backend/access/hash/hashovfl.c b/src/backend/access/hash/hashovfl.c index 3334089..4a9a409 100644 --- a/src/backend/access/hash/hashovfl.c +++ b/src/backend/access/hash/hashovfl.c @@ -48,7 +48,7 @@ bitno_to_blkno(HashMetaPage metap, uint32 ovflbitnum) * Convert to absolute page number by adding the number of bucket pages * that exist before this split point. */ - return (BlockNumber) ((1 << i) + ovflbitnum); + return (BlockNumber) (_hash_get_tbuckets(i) + ovflbitnum); } /* @@ -66,9 +66,9 @@ _hash_ovflblkno_to_bitno(HashMetaPage metap, BlockNumber ovflblkno) /* Determine the split number containing this page */ for (i = 1; i <= splitnum; i++) { - if (ovflblkno <= (BlockNumber) (1 << i)) + if (ovflblkno <= (BlockNumber) _hash_get_tbuckets(i)) break; /* oops */ - bitnum = ovflblkno - (1 << i); + bitnum = ovflblkno - _hash_get_tbuckets(i); /* * bitnum has to be greater than number of overflow page added in diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c index 9485978..1afcbd0 100644 --- a/src/backend/access/hash/hashpage.c +++ b/src/backend/access/hash/hashpage.c @@ -315,7 +315,7 @@ _hash_metapinit(Relation rel, double num_tuples, ForkNumber forkNum) int32 ffactor; double dnumbuckets; uint32 num_buckets; - uint32 log2_num_buckets; + uint32 spare_index; uint32 i; /* safety check */ @@ -350,11 +350,10 @@ _hash_metapinit(Relation rel, double num_tuples, ForkNumber forkNum) else if (dnumbuckets >= (double) 0x40000000) num_buckets = 0x40000000; else - num_buckets = ((uint32) 1) << _hash_log2((uint32) dnumbuckets); + num_buckets = _hash_get_tbuckets(_hash_spareindex(dnumbuckets)); - log2_num_buckets = _hash_log2(num_buckets); - Assert(num_buckets == (((uint32) 1) << log2_num_buckets)); - Assert(log2_num_buckets < HASH_MAX_SPLITPOINTS); + spare_index = _hash_spareindex(num_buckets); + 
Assert(spare_index < HASH_MAX_SPLITPOINTS); /* * We initialize the metapage, the first N bucket pages, and the first @@ -400,18 +399,20 @@ _hash_metapinit(Relation rel, double num_tuples, ForkNumber forkNum) /* * We initialize the index with N buckets, 0 .. N-1, occupying physical - * blocks 1 to N. The first freespace bitmap page is in block N+1. Since - * N is a power of 2, we can set the masks this way: + * blocks 1 to N. The first freespace bitmap page is in block N+1. */ - metap->hashm_maxbucket = metap->hashm_lowmask = num_buckets - 1; - metap->hashm_highmask = (num_buckets << 1) - 1; + metap->hashm_maxbucket = num_buckets - 1; + + /* set highmask, which should be sufficient to cover num_buckets. */ + metap->hashm_highmask = (1 << (_hash_log2(num_buckets + 1))) - 1; + metap->hashm_lowmask = (metap->hashm_highmask >> 1); MemSet(metap->hashm_spares, 0, sizeof(metap->hashm_spares)); MemSet(metap->hashm_mapp, 0, sizeof(metap->hashm_mapp)); /* Set up mapping for one spare page after the initial splitpoints */ - metap->hashm_spares[log2_num_buckets] = 1; - metap->hashm_ovflpoint = log2_num_buckets; + metap->hashm_spares[spare_index] = 1; + metap->hashm_ovflpoint = spare_index; metap->hashm_firstfree = 0; /* @@ -619,8 +620,8 @@ restart_expand: { /* * Copy bucket mapping info now; refer to the comment in code below - * where we copy this information before calling _hash_splitbucket - * to see why this is okay. + * where we copy this information before calling _hash_splitbucket to + * see why this is okay. */ maxbucket = metap->hashm_maxbucket; highmask = metap->hashm_highmask; @@ -649,21 +650,23 @@ restart_expand: start_nblkno = BUCKET_TO_BLKNO(metap, new_bucket); /* - * If the split point is increasing (hashm_maxbucket's log base 2 - * increases), we need to allocate a new batch of bucket pages. + * If the split point is increasing, we need to allocate a new batch of + * bucket pages. 
*/ - spare_ndx = _hash_log2(new_bucket + 1); + spare_ndx = _hash_spareindex(new_bucket + 1); if (spare_ndx > metap->hashm_ovflpoint) { + uint32 buckets_toadd = 0; + Assert(spare_ndx == metap->hashm_ovflpoint + 1); /* * The number of buckets in the new splitpoint is equal to the total - * number already in existence, i.e. new_bucket. Currently this maps - * one-to-one to blocks required, but someday we may need a more - * complicated calculation here. + * number already in existence, i.e. new_bucket. Here we allocate only + * one fourth of the buckets the new split point group will add in + * total, i.e. the buckets belonging to the group's current slot. + * NOTE(review): for the first three groups (spare_ndx < 12) this + * expression evaluates to 0 -- confirm _hash_alloc_buckets handles + * nblocks == 0. */ - if (!_hash_alloc_buckets(rel, start_nblkno, new_bucket)) + buckets_toadd = ((1 << ((spare_ndx >> 2) - 1)) >> 2); + if (!_hash_alloc_buckets(rel, start_nblkno, buckets_toadd)) { /* can't split due to BlockNumber overflow */ _hash_relbuf(rel, buf_oblkno); @@ -847,10 +850,9 @@ _hash_splitbucket(Relation rel, /* * Mark the old bucket to indicate that split is in progress. (At - * operation end, we will clear the split-in-progress flag.) Also, - * for a primary bucket page, hasho_prevblkno stores the number of - * buckets that existed as of the last split, so we must update that - * value here. + * operation end, we will clear the split-in-progress flag.) Also, for a + * primary bucket page, hasho_prevblkno stores the number of buckets that + * existed as of the last split, so we must update that value here. */ oopaque->hasho_flag |= LH_BUCKET_BEING_SPLIT; oopaque->hasho_prevblkno = maxbucket; @@ -1206,11 +1208,11 @@ _hash_finish_split(Relation rel, Buffer metabuf, Buffer obuf, Bucket obucket, * _hash_getcachedmetap() -- Returns cached metapage data. * * If metabuf is not InvalidBuffer, caller must hold a pin, but no lock, on - * the metapage. 
If not set, we'll set it before returning if we have to + * refresh the cache, and return with a pin but no lock on it; caller is + * responsible for releasing the pin. - * We refresh the cache if it's not initialized yet or force_refresh is true. + * We refresh the cache if it's not initialized yet or force_refresh is true. */ HashMetaPage _hash_getcachedmetap(Relation rel, Buffer *metabuf, bool force_refresh) @@ -1220,13 +1222,13 @@ _hash_getcachedmetap(Relation rel, Buffer *metabuf, bool force_refresh) Assert(metabuf); if (force_refresh || rel->rd_amcache == NULL) { - char *cache = NULL; + char *cache = NULL; /* - * It's important that we don't set rd_amcache to an invalid - * value. Either MemoryContextAlloc or _hash_getbuf could fail, - * so don't install a pointer to the newly-allocated storage in the - * actual relcache entry until both have succeeeded. + * It's important that we don't set rd_amcache to an invalid value. + * Either MemoryContextAlloc or _hash_getbuf could fail, so don't + * install a pointer to the newly-allocated storage in the actual + * relcache entry until both have succeeded. */ if (rel->rd_amcache == NULL) cache = MemoryContextAlloc(rel->rd_indexcxt, @@ -1261,7 +1263,7 @@ _hash_getcachedmetap(Relation rel, Buffer *metabuf, bool force_refresh) * us an opportunity to use the previously saved metapage contents to reach * the target bucket buffer, instead of reading from the metapage every time. * This saves one buffer access every time we want to reach the target bucket - * buffer, which is very helpful savings in bufmgr traffic and contention. + * buffer, which is very helpful savings in bufmgr traffic and contention. * * The access type parameter (HASH_READ or HASH_WRITE) indicates whether the * bucket buffer has to be locked for reading or writing. 
diff --git a/src/backend/access/hash/hashutil.c b/src/backend/access/hash/hashutil.c index c705531..2c0e975 100644 --- a/src/backend/access/hash/hashutil.c +++ b/src/backend/access/hash/hashutil.c @@ -149,6 +149,57 @@ _hash_log2(uint32 num) } /* + * _hash_spareindex -- map a bucket count to its spare-array index (split point). + * + * Split points come in groups of four per doubling of the table: the group + * number is the value returned by _hash_log2(num_bucket), and the slot (0..3) + * says which quarter of that doubling num_bucket falls into, so the index is + * (group << 2) + slot. _hash_get_tbuckets is the intended inverse. + * + * NOTE(review): assumes num_bucket >= 2. For num_bucket <= 1, _hash_log2 + * returns 0 and (i - 1) wraps as uint32, making the shift below undefined + * behavior -- confirm no caller ever passes 1. + */ +uint32 +_hash_spareindex(uint32 num_bucket) +{ + uint32 i, + tbuckets, + bucket_pos; + + i = _hash_log2(num_bucket); + + /* + * tbuckets is the bucket count in place before this doubling began + * (half of 1 << i); the doubling adds tbuckets more buckets in four + * equal slots, so place num_bucket into its slot within the group. + * Slot is 0 when the group is too small to be quartered (tbuckets < 4). + */ + tbuckets = 1 << (i - 1); + bucket_pos = num_bucket - tbuckets; + return ((i << 2) + ((tbuckets >> 2) ? ((bucket_pos - 1) / (tbuckets >> 2)) : 0)); +} + +/* + * _hash_get_tbuckets -- total number of buckets allocated once the given + * split point (spare index) is complete. Intended inverse of + * _hash_spareindex. + */ +uint32 +_hash_get_tbuckets(uint32 split_num) +{ + uint32 tbuckets, + slot; + + /* + * The first three groups (split_num >> 2 == 0, 1, 2) end with 1, 2 and + * 4 buckets respectively -- too few to distribute across the 4 slots, + * so return the group total irrespective of the slot occupied. + */ + if ((split_num >> 2) == 0) + return 1; + if ((split_num >> 2) == 1) + return 2; + if ((split_num >> 2) == 2) + return 4; + /* + * total_buckets = buckets at the end of the previous group, which is + * 1 << (group - 1), plus one quarter of that for each completed slot + * (1..4) of the current group. + */ + tbuckets = (1 << ((split_num >> 2) - 1)); + slot = ((3 & split_num) + 1); + tbuckets = tbuckets + (slot * (tbuckets >> 2)); + return tbuckets; +} + +/* * _hash_checkpage -- sanity checks on the format of all hash pages * * If flags is not zero, it is a bitwise OR of the acceptable values of diff --git a/src/include/access/hash.h b/src/include/access/hash.h index 3bf587b..3407659 100644 --- a/src/include/access/hash.h +++ b/src/include/access/hash.h @@ -36,7 +36,7 @@ typedef uint32 Bucket; #define InvalidBucket ((Bucket) 0xFFFFFFFF) #define BUCKET_TO_BLKNO(metap,B) \ - ((BlockNumber) ((B) + ((B) ? 
(metap)->hashm_spares[_hash_log2((B)+1)-1] : 0)) + 1) + ((BlockNumber) ((B) + ((B) ? (metap)->hashm_spares[_hash_spareindex((B)+1)-1] : 0)) + 1) /* * Special space for hash index pages. @@ -168,7 +168,7 @@ typedef HashScanOpaqueData *HashScanOpaque; * needing to fit into the metapage. (With 8K block size, 128 bitmaps * limit us to 64 GB of overflow space...) */ -#define HASH_MAX_SPLITPOINTS 32 +#define HASH_MAX_SPLITPOINTS 128 #define HASH_MAX_BITMAPS 128 typedef struct HashMetaPageData @@ -364,6 +364,8 @@ extern uint32 _hash_datum2hashkey_type(Relation rel, Datum key, Oid keytype); extern Bucket _hash_hashkey2bucket(uint32 hashkey, uint32 maxbucket, uint32 highmask, uint32 lowmask); extern uint32 _hash_log2(uint32 num); +extern uint32 _hash_spareindex(uint32 num); +extern uint32 _hash_get_tbuckets(uint32 num); extern void _hash_checkpage(Relation rel, Buffer buf, int flags); extern uint32 _hash_get_indextuple_hashkey(IndexTuple itup); extern bool _hash_convert_tuple(Relation index,