Hash Index Build Patch v2
From:      Tom Raney
Subject:   Hash Index Build Patch v2
Date:
Msg-id:    471BAB14.1060101@comcast.net
Responses: Re: Hash Index Build Patch v2
           Re: Hash Index Build Patch v2
List:      pgsql-patches
This revised version of our patch uses the function estimate_rel_size() from plancat.c to estimate the number of tuples in the parent relation. This is an alternative to scanning the parent relation to estimate the number of tuples, as the first version of the patch did.

-Tom

/* Test data generator: prints <tuples> random integers, one per line. */
#include <stdio.h>
#include <stdlib.h>

int
main(int argc, char **argv)
{
	char	   *p;
	long		i, tups, seed;

	if (argc < 3)
	{
		printf("Too few args. <tuples> <seed>\n");
		return 1;
	}
	tups = strtol(argv[1], &p, 0);
	seed = strtol(argv[2], &p, 0);
	srand(seed);

	for (i = 0; i < tups; i++)
		printf("%d\n", rand());

	return 0;
}

*** ./backend/access/hash/hash.c.orig	2007-09-23 19:01:09.000000000 -0700
--- ./backend/access/hash/hash.c	2007-10-21 12:07:48.455594000 -0700
***************
*** 22,33 ****
  #include "access/hash.h"
  #include "catalog/index.h"
  #include "commands/vacuum.h"
  
  /* Working state for hashbuild and its callback */
  typedef struct
  {
! 	double		indtuples;
  } HashBuildState;
  
  static void hashbuildCallback(Relation index,
--- 22,36 ----
  #include "access/hash.h"
  #include "catalog/index.h"
  #include "commands/vacuum.h"
+ #include "optimizer/plancat.h"
  
  /* Working state for hashbuild and its callback */
  typedef struct
  {
! 	double		indtuples;	/* The current number of index tuples */
! 	Relation	heapRel;	/* The index covers this heap relation */
! 	HSpool	   *spool;		/* Used to sort the index tuples before
! 							 * insertion into the index */
  } HashBuildState;
  
  static void hashbuildCallback(Relation index,
***************
*** 40,46 ****
--- 43,80 ----
  
  /*
   * hashbuild() -- build a new hash index.
+  *
+  * The algorithm:
+  *	(1) Initialize the build state
+  *	(2) Retrieve an estimate of the tuple count from estimate_rel_size()
+  *	(3) Transform the heap file tuples into index tuples (itups),
+  *	    while inserting them into a spool.  If the spool overflows
+  *	    memory, sort it into runs and spill it to disk
+  *	(4) Finish sorting the spool
+  *	(5) Pre-initialize all the buckets of the final index
+  *	(6) Insert itups from the spool into the index
+  *
+  * Sorting the tuples before inserting them into the index is a classical
+  * bulk-load technique, also used in the BTree code.  The sort is done in
+  * hash value order.  Pre-allocating the buckets minimizes the number of
+  * overflow pages.  The reason for step (2) is that in order to sort, in
+  * step (3), one must know the hash value, which depends on the number of
+  * buckets, which in turn depends on the number of itups = the number of
+  * rows in the heap file.  Steps (3), (4) and (6) parallel similar steps
+  * in the BTree code.
+  *
+  * Here is an alternative algorithm:
+  *	(1') Same as (1)
+  *	(2') Scan the heap file, counting the number of rows, forming index
+  *	     tuples and inserting them into a spool (the spool is not
+  *	     presorted)
+  *	(3') Sort the spool
+  *	(4') Same as (5)
+  *	(5') Same as (6)
+  * Although this algorithm would be somewhat faster, we prefer the
+  * existing algorithm because it reuses existing BTree code.
   */
+ 
  Datum
  hashbuild(PG_FUNCTION_ARGS)
  {
***************
*** 50,55 ****
--- 84,94 ----
  	IndexBuildResult *result;
  	double		reltuples;
  	HashBuildState buildstate;
+ 	double		tuples;
+ 	BlockNumber pages;
+ 	HashMetaPage metap;
+ 	Buffer		metabuf;
+ 	uint32		num_bkt;	/* Estimated number of buckets in the final index */
  
  	/*
  	 * We expect to be called exactly once for any index relation. If that's
***************
*** 59,81 ****
  		elog(ERROR, "index \"%s\" already contains data",
  			 RelationGetRelationName(index));
  
! 	/* initialize the hash index metadata page */
! 	_hash_metapinit(index);
! 
! 	/* build the index */
  	buildstate.indtuples = 0;
  
! 	/* do the heap scan */
! 	reltuples = IndexBuildHeapScan(heap, index, indexInfo,
! 								   hashbuildCallback, (void *) &buildstate);
  
- 	/*
- 	 * Return statistics
- 	 */
- 	result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
! 	result->heap_tuples = reltuples;
! 	result->index_tuples = buildstate.indtuples;
  
  	PG_RETURN_POINTER(result);
  }
--- 98,158 ----
  		elog(ERROR, "index \"%s\" already contains data",
  			 RelationGetRelationName(index));
  
! 	/* initialize the build state */
  	buildstate.indtuples = 0;
+ 	buildstate.heapRel = heap;
+ 	buildstate.spool = h_spoolinit(index);
  
! 	/*
! 	 * Retrieve an estimate of the number of rows
! 	 */
! 	tuples = 0;
! 	estimate_rel_size(heap, NULL, &pages, &tuples);
! 
! 	/* calculate the number of buckets in the final index */
! 	num_bkt = h_bkt_num((uint32) tuples, index);
! 
! 	/* set the bucket mask for the compare function */
! 	h_set_bkt_mask(buildstate.spool, num_bkt);
! 
! 	/*
! 	 * Pre-initialize all the buckets of the final index
! 	 */
! 	_hash_metapinit(index, num_bkt);
! 
! 	/*
! 	 * Transform the heap file tuples into index tuples (itups), while
! 	 * inserting them into a spool.  If the spool overflows memory, sort
! 	 * it into runs and spill it to disk.
! 	 */
! 	reltuples = IndexBuildHeapScan(heap, index, indexInfo,
! 								   hashbuildCallback, (void *) &buildstate);
! 
! 	/*
! 	 * Finish sorting the spool
! 	 */
! 	h_do_sort(buildstate.spool);
! 
! 	/*
! 	 * Insert itups from the spool into the index
! 	 */
! 	h_print_spool(buildstate.spool);
! 
! 	/* Read the meta page just initialized */
! 	metabuf = _hash_getbuf(index, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
! 	_hash_checkpage(index, metabuf, LH_META_PAGE);
! 	metap = (HashMetaPage) BufferGetPage(metabuf);
! 
! 	/* Gather result, destroy spool, return */
! 	result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
! 	_hash_relbuf(index, metabuf);
! 	result->heap_tuples = reltuples;
! 	result->index_tuples = buildstate.indtuples;
! 	h_spooldestroy(buildstate.spool);
  
  	PG_RETURN_POINTER(result);
  }
***************
*** 104,111 ****
  		pfree(itup);
  		return;
  	}
! 
! 	_hash_doinsert(index, itup);
  
  	buildstate->indtuples += 1;
--- 181,191 ----
  		pfree(itup);
  		return;
  	}
! 	else
! 	{
! 		/* Place each itup into the spool for sorting */
! 		h_spool(itup, buildstate->spool);
! 	}
  
  	buildstate->indtuples += 1;
*** ./backend/access/hash/hashpage.c.orig	2007-09-23 19:31:22.000000000 -0700
--- ./backend/access/hash/hashpage.c	2007-09-23 20:46:09.000000000 -0700
***************
*** 313,326 ****
  /*
   * _hash_metapinit() -- Initialize the metadata page of a hash index,
   *				the two buckets that we begin with and the initial
!  *				bitmap page.
   *
   * We are fairly cavalier about locking here, since we know that no one else
   * could be accessing this index. In particular the rule about not holding
   * multiple buffer locks is ignored.
   */
  void
! _hash_metapinit(Relation rel)
  {
  	HashMetaPage metap;
  	HashPageOpaque pageopaque;
--- 313,326 ----
  /*
   * _hash_metapinit() -- Initialize the metadata page of a hash index,
   *				the two buckets that we begin with and the initial
!  *				bitmap page, plus num_bkt buckets.
   *
   * We are fairly cavalier about locking here, since we know that no one else
   * could be accessing this index. In particular the rule about not holding
   * multiple buffer locks is ignored.
   */
  void
! _hash_metapinit(Relation rel, uint32 num_bkt)
  {
  	HashMetaPage metap;
  	HashPageOpaque pageopaque;
***************
*** 330,342 ****
  	int32		data_width;
  	int32		item_width;
  	int32		ffactor;
! 	uint16		i;
  
  	/* safety check */
  	if (RelationGetNumberOfBlocks(rel) != 0)
  		elog(ERROR, "cannot initialize non-empty hash index \"%s\"",
  			 RelationGetRelationName(rel));
  
  	/*
  	 * Determine the target fill factor (in tuples per bucket) for this index.
  	 * The idea is to make the fill factor correspond to pages about as full
--- 330,354 ----
  	int32		data_width;
  	int32		item_width;
  	int32		ffactor;
! 	uint32		i;
! 	uint32		lg2buckets;
! 	uint32		pwr2;
! 	BlockNumber start_blk;
  
  	/* safety check */
  	if (RelationGetNumberOfBlocks(rel) != 0)
  		elog(ERROR, "cannot initialize non-empty hash index \"%s\"",
  			 RelationGetRelationName(rel));
  
+ 	/*
+ 	 * The minimum number of buckets is 2, so if we are below that
+ 	 * number, fix that here.
+ 	 */
+ 	if (num_bkt < 2)
+ 		num_bkt = 2;
+ 
  	/*
  	 * Determine the target fill factor (in tuples per bucket) for this index.
  	 * The idea is to make the fill factor correspond to pages about as full
***************
*** 401,431 ****
  	 * We initialize the index with two buckets, 0 and 1, occupying physical
  	 * blocks 1 and 2.  The first freespace bitmap page is in block 3.
  	 */
! 	metap->hashm_maxbucket = metap->hashm_lowmask = 1;	/* nbuckets - 1 */
! 	metap->hashm_highmask = 3;	/* (nbuckets << 1) - 1 */
  
  	MemSet(metap->hashm_spares, 0, sizeof(metap->hashm_spares));
  	MemSet(metap->hashm_mapp, 0, sizeof(metap->hashm_mapp));
  
  	metap->hashm_spares[1] = 1; /* the first bitmap page is only spare */
! 	metap->hashm_ovflpoint = 1;
  	metap->hashm_firstfree = 0;
  
! 	/*
! 	 * Initialize the first two buckets
! 	 */
! 	for (i = 0; i <= 1; i++)
! 	{
! 		buf = _hash_getnewbuf(rel, BUCKET_TO_BLKNO(metap, i));
! 		pg = BufferGetPage(buf);
! 		pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
! 		pageopaque->hasho_prevblkno = InvalidBlockNumber;
! 		pageopaque->hasho_nextblkno = InvalidBlockNumber;
! 		pageopaque->hasho_bucket = i;
! 		pageopaque->hasho_flag = LH_BUCKET_PAGE;
  		pageopaque->hasho_page_id = HASHO_PAGE_ID;
! 		_hash_wrtbuf(rel, buf);
! 	}
  
  	/*
  	 * Initialize first bitmap page
--- 413,488 ----
  	 * We initialize the index with two buckets, 0 and 1, occupying physical
  	 * blocks 1 and 2.  The first freespace bitmap page is in block 3.
  	 */
! 
! 	/*
! 	 * We need this calculation to correctly set the splits and to set
! 	 * the value of the low/high mask.
! 	 */
! 	lg2buckets = _hash_log2_floor(num_bkt);
! 	pwr2 = 1 << lg2buckets;
! 
! 	metap->hashm_maxbucket = num_bkt - 1;
! 
! 	/*
! 	 * We want the highmask to mask the next larger power of 2 value
! 	 * greater than the number of buckets we need.  So, if we need 9
! 	 * buckets, our high mask value would be 15; our low mask value
! 	 * will be 7.
! 	 */
! 	metap->hashm_highmask = (pwr2 << 1) - 1;
! 	metap->hashm_lowmask = pwr2 - 1;
  
  	MemSet(metap->hashm_spares, 0, sizeof(metap->hashm_spares));
  	MemSet(metap->hashm_mapp, 0, sizeof(metap->hashm_mapp));
  
  	metap->hashm_spares[1] = 1; /* the first bitmap page is only spare */
! 
! 	/*
! 	 * No overflows will have occurred during this initialization
! 	 * process, so we just copy the value '1' into each position in the
! 	 * hashm_spares array.  This is needed to correctly determine how
! 	 * each bucket maps to the logical page on disk.
! 	 */
! 	for (i = 2; i <= _hash_log2(num_bkt); i++)
! 		metap->hashm_spares[i] = 1;
! 
! 	/*
! 	 * This value needs to be the ceiling log to properly set the
! 	 * overflow point beyond our max bucket.
! 	 */
! 	metap->hashm_ovflpoint = _hash_log2(num_bkt);
  	metap->hashm_firstfree = 0;
  
! 	/*
! 	 * We need to make sure the file system can handle the big block of
! 	 * pages we are going to allocate; _hash_alloc_buckets() will do
! 	 * that for us.
! 	 */
! 	start_blk = BUCKET_TO_BLKNO(metap, 2);
! 	_hash_alloc_buckets(rel, start_blk, (pwr2 << 1) - 2);
! 
! 	/*
! 	 * Initialize the first 'num_bkt' buckets
! 	 */
! 	for (i = 0; i < num_bkt; i++)
! 	{
! 		buf = _hash_getnewbuf(rel, BUCKET_TO_BLKNO(metap, i));
! 		pg = BufferGetPage(buf);
! 		_hash_pageinit(pg, BufferGetPageSize(buf));
! 		pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
! 		pageopaque->hasho_prevblkno = InvalidBlockNumber;
! 		pageopaque->hasho_nextblkno = InvalidBlockNumber;
! 		pageopaque->hasho_bucket = i;
! 		pageopaque->hasho_flag = LH_BUCKET_PAGE;
  		pageopaque->hasho_page_id = HASHO_PAGE_ID;
! 		_hash_wrtbuf(rel, buf);
! 	}
  
  	/*
  	 * Initialize first bitmap page
*** ./backend/access/hash/hashutil.c.orig	2007-09-23 19:47:27.000000000 -0700
--- ./backend/access/hash/hashutil.c	2007-09-23 19:49:44.000000000 -0700
***************
*** 120,125 ****
--- 120,144 ----
  	return bucket;
  }
  
+ /*
+  * _hash_log2_floor -- returns floor(lg2(num))
+  */
+ uint32
+ _hash_log2_floor(uint32 num)
+ {
+ 	uint32		i,
+ 				limit;
+ 
+ 	limit = 1;
+ 	for (i = 0; limit < num; limit <<= 1, i++)
+ 		;
+ 	if (limit == num)
+ 		return i;
+ 	else
+ 		return i - 1;
+ }
+ 
  /*
   * _hash_log2 -- returns ceil(lg2(num))
   */
*** ./backend/access/hash/Makefile.orig	2007-09-23 20:30:39.000000000 -0700
--- ./backend/access/hash/Makefile	2007-09-23 20:31:12.000000000 -0700
***************
*** 13,19 ****
  include $(top_builddir)/src/Makefile.global
  
  OBJS = hash.o hashfunc.o hashinsert.o hashovfl.o hashpage.o hashscan.o \
!        hashsearch.o hashutil.o
  
  all: SUBSYS.o
--- 13,19 ----
  include $(top_builddir)/src/Makefile.global
  
  OBJS = hash.o hashfunc.o hashinsert.o hashovfl.o hashpage.o hashscan.o \
!        hashsearch.o hashutil.o hashsort.o
  
  all: SUBSYS.o
*** ./backend/access/hash/README.orig	2007-09-24 00:07:32.000000000 -0700
--- ./backend/access/hash/README	2007-10-21 12:22:00.569522000 -0700
***************
*** 105,110 ****
--- 105,137 ----
  number 3, which is the first bitmap page and is allocated during index
  creation.
  
+ Building an index on a full table
+ ----------------------------------
+ 
+ An index can be created on a table already loaded with tuples.  Before
+ the index is built, the number of bucket pages needed to fit all the
+ index tuples is estimated by calling estimate_rel_size().  A fill
+ factor (either the default or a user-defined value, as applicable) is
+ applied to that estimate to obtain the number of bucket pages to
+ initialize.  If the estimate falls below two, a minimum of two bucket
+ pages is initialized.  However, the number of bucket pages allocated is
+ always a power of 2 no less than the estimate.  I.e., if the estimated
+ number of buckets is 3124, then 4096 buckets are allocated (and 3124
+ bucket pages are initialized); if the estimate is 5456, then 8192 are
+ allocated (and 5456 initialized), and so on.
+ 
+ A spool holds all the index tuples before they are inserted into the
+ index pages.  The contents of the spool are sorted in hash value order
+ so that there is less backtracking to bucket pages we have already
+ visited.
+ 
+ The intent of creating as many bucket pages as estimated is to avoid
+ bucket page splits at splitpoint S and hence redistribution of tuples.
+ Since we create all the bucket pages we need and insert the tuples
+ based on the bucket number they belong to, there is no need for
+ splitting.
+ 
  Lock definitions
  ----------------
*** ./backend/utils/sort/tuplesort.c.orig	2007-09-23 19:51:30.000000000 -0700
--- ./backend/utils/sort/tuplesort.c	2007-09-24 00:03:25.000000000 -0700
***************
*** 341,346 ****
--- 341,347 ----
  	Relation	indexRel;
  	ScanKey		indexScanKey;
  	bool		enforceUnique;	/* complain if we find duplicate tuples */
+ 	uint32		bkt_mask;		/* Bucket mask for hash sort */
  
  	/*
  	 * These variables are specific to the Datum case; they are set by
***************
*** 439,444 ****
--- 440,448 ----
  static void reversedirection_heap(Tuplesortstate *state);
  static int	comparetup_index(const SortTuple *a, const SortTuple *b,
  				 Tuplesortstate *state);
+ static int	comparetup_hashindex(const SortTuple *a, const SortTuple *b,
+ 				 Tuplesortstate *state);
+ 
  static void copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup);
  static void writetup_index(Tuplesortstate *state, int tapenum,
  			   SortTuple *stup);
***************
*** 2621,2626 ****
--- 2625,2673 ----
  						  &stup->isnull1);
  }
  
+ /*
+  * Set state parameters for sorting hash index tuples
+  */
+ void
+ tuplesort_set_hashindex(Tuplesortstate *state)
+ {
+ 	state->comparetup = comparetup_hashindex;
+ 	state->indexScanKey = NULL; /* Scan key is not needed in hash index */
+ 	state->enforceUnique = false;	/* no reason to enforce uniqueness
+ 									 * with a hash table */
+ }
+ 
+ void
+ tuplesort_set_bktmask(Tuplesortstate *state, uint32 mask)
+ {
+ 	state->bkt_mask = mask;
+ }
+ 
+ /*
+  * Special compare function called for the hash sort algorithm.  We
+  * compare hash values of the keys, which are then masked to determine
+  * the proper hash table bucket.
+  */
+ static int
+ comparetup_hashindex(const SortTuple *a, const SortTuple *b,
+ 					 Tuplesortstate *state)
+ {
+ 	uint32		bkta,
+ 				bktb;
+ 
+ 	/* Allow interrupting long sorts */
+ 	CHECK_FOR_INTERRUPTS();
+ 
+ 	bkta = _hash_datum2hashkey(state->indexRel, a->datum1) & (uint32) state->bkt_mask;
+ 	bktb = _hash_datum2hashkey(state->indexRel, b->datum1) & (uint32) state->bkt_mask;
+ 
+ 	if (bkta > bktb)
+ 		return 1;
+ 	else if (bkta < bktb)
+ 		return -1;
+ 	else
+ 		return 0;
+ }
+ 
  static void
  reversedirection_heap(Tuplesortstate *state)
  {
*** ./include/access/hash.h.orig	2007-09-23 17:50:40.000000000 -0700
--- ./include/access/hash.h	2007-09-23 17:40:21.000000000 -0700
***************
*** 282,287 ****
--- 282,297 ----
  			   Bucket bucket, BlockNumber bucket_blkno,
  			   BufferAccessStrategy bstrategy);
  
+ /* hashsort.c */
+ typedef struct HSpool HSpool;
+ extern HSpool *h_spoolinit(Relation index);
+ extern void h_spool(IndexTuple itup, HSpool *hspool);
+ extern uint32 h_bkt_num(uint32 tuples, Relation rel);
+ extern void h_set_bkt_mask(HSpool *hspool, uint32 bkts);
+ extern void h_do_sort(HSpool *hspool);
+ extern void h_print_spool(HSpool *hspool);
+ extern void h_spooldestroy(HSpool *hspool);
+ 
  /* hashpage.c */
  extern void _hash_getlock(Relation rel, BlockNumber whichlock, int access);
  extern bool _hash_try_getlock(Relation rel, BlockNumber whichlock, int access);
***************
*** 298,304 ****
  extern void _hash_wrtbuf(Relation rel, Buffer buf);
  extern void _hash_chgbufaccess(Relation rel, Buffer buf, int from_access,
  				   int to_access);
! extern void _hash_metapinit(Relation rel);
  extern void _hash_pageinit(Page page, Size size);
  extern void _hash_expandtable(Relation rel, Buffer metabuf);
--- 308,314 ----
  extern void _hash_wrtbuf(Relation rel, Buffer buf);
  extern void _hash_chgbufaccess(Relation rel, Buffer buf, int from_access,
  				   int to_access);
! extern void _hash_metapinit(Relation rel, uint32 pages);
  extern void _hash_pageinit(Page page, Size size);
  extern void _hash_expandtable(Relation rel, Buffer metabuf);
***************
*** 320,325 ****
--- 330,336 ----
  extern Bucket _hash_hashkey2bucket(uint32 hashkey, uint32 maxbucket,
  					   uint32 highmask, uint32 lowmask);
  extern uint32 _hash_log2(uint32 num);
+ extern uint32 _hash_log2_floor(uint32 num);
  extern void _hash_checkpage(Relation rel, Buffer buf, int flags);
  
  /* hash.c */
*** ./include/utils/tuplesort.h.orig	2007-09-23 18:52:07.000000000 -0700
--- ./include/utils/tuplesort.h	2007-09-23 18:59:13.000000000 -0700
***************
*** 55,60 ****
--- 55,63 ----
  						Oid sortOperator, bool nullsFirstFlag,
  						int workMem, bool randomAccess);
  
+ extern void tuplesort_set_hashindex(Tuplesortstate *state);
+ extern void tuplesort_set_bktmask(Tuplesortstate *state, uint32 mask);
+ 
  extern void tuplesort_set_bound(Tuplesortstate *state, int64 bound);
  
  extern void tuplesort_puttupleslot(Tuplesortstate *state,

/*---------------------------------------------------------------------------
 * hashsort.c
 *---------------------------------------------------------------------------
 *
 * Functions needed to support initializing and sorting tuples in the
 * spool in hash.c
 */

#include "postgres.h"

#include "access/hash.h"
#include "miscadmin.h"
#include "storage/smgr.h"
#include "utils/tuplesort.h"

struct HSpool
{
	/* State data for tuplesort.c */
	Tuplesortstate *sortstate;
	Relation	index;
};

/* create and initialize the spool structure */
HSpool *
h_spoolinit(Relation index)
{
	HSpool	   *hspool;
	int			hKbytes;

	hspool = (HSpool *) palloc0(sizeof(HSpool));
	hspool->index = index;

	/*
	 * hKbytes is the amount of memory we are going to use to sort the
	 * spool.
	 */
	hKbytes = maintenance_work_mem;
	hspool->sortstate = tuplesort_begin_index(index, false, hKbytes, false);

	/*
	 * Substitute the default compare call-back function with a compare
	 * function specific to the hash index build.
	 */
	tuplesort_set_hashindex(hspool->sortstate);

	return hspool;
}

void
h_spool(IndexTuple itup, HSpool *hspool)
{
	tuplesort_putindextuple(hspool->sortstate, itup);
}

/*
 * h_bkt_num() estimates the number of buckets in the final hash table.
 */
uint32
h_bkt_num(uint32 tuples, Relation rel)
{
	int32		ffactor;
	int32		data_width;
	int32		item_width;
	uint32		bkt_num;

	/*
	 * Calculate the fill factor.  We could get this from the meta page,
	 * but at the point this function is called the meta page has not
	 * been created yet.
	 */
	data_width = get_typavgwidth(RelationGetDescr(rel)->attrs[0]->atttypid,
								 RelationGetDescr(rel)->attrs[0]->atttypmod);
	item_width = MAXALIGN(sizeof(IndexTupleData)) + MAXALIGN(data_width) +
		sizeof(ItemIdData);
	ffactor = RelationGetTargetPageUsage(rel, HASH_DEFAULT_FILLFACTOR) / item_width;
	if (ffactor < 10)
		ffactor = 10;

	bkt_num = tuples / ffactor;
	bkt_num = bkt_num + 1;

	return bkt_num;
}

/*
 * In order to define an order for the index tuples, a mask must be
 * applied to the 32 bit hash value of the index key during the sort
 * process.  For example, if approximately 4,555 buckets are needed, the
 * mask (or modulo) would be 8,191 (hex value 1FFF).
 */
void
h_set_bkt_mask(HSpool *spool, uint32 bkts)
{
	uint32		bkt_pwr2,
				bkt_mask;

	bkt_pwr2 = _hash_log2(bkts);
	bkt_mask = (1 << bkt_pwr2) - 1;

	tuplesort_set_bktmask(spool->sortstate, bkt_mask);
}

void
h_do_sort(HSpool *spool)
{
	tuplesort_performsort(spool->sortstate);
}

void
h_spooldestroy(HSpool *hspool)
{
	tuplesort_end(hspool->sortstate);
	pfree(hspool);
}

/*
 * h_print_spool() reads the itups back from the spool, which are now in
 * sorted order by hash value.  As each itup is read from the spool, it
 * is inserted into the hash index.
 */
void
h_print_spool(HSpool *spool)
{
	bool		isfirstnull;
	bool		should_free;
	IndexTuple	itup = NULL;
	Datum		attr1;
	TupleDesc	tupd = RelationGetDescr(spool->index);

	while ((itup = tuplesort_getindextuple(spool->sortstate,
										   true, &should_free)) != NULL)
	{
		attr1 = index_getattr(itup, 1, tupd, &isfirstnull);
		if (!isfirstnull)
			_hash_doinsert(spool->index, itup);
	}
}
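For readers who want to check the bucket arithmetic the patch relies on, here is a small standalone sketch (not part of the patch, illustration only). It mirrors the calculations in _hash_metapinit() and h_set_bkt_mask(): floor and ceiling log2, the low/high masks, and the sort mask, applied to the example figures quoted in the comments and README above (9 buckets, and estimates of 3124, 4555 and 5456). The helpers log2_floor and log2_ceil are local stand-ins for _hash_log2_floor() and _hash_log2(); no PostgreSQL headers are needed.

/*
 * Standalone illustration (not part of the patch) of the bucket
 * arithmetic used by the build: floor/ceil log2, the low/high masks
 * stored in the metapage, and the mask the spool's compare function
 * applies to each key's hash value.
 */
#include <stdio.h>

/* floor(lg2(num)), mirroring _hash_log2_floor() */
static unsigned
log2_floor(unsigned num)
{
	unsigned	i, limit;

	for (i = 0, limit = 1; limit < num; limit <<= 1, i++)
		;
	return (limit == num) ? i : i - 1;
}

/* ceil(lg2(num)), mirroring _hash_log2() */
static unsigned
log2_ceil(unsigned num)
{
	unsigned	i, limit;

	for (i = 0, limit = 1; limit < num; limit <<= 1, i++)
		;
	return i;
}

int
main(void)
{
	/* bucket-count examples taken from the patch comments and README */
	unsigned	examples[] = {9, 3124, 4555, 5456};
	int			n;

	for (n = 0; n < 4; n++)
	{
		unsigned	num_bkt = examples[n];
		unsigned	pwr2 = 1 << log2_floor(num_bkt);	/* as in _hash_metapinit() */
		unsigned	lowmask = pwr2 - 1;
		unsigned	highmask = (pwr2 << 1) - 1;
		unsigned	sortmask = (1 << log2_ceil(num_bkt)) - 1;	/* as in h_set_bkt_mask() */

		printf("num_bkt=%u lowmask=%u highmask=%u sortmask=0x%X\n",
			   num_bkt, lowmask, highmask, sortmask);
	}
	return 0;
}

Running it reproduces the figures in the text: 9 buckets yield low/high masks of 7 and 15, and 4,555 buckets yield a sort mask of 0x1FFF (8,191). The output can be checked against a real build by loading a one-column table from the generator at the top of this mail and creating a hash index on it.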