From 0fbf98e98a3c18730790c45f28a96d3f47d3b0c4 Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Fri, 5 Jul 2024 19:22:32 +0200 Subject: [PATCH v20240712 08/10] Use a single GIN tuplesort The previous approach was to sort the data in a private sort, then read it back, merge the GinTuples, and write it into the shared sort, to later be used by the shared tuple sort. The new approach is to use a single sort, merging tuples as we write them to disk. This reduces temporary disk space. An optimization was added to GinBuffer in which we don't deserialize tuples unless we need access to the itemIds. This modifies Tuplesort to have a new flushwrites callback. A sort's writetup can now decide to buffer writes until the next flushwrites() callback. --- src/backend/access/gin/gininsert.c | 427 +++++++++------------ src/backend/utils/sort/tuplesort.c | 5 + src/backend/utils/sort/tuplesortvariants.c | 102 ++++- src/include/access/gin_private.h | 3 + src/include/access/gin_tuple.h | 10 + src/include/utils/tuplesort.h | 10 +- 6 files changed, 307 insertions(+), 250 deletions(-) diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index f79c9a7d83..e02cb6d0e6 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -163,14 +163,6 @@ typedef struct * build callback etc. */ Tuplesortstate *bs_sortstate; - - /* - * The sortstate used only within a single worker for the first merge pass - * happenning there. In principle it doesn't need to be part of the build - * state and we could pass it around directly, but it's more convenient - * this way. And it's part of the build state, after all. 
- */ - Tuplesortstate *bs_worker_sort; } GinBuildState; @@ -195,8 +187,7 @@ static Datum _gin_parse_tuple_key(GinTuple *a); static GinTuple *_gin_build_tuple(GinBuildState *state, OffsetNumber attrnum, unsigned char category, Datum key, int16 typlen, bool typbyval, - ItemPointerData *items, uint32 nitems, - Size *len); + ItemPointerData *items, uint32 nitems); /* * Adds array of item pointers to tuple's posting list, or @@ -499,16 +490,15 @@ ginFlushBuildState(GinBuildState *buildstate, Relation index) /* GIN tuple and tuple length */ GinTuple *tup; - Size tuplen; /* there could be many entries, so be willing to abort here */ CHECK_FOR_INTERRUPTS(); tup = _gin_build_tuple(buildstate, attnum, category, key, attr->attlen, attr->attbyval, - list, nlist, &tuplen); + list, nlist); - tuplesort_putgintuple(buildstate->bs_worker_sort, tup, tuplen); + tuplesort_putgintuple(buildstate->bs_sortstate, tup); pfree(tup); } @@ -1169,8 +1159,14 @@ _gin_parallel_heapscan(GinBuildState *state) * synchronized (and thus may wrap around), and when combininng values from * multiple workers. */ -typedef struct GinBuffer +struct GinBuffer { + /* + * The memory context holds the dynamic allocation of items, key, and any + * produced GinTuples. 
+ */ + MemoryContext context; + GinTuple *cached; /* copy of previous GIN tuple */ OffsetNumber attnum; GinNullCategory category; Datum key; /* 0 if no key (and keylen == 0) */ @@ -1188,7 +1184,7 @@ typedef struct GinBuffer int nfrozen; SortSupport ssup; /* for sorting/comparing keys */ ItemPointerData *items; -} GinBuffer; +}; /* * Check that TID array contains valid values, and that it's sorted (if we @@ -1203,8 +1199,7 @@ AssertCheckItemPointers(GinBuffer *buffer, bool sorted) { #ifdef USE_ASSERT_CHECKING /* we should not have a buffer with no TIDs to sort */ - Assert(buffer->items != NULL); - Assert(buffer->nitems > 0); + Assert(buffer->nitems == 0 || buffer->items != NULL); for (int i = 0; i < buffer->nitems; i++) { @@ -1224,7 +1219,7 @@ AssertCheckGinBuffer(GinBuffer *buffer) { #ifdef USE_ASSERT_CHECKING /* if we have any items, the array must exist */ - Assert(!((buffer->nitems > 0) && (buffer->items == NULL))); + Assert((buffer->nitems == 0) || (buffer->items != NULL)); /* * we don't know if the TID array is expected to be sorted or not @@ -1244,7 +1239,7 @@ AssertCheckGinBuffer(GinBuffer *buffer) * * Initializes sort support procedures for all index attributes. */ -static GinBuffer * +GinBuffer * GinBufferInit(Relation index) { GinBuffer *buffer = palloc0(sizeof(GinBuffer)); @@ -1294,15 +1289,18 @@ GinBufferInit(Relation index) PrepareSortSupportFromOrderingOp(typentry->lt_opr, sortKey); } + buffer->context = GenerationContextCreate(CurrentMemoryContext, + "Gin Buffer", + ALLOCSET_DEFAULT_SIZES); return buffer; } /* Is the buffer empty, i.e. has no TID values in the array? 
*/ -static bool +bool GinBufferIsEmpty(GinBuffer *buffer) { - return (buffer->nitems == 0); + return (buffer->nitems == 0 && buffer->cached == NULL); } /* @@ -1314,37 +1312,71 @@ GinBufferIsEmpty(GinBuffer *buffer) static bool GinBufferKeyEquals(GinBuffer *buffer, GinTuple *tup) { + MemoryContext prev; int r; + AttrNumber attnum; Datum tupkey; + Datum bufkey; AssertCheckGinBuffer(buffer); + if (buffer->cached) + { + GinTuple *cached = buffer->cached; - if (tup->attrnum != buffer->attnum) - return false; + if (tup->attrnum != cached->attrnum) + return false; - /* same attribute should have the same type info */ - Assert(tup->typbyval == buffer->typbyval); - Assert(tup->typlen == buffer->typlen); + Assert(tup->typbyval == cached->typbyval); + Assert(tup->typlen == cached->typlen); - if (tup->category != buffer->category) - return false; + if (tup->category != cached->category) + return false; - /* - * For NULL/empty keys, this means equality, for normal keys we need to - * compare the actual key value. - */ - if (buffer->category != GIN_CAT_NORM_KEY) - return true; + /* + * For NULL/empty keys, this means equality, for normal keys we need to + * compare the actual key value. + */ + if (cached->category != GIN_CAT_NORM_KEY) + return true; + + attnum = cached->attrnum; + bufkey = _gin_parse_tuple_key(cached); + } + else + { + if (tup->attrnum != buffer->attnum) + return false; + + /* same attribute should have the same type info */ + Assert(tup->typbyval == buffer->typbyval); + Assert(tup->typlen == buffer->typlen); + + if (tup->category != buffer->category) + return false; + + /* + * For NULL/empty keys, this means equality, for normal keys we need to + * compare the actual key value. + */ + if (buffer->category != GIN_CAT_NORM_KEY) + return true; + attnum = buffer->attnum; + bufkey = buffer->key; + } /* * For the tuple, get either the first sizeof(Datum) bytes for byval * types, or a pointer to the beginning of the data array. */ - tupkey = (buffer->typbyval) ? 
*(Datum *) tup->data : PointerGetDatum(tup->data); + tupkey = _gin_parse_tuple_key(tup); + + prev = MemoryContextSwitchTo(buffer->context); - r = ApplySortComparator(buffer->key, false, + r = ApplySortComparator(bufkey, false, tupkey, false, - &buffer->ssup[buffer->attnum - 1]); + &buffer->ssup[attnum - 1]); + + MemoryContextSwitchTo(prev); return (r == 0); } @@ -1397,6 +1429,55 @@ GinBufferShouldTrim(GinBuffer *buffer, GinTuple *tup) return true; } +static void +GinBufferUnpackCached(GinBuffer *buffer, int reserve_space) +{ + Datum key; + ItemPointer items; + GinTuple *cached; + int totitems; + + cached = buffer->cached; + totitems = cached->nitems + reserve_space; + key = _gin_parse_tuple_key(cached); + + buffer->category = cached->category; + buffer->keylen = cached->keylen; + buffer->attnum = cached->attrnum; + + buffer->typlen = cached->typlen; + buffer->typbyval = cached->typbyval; + + if (cached->category == GIN_CAT_NORM_KEY) + buffer->key = datumCopy(key, buffer->typbyval, buffer->typlen); + else + buffer->key = (Datum) 0; + + items = _gin_parse_tuple_items(cached); + + if (buffer->items == NULL) + { + buffer->items = palloc0(totitems * sizeof(ItemPointerData)); + buffer->maxitems = totitems; + } + else if (buffer->maxitems < totitems) + { + buffer->items = repalloc(buffer->items, + totitems * sizeof(ItemPointerData)); + buffer->maxitems = totitems; + } + else + Assert(PointerIsValid(buffer->items) && + buffer->maxitems >= totitems); + /* copy all of the cached tuple's TIDs, not the buffer's old count */ + memcpy(buffer->items, items, cached->nitems * sizeof(ItemPointerData)); + buffer->nitems = cached->nitems; + + buffer->cached = NULL; + pfree(cached); + pfree(items); +} + /* * GinBufferStoreTuple * Add data (especially TID list) from a GIN tuple to the buffer. @@ -1431,32 +1512,28 @@ GinBufferShouldTrim(GinBuffer *buffer, GinTuple *tup) * as that does palloc internally, but if we detected the append case, * we could do without it. Not sure how much overhead it is, though. 
*/ -static void -GinBufferStoreTuple(GinBuffer *buffer, GinTuple *tup) +void +GinBufferMergeTuple(GinBuffer *buffer, GinTuple *tup) { + MemoryContext prev; ItemPointerData *items; - Datum key; + prev = MemoryContextSwitchTo(buffer->context); AssertCheckGinBuffer(buffer); - key = _gin_parse_tuple_key(tup); - items = _gin_parse_tuple_items(tup); - /* if the buffer is empty, set the fields (and copy the key) */ if (GinBufferIsEmpty(buffer)) { - buffer->category = tup->category; - buffer->keylen = tup->keylen; - buffer->attnum = tup->attrnum; - - buffer->typlen = tup->typlen; - buffer->typbyval = tup->typbyval; - - if (tup->category == GIN_CAT_NORM_KEY) - buffer->key = datumCopy(key, buffer->typbyval, buffer->typlen); - else - buffer->key = (Datum) 0; + GinTuple *tuple = palloc(tup->tuplen); + memcpy(tuple, tup, tup->tuplen); + buffer->cached = tuple; } + else if (buffer->cached != NULL) + { + GinBufferUnpackCached(buffer, tup->nitems); + } + + items = _gin_parse_tuple_items(tup); /* * Try freeze TIDs at the beginning of the list, i.e. exclude them from @@ -1530,6 +1607,33 @@ GinBufferStoreTuple(GinBuffer *buffer, GinTuple *tup) /* free the decompressed TID list */ pfree(items); + + MemoryContextSwitchTo(prev); +} + +GinTuple * +GinBufferBuildTuple(GinBuffer *buffer) +{ + MemoryContext prev = MemoryContextSwitchTo(buffer->context); + GinTuple *result; + + if (buffer->cached) + { + result = buffer->cached; + buffer->cached = NULL; + } + else + { + result = _gin_build_tuple(NULL, buffer->attnum, buffer->category, + buffer->key, buffer->typlen, + buffer->typbyval, buffer->items, + buffer->nitems); + } + + GinBufferReset(buffer); + + MemoryContextSwitchTo(prev); + return result; } /* @@ -1543,14 +1647,21 @@ GinBufferStoreTuple(GinBuffer *buffer, GinTuple *tup) * * XXX Might be better to have a separate memory context for the buffer. 
*/ -static void +void GinBufferReset(GinBuffer *buffer) { Assert(!GinBufferIsEmpty(buffer)); - /* release byref values, do nothing for by-val ones */ - if ((buffer->category == GIN_CAT_NORM_KEY) && !buffer->typbyval) - pfree(DatumGetPointer(buffer->key)); + /* release cached buffer tuple, if present */ + if (buffer->cached) + pfree(buffer->cached); + else + { + /* release byref values, do nothing for by-val ones */ + if ((buffer->category == GIN_CAT_NORM_KEY) && !buffer->typbyval + && PointerIsValid(DatumGetPointer(buffer->key))) + pfree(DatumGetPointer(buffer->key)); + } /* * Not required, but makes it more likely to trigger NULL derefefence if @@ -1566,6 +1677,7 @@ GinBufferReset(GinBuffer *buffer) buffer->typlen = 0; buffer->typbyval = 0; + /* Note that we don't reset the memory context, this is deliberate */ } /* @@ -1589,7 +1701,7 @@ GinBufferTrim(GinBuffer *buffer) * GinBufferFree * Release memory associated with the GinBuffer (including TID array). */ -static void +void GinBufferFree(GinBuffer *buffer) { if (buffer->items) @@ -1600,6 +1712,7 @@ GinBufferFree(GinBuffer *buffer) (buffer->category == GIN_CAT_NORM_KEY) && !buffer->typbyval) pfree(DatumGetPointer(buffer->key)); + MemoryContextDelete(buffer->context); pfree(buffer); } @@ -1613,7 +1726,7 @@ GinBufferFree(GinBuffer *buffer) * the TID array, and returning false if it's too large (more thant work_mem, * for example). */ -static bool +bool GinBufferCanAddKey(GinBuffer *buffer, GinTuple *tup) { /* empty buffer can accept data for any key */ @@ -1690,6 +1803,7 @@ _gin_parallel_merge(GinBuildState *state) * GinTuple. */ AssertCheckItemPointers(buffer, true); + Assert(!PointerIsValid(buffer->cached)); ginEntryInsert(&state->ginstate, buffer->attnum, buffer->key, buffer->category, @@ -1718,6 +1832,7 @@ _gin_parallel_merge(GinBuildState *state) * GinTuple. 
*/ AssertCheckItemPointers(buffer, true); + Assert(!PointerIsValid(buffer->cached)); ginEntryInsert(&state->ginstate, buffer->attnum, buffer->key, buffer->category, @@ -1731,7 +1846,10 @@ _gin_parallel_merge(GinBuildState *state) * Remember data for the current tuple (either remember the new key, * or append if to the existing data). */ - GinBufferStoreTuple(buffer, tup); + GinBufferMergeTuple(buffer, tup); + + if (buffer->cached) + GinBufferUnpackCached(buffer, 0); } /* flush data remaining in the buffer (for the last key) */ @@ -1739,6 +1857,7 @@ _gin_parallel_merge(GinBuildState *state) { AssertCheckItemPointers(buffer, true); + Assert(!PointerIsValid(buffer->cached)); ginEntryInsert(&state->ginstate, buffer->attnum, buffer->key, buffer->category, buffer->items, buffer->nitems, &state->buildStats); @@ -1790,162 +1909,6 @@ _gin_leader_participate_as_worker(GinBuildState *buildstate, Relation heap, Rela ginleader->sharedsort, heap, index, sortmem, true); } -/* - * _gin_process_worker_data - * First phase of the key merging, happening in the worker. - * - * Depending on the number of distinct keys, the TID lists produced by the - * callback may be very short (due to frequent evictions in the callback). - * But combining many tiny lists is expensive, so we try to do as much as - * possible in the workers and only then pass the results to the leader. - * - * We read the tuples sorted by the key, and merge them into larger lists. - * At the moment there's no memory limit, so this will just produce one - * huge (sorted) list per key in each worker. Which means the leader will - * do a very limited number of mergesorts, which is good. - */ -static void -_gin_process_worker_data(GinBuildState *state, Tuplesortstate *worker_sort) -{ - GinTuple *tup; - Size tuplen; - - GinBuffer *buffer; - - /* - * Initialize buffer to combine entries for the same key. - * - * The workers are limited to the same amount of memory as during the sort - * in ginBuildCallbackParallel. 
But this probably should be the 32MB used - * during planning, just like there. - */ - buffer = GinBufferInit(state->ginstate.index); - - /* sort the raw per-worker data */ - tuplesort_performsort(state->bs_worker_sort); - - /* print some basic info */ - elog(LOG, "_gin_parallel_scan_and_build raw %zu compressed %zu ratio %.2f%%", - state->buildStats.sizeRaw, state->buildStats.sizeCompressed, - (100.0 * state->buildStats.sizeCompressed) / state->buildStats.sizeRaw); - - /* reset before the second phase */ - state->buildStats.sizeCompressed = 0; - state->buildStats.sizeRaw = 0; - - /* - * Read the GIN tuples from the shared tuplesort, sorted by the key, and - * merge them into larger chunks for the leader to combine. - */ - while ((tup = tuplesort_getgintuple(worker_sort, &tuplen, true)) != NULL) - { - - CHECK_FOR_INTERRUPTS(); - - /* - * If the buffer can accept the new GIN tuple, just store it there and - * we're done. If it's a different key (or maybe too much data) flush - * the current contents into the index first. - */ - if (!GinBufferCanAddKey(buffer, tup)) - { - GinTuple *ntup; - Size ntuplen; - - /* - * Buffer is not empty and it's storing a different key - flush - * the data into the insert, and start a new entry for current - * GinTuple. - */ - AssertCheckItemPointers(buffer, true); - - ntup = _gin_build_tuple(state, buffer->attnum, buffer->category, - buffer->key, buffer->typlen, buffer->typbyval, - buffer->items, buffer->nitems, &ntuplen); - - tuplesort_putgintuple(state->bs_sortstate, ntup, ntuplen); - - pfree(ntup); - - /* discard the existing data */ - GinBufferReset(buffer); - } - - /* - * We're about to add a GIN tuple to the buffer - check the memory - * limit first, and maybe write out some of the data into the index - * first, if needed (and possible). We only flush the part of the TID - * list that we know won't change, and only if there's enough data for - * compression to work well. 
- */ - if (GinBufferShouldTrim(buffer, tup)) - { - GinTuple *ntup; - Size ntuplen; - - Assert(buffer->nfrozen > 0); - - state->buildStats.nTrims++; - - /* - * Buffer is not empty and it's storing a different key - flush - * the data into the insert, and start a new entry for current - * GinTuple. - */ - AssertCheckItemPointers(buffer, true); - - ntup = _gin_build_tuple(state, buffer->attnum, buffer->category, - buffer->key, buffer->typlen, buffer->typbyval, - buffer->items, buffer->nfrozen, &ntuplen); - - tuplesort_putgintuple(state->bs_sortstate, ntup, ntuplen); - - pfree(ntup); - - /* truncate the data we've just discarded */ - GinBufferTrim(buffer); - } - - /* - * Remember data for the current tuple (either remember the new key, - * or append if to the existing data). - */ - GinBufferStoreTuple(buffer, tup); - } - - /* flush data remaining in the buffer (for the last key) */ - if (!GinBufferIsEmpty(buffer)) - { - GinTuple *ntup; - Size ntuplen; - - AssertCheckItemPointers(buffer, true); - - ntup = _gin_build_tuple(state, buffer->attnum, buffer->category, - buffer->key, buffer->typlen, buffer->typbyval, - buffer->items, buffer->nitems, &ntuplen); - - tuplesort_putgintuple(state->bs_sortstate, ntup, ntuplen); - - pfree(ntup); - - /* discard the existing data */ - GinBufferReset(buffer); - } - - /* relase all the memory */ - GinBufferFree(buffer); - - /* print some basic info */ - elog(LOG, "_gin_process_worker_data raw %zu compressed %zu ratio %.2f%%", - state->buildStats.sizeRaw, state->buildStats.sizeCompressed, - (100.0 * state->buildStats.sizeCompressed) / state->buildStats.sizeRaw); - - elog(LOG, "_gin_process_worker_data trims " INT64_FORMAT, state->buildStats.nTrims); - - tuplesort_end(worker_sort); -} - /* * Perform a worker's portion of a parallel sort. 
* @@ -1978,11 +1941,6 @@ _gin_parallel_scan_and_build(GinBuildState *state, sortmem, coordinate, TUPLESORT_NONE); - /* Local per-worker sort of raw-data */ - state->bs_worker_sort = tuplesort_begin_index_gin(heap, index, - sortmem, NULL, - TUPLESORT_NONE); - /* Join parallel scan */ indexInfo = BuildIndexInfo(index); indexInfo->ii_Concurrent = ginshared->isconcurrent; @@ -1996,13 +1954,6 @@ _gin_parallel_scan_and_build(GinBuildState *state, /* write remaining accumulated entries */ ginFlushBuildState(state, index); - /* - * Do the first phase of in-worker processing - sort the data produced by - * the callback, and combine them into much larger chunks and place that - * into the shared tuplestore for leader to process. - */ - _gin_process_worker_data(state, state->bs_worker_sort); - /* sort the GIN tuples built by this worker */ tuplesort_performsort(state->bs_sortstate); @@ -2159,8 +2110,7 @@ static GinTuple * _gin_build_tuple(GinBuildState *state, OffsetNumber attrnum, unsigned char category, Datum key, int16 typlen, bool typbyval, - ItemPointerData *items, uint32 nitems, - Size *len) + ItemPointerData *items, uint32 nitems) { GinTuple *tuple; char *ptr; @@ -2228,8 +2178,6 @@ _gin_build_tuple(GinBuildState *state, */ tuplen = MAXALIGN(offsetof(GinTuple, data) + keylen) + compresslen; - *len = tuplen; - /* * Allocate space for the whole GIN tuple. * @@ -2291,12 +2239,15 @@ _gin_build_tuple(GinBuildState *state, pfree(seginfo); } - /* how large would the tuple be without compression? */ - state->buildStats.sizeRaw += MAXALIGN(offsetof(GinTuple, data) + keylen) + - nitems * sizeof(ItemPointerData); + if (state) + { + /* how large would the tuple be without compression? 
*/ + state->buildStats.sizeRaw += MAXALIGN(offsetof(GinTuple, data) + keylen) + + nitems * sizeof(ItemPointerData); - /* compressed size */ - state->buildStats.sizeCompressed += tuplen; + /* compressed size */ + state->buildStats.sizeCompressed += tuplen; + } return tuple; } diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c index a3921373c5..cfaa17d9bc 100644 --- a/src/backend/utils/sort/tuplesort.c +++ b/src/backend/utils/sort/tuplesort.c @@ -399,6 +399,7 @@ struct Sharedsort #define REMOVEABBREV(state,stup,count) ((*(state)->base.removeabbrev) (state, stup, count)) #define COMPARETUP(state,a,b) ((*(state)->base.comparetup) (a, b, state)) #define WRITETUP(state,tape,stup) ((*(state)->base.writetup) (state, tape, stup)) +#define FLUSHWRITES(state,tape) ((state)->base.flushwrites ? (*(state)->base.flushwrites) (state, tape) : (void) 0) #define READTUP(state,stup,tape,len) ((*(state)->base.readtup) (state, stup, tape, len)) #define FREESTATE(state) ((state)->base.freestate ? (*(state)->base.freestate) (state) : (void) 0) #define LACKMEM(state) ((state)->availMem < 0 && !(state)->slabAllocatorUsed) @@ -2277,6 +2278,8 @@ mergeonerun(Tuplesortstate *state) } } + FLUSHWRITES(state, state->destTape); + /* * When the heap empties, we're done. Write an end-of-run marker on the * output tape. 
@@ -2406,6 +2409,8 @@ dumptuples(Tuplesortstate *state, bool alltuples) WRITETUP(state, state->destTape, stup); } + FLUSHWRITES(state, state->destTape); + state->memtupcount = 0; /* diff --git a/src/backend/utils/sort/tuplesortvariants.c b/src/backend/utils/sort/tuplesortvariants.c index ea15af104d..516c85f80e 100644 --- a/src/backend/utils/sort/tuplesortvariants.c +++ b/src/backend/utils/sort/tuplesortvariants.c @@ -32,6 +32,7 @@ #include "utils/guc.h" #include "utils/lsyscache.h" #include "utils/tuplesort.h" +#include "access/gin.h" /* sort-type codes for sort__start probes */ @@ -90,6 +91,7 @@ static void readtup_index_brin(Tuplesortstate *state, SortTuple *stup, LogicalTape *tape, unsigned int len); static void writetup_index_gin(Tuplesortstate *state, LogicalTape *tape, SortTuple *stup); +static void flushwrites_index_gin(Tuplesortstate *state, LogicalTape *tape); static void readtup_index_gin(Tuplesortstate *state, SortTuple *stup, LogicalTape *tape, unsigned int len); static int comparetup_datum(const SortTuple *a, const SortTuple *b, @@ -101,6 +103,7 @@ static void writetup_datum(Tuplesortstate *state, LogicalTape *tape, static void readtup_datum(Tuplesortstate *state, SortTuple *stup, LogicalTape *tape, unsigned int len); static void freestate_cluster(Tuplesortstate *state); +static void freestate_index_gin(Tuplesortstate *state); /* * Data structure pointed by "TuplesortPublic.arg" for the CLUSTER case. Set by @@ -135,6 +138,16 @@ typedef struct bool uniqueNullsNotDistinct; /* unique constraint null treatment */ } TuplesortIndexBTreeArg; +/* + * Data structure pointed by "TuplesortPublic.arg" for the index_gin subcase. + */ +typedef struct +{ + TuplesortIndexArg index; + GinBuffer *buffer; +} TuplesortIndexGinArg; + + /* * Data structure pointed by "TuplesortPublic.arg" for the index_hash subcase. 
*/ @@ -211,6 +224,7 @@ tuplesort_begin_heap(TupleDesc tupDesc, base->comparetup = comparetup_heap; base->comparetup_tiebreak = comparetup_heap_tiebreak; base->writetup = writetup_heap; + base->flushwrites = NULL; base->readtup = readtup_heap; base->haveDatum1 = true; base->arg = tupDesc; /* assume we need not copy tupDesc */ @@ -289,6 +303,7 @@ tuplesort_begin_cluster(TupleDesc tupDesc, base->comparetup = comparetup_cluster; base->comparetup_tiebreak = comparetup_cluster_tiebreak; base->writetup = writetup_cluster; + base->flushwrites = NULL; base->readtup = readtup_cluster; base->freestate = freestate_cluster; base->arg = arg; @@ -399,6 +414,7 @@ tuplesort_begin_index_btree(Relation heapRel, base->comparetup = comparetup_index_btree; base->comparetup_tiebreak = comparetup_index_btree_tiebreak; base->writetup = writetup_index; + base->flushwrites = NULL; base->readtup = readtup_index; base->haveDatum1 = true; base->arg = arg; @@ -480,6 +496,7 @@ tuplesort_begin_index_hash(Relation heapRel, base->comparetup = comparetup_index_hash; base->comparetup_tiebreak = comparetup_index_hash_tiebreak; base->writetup = writetup_index; + base->flushwrites = NULL; base->readtup = readtup_index; base->haveDatum1 = true; base->arg = arg; @@ -526,6 +543,7 @@ tuplesort_begin_index_gist(Relation heapRel, base->comparetup = comparetup_index_btree; base->comparetup_tiebreak = comparetup_index_btree_tiebreak; base->writetup = writetup_index; + base->flushwrites = NULL; base->readtup = readtup_index; base->haveDatum1 = true; base->arg = arg; @@ -583,6 +601,7 @@ tuplesort_begin_index_brin(int workMem, base->removeabbrev = removeabbrev_index_brin; base->comparetup = comparetup_index_brin; base->writetup = writetup_index_brin; + base->flushwrites = NULL; base->readtup = readtup_index_brin; base->haveDatum1 = true; base->arg = NULL; @@ -602,6 +621,7 @@ tuplesort_begin_index_gin(Relation heapRel, Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate, sortopt); TuplesortPublic 
*base = TuplesortstateGetPublic(state); + TuplesortIndexGinArg *arg; MemoryContext oldcontext; int i; TupleDesc desc = RelationGetDescr(indexRel); @@ -626,6 +646,10 @@ tuplesort_begin_index_gin(Relation heapRel, /* Prepare SortSupport data for each column */ base->sortKeys = (SortSupport) palloc0(base->nKeys * sizeof(SortSupportData)); + arg = palloc0(sizeof(TuplesortIndexGinArg)); + arg->index.indexRel = indexRel; + arg->index.heapRel = heapRel; + arg->buffer = GinBufferInit(indexRel); for (i = 0; i < base->nKeys; i++) { @@ -657,9 +681,11 @@ tuplesort_begin_index_gin(Relation heapRel, base->removeabbrev = removeabbrev_index_gin; base->comparetup = comparetup_index_gin; base->writetup = writetup_index_gin; + base->flushwrites = flushwrites_index_gin; base->readtup = readtup_index_gin; + base->freestate = freestate_index_gin; base->haveDatum1 = false; - base->arg = NULL; + base->arg = arg; MemoryContextSwitchTo(oldcontext); @@ -702,6 +728,7 @@ tuplesort_begin_datum(Oid datumType, Oid sortOperator, Oid sortCollation, base->comparetup = comparetup_datum; base->comparetup_tiebreak = comparetup_datum_tiebreak; base->writetup = writetup_datum; + base->flushwrites = NULL; base->readtup = readtup_datum; base->haveDatum1 = true; base->arg = arg; @@ -904,17 +931,17 @@ tuplesort_putbrintuple(Tuplesortstate *state, BrinTuple *tuple, Size size) } void -tuplesort_putgintuple(Tuplesortstate *state, GinTuple *tuple, Size size) +tuplesort_putgintuple(Tuplesortstate *state, GinTuple *tuple) { SortTuple stup; GinTuple *ctup; TuplesortPublic *base = TuplesortstateGetPublic(state); MemoryContext oldcontext = MemoryContextSwitchTo(base->tuplecontext); - Size tuplen; + Size tuplen = tuple->tuplen; /* copy the GinTuple into the right memory context */ - ctup = palloc(size); - memcpy(ctup, tuple, size); + ctup = palloc(tuplen); + memcpy(ctup, tuple, tuplen); stup.tuple = ctup; stup.datum1 = (Datum) 0; @@ -922,7 +949,7 @@ tuplesort_putgintuple(Tuplesortstate *state, GinTuple *tuple, Size 
size) /* GetMemoryChunkSpace is not supported for bump contexts */ if (TupleSortUseBumpTupleCxt(base->sortopt)) - tuplen = MAXALIGN(size); + tuplen = MAXALIGN(tuplen); else tuplen = GetMemoryChunkSpace(ctup); @@ -1942,19 +1969,61 @@ comparetup_index_gin(const SortTuple *a, const SortTuple *b, } static void -writetup_index_gin(Tuplesortstate *state, LogicalTape *tape, SortTuple *stup) +_writetup_index_gin(Tuplesortstate *state, LogicalTape *tape, GinTuple *tup) { TuplesortPublic *base = TuplesortstateGetPublic(state); - GinTuple *tuple = (GinTuple *) stup->tuple; - unsigned int tuplen = tuple->tuplen; - + unsigned int tuplen = tup->tuplen; tuplen = tuplen + sizeof(tuplen); + LogicalTapeWrite(tape, &tuplen, sizeof(tuplen)); - LogicalTapeWrite(tape, tuple, tuple->tuplen); + LogicalTapeWrite(tape, tup, tup->tuplen); + if (base->sortopt & TUPLESORT_RANDOMACCESS) /* need trailing length word? */ LogicalTapeWrite(tape, &tuplen, sizeof(tuplen)); } +static void +writetup_index_gin(Tuplesortstate *state, LogicalTape *tape, SortTuple *stup) +{ + TuplesortPublic *base = TuplesortstateGetPublic(state); + GinTuple *otup; + GinTuple *ntup = (GinTuple *) stup->tuple; + TuplesortIndexGinArg *arg = (TuplesortIndexGinArg *) base->arg; + + Assert(PointerIsValid(arg)); + + if (GinBufferCanAddKey(arg->buffer, ntup)) + { + GinBufferMergeTuple(arg->buffer, ntup); + return; + } + + otup = GinBufferBuildTuple(arg->buffer); + + _writetup_index_gin(state, tape, otup); + + pfree(otup); + + Assert(GinBufferCanAddKey(arg->buffer, ntup)); + + GinBufferMergeTuple(arg->buffer, ntup); +} + +static void +flushwrites_index_gin(Tuplesortstate *state, LogicalTape *tape) +{ + TuplesortPublic *base = TuplesortstateGetPublic(state); + TuplesortIndexGinArg *arg = (TuplesortIndexGinArg *) base->arg; + + if (!GinBufferIsEmpty(arg->buffer)) + { + GinTuple *tuple = GinBufferBuildTuple(arg->buffer); + _writetup_index_gin(state, tape, tuple); + pfree(tuple); + Assert(GinBufferIsEmpty(arg->buffer)); + } +} + 
static void readtup_index_gin(Tuplesortstate *state, SortTuple *stup, LogicalTape *tape, unsigned int len) @@ -1980,6 +2049,17 @@ readtup_index_gin(Tuplesortstate *state, SortTuple *stup, stup->datum1 = (Datum) 0; } +static void +freestate_index_gin(Tuplesortstate *state) +{ + TuplesortPublic *base = TuplesortstateGetPublic(state); + TuplesortIndexGinArg *arg = (TuplesortIndexGinArg *) base->arg; + + Assert(arg != NULL); + Assert(GinBufferIsEmpty(arg->buffer)); + GinBufferFree(arg->buffer); +} + /* * Routines specialized for DatumTuple case */ diff --git a/src/include/access/gin_private.h b/src/include/access/gin_private.h index 3013a44bae..149191b7df 100644 --- a/src/include/access/gin_private.h +++ b/src/include/access/gin_private.h @@ -475,6 +475,9 @@ extern int ginPostingListDecodeAllSegmentsToTbm(GinPostingList *ptr, int len, TI extern ItemPointer ginPostingListDecodeAllSegments(GinPostingList *segment, int len, int *ndecoded_out); +extern bool ginPostingListDecodeAllSegmentsInto(GinPostingList *segment, int len, + ItemPointer into, int capacity, + int *ndecoded_out); extern ItemPointer ginPostingListDecode(GinPostingList *plist, int *ndecoded_out); extern ItemPointer ginMergeItemPointers(ItemPointerData *a, uint32 na, ItemPointerData *b, uint32 nb, diff --git a/src/include/access/gin_tuple.h b/src/include/access/gin_tuple.h index 55dd8544b2..4ac8cfcc2b 100644 --- a/src/include/access/gin_tuple.h +++ b/src/include/access/gin_tuple.h @@ -35,6 +35,16 @@ typedef struct GinTuple char data[FLEXIBLE_ARRAY_MEMBER]; } GinTuple; +typedef struct GinBuffer GinBuffer; + extern int _gin_compare_tuples(GinTuple *a, GinTuple *b, SortSupport ssup); +extern GinBuffer *GinBufferInit(Relation index); +extern bool GinBufferIsEmpty(GinBuffer *buffer); +extern bool GinBufferCanAddKey(GinBuffer *buffer, GinTuple *tup); +extern void GinBufferReset(GinBuffer *buffer); +extern void GinBufferFree(GinBuffer *buffer); +extern void GinBufferMergeTuple(GinBuffer *buffer, GinTuple *tup); 
+extern GinTuple *GinBufferBuildTuple(GinBuffer *buffer); + #endif /* GIN_TUPLE_H */ diff --git a/src/include/utils/tuplesort.h b/src/include/utils/tuplesort.h index 0ed71ae922..6c56e40bff 100644 --- a/src/include/utils/tuplesort.h +++ b/src/include/utils/tuplesort.h @@ -194,6 +194,14 @@ typedef struct */ void (*writetup) (Tuplesortstate *state, LogicalTape *tape, SortTuple *stup); + /* + * Flush any buffered writetup() writes. + * + * This is useful when writetup() buffers writes for more efficient + * use of the tape's resources, e.g. when deduplicating or merging + * values. + */ + void (*flushwrites) (Tuplesortstate *state, LogicalTape *tape); /* * Function to read a stored tuple from tape back into memory. 'len' is @@ -461,7 +469,7 @@ extern void tuplesort_putindextuplevalues(Tuplesortstate *state, Relation rel, ItemPointer self, const Datum *values, const bool *isnull); extern void tuplesort_putbrintuple(Tuplesortstate *state, BrinTuple *tuple, Size size); -extern void tuplesort_putgintuple(Tuplesortstate *state, GinTuple *tuple, Size size); +extern void tuplesort_putgintuple(Tuplesortstate *state, struct GinTuple *tuple); extern void tuplesort_putdatum(Tuplesortstate *state, Datum val, bool isNull); -- 2.39.2