From 75b252aa2fb1058dd72580024c3c45313b732655 Mon Sep 17 00:00:00 2001 From: Tomas Vondra Date: Thu, 20 Jun 2024 20:50:51 +0200 Subject: [PATCH v20240705 7/9] Detect wrap around in parallel callback When sync scan during index build wraps around, that may result in some keys having very long TID lists, requiring "full" merge sort runs when combining data in workers. It also causes problems with enforcing memory limit, because we can't just dump the data - the index build requires append-only posting lists, and violating may result in errors like ERROR: could not split GIN page; all old items didn't fit because after the scan wrap around some of the TIDs may belong to the beginning of the list, affecting the compression. But we can deal with this in the callback - if we see the TID to jump back, that must mean a wraparound happened. In that case we simply dump all the data accumulated in memory, and start from scratch. This means there won't be any tuples with very wide TID ranges, instead there'll be one tuple with a range at the end of the table, and another tuple at the beginning. And all the lists in the worker will be non-overlapping, and sort nicely based on first TID. For the leader, we still need to do the full merge - the lists may overlap and interleave in various ways. But there should be only very few of those lists, about one per worker, making it not an issue. --- src/backend/access/gin/gininsert.c | 132 ++++++++++++++--------------- 1 file changed, 63 insertions(+), 69 deletions(-) diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index cc380f0359..4483eedcbe 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -143,6 +143,7 @@ typedef struct MemoryContext tmpCtx; MemoryContext funcCtx; BuildAccumulator accum; + ItemPointerData tid; /* FIXME likely duplicate with indtuples */ double bs_numtuples; @@ -474,6 +475,47 @@ ginBuildCallback(Relation index, ItemPointer tid, Datum *values, MemoryContextSwitchTo(oldCtx); } +/* + * ginFlushBuildState + * Write all data from BuildAccumulator into the tuplesort. + */ +static void +ginFlushBuildState(GinBuildState *buildstate, Relation index) +{ + ItemPointerData *list; + Datum key; + GinNullCategory category; + uint32 nlist; + OffsetNumber attnum; + TupleDesc tdesc = RelationGetDescr(index); + + ginBeginBAScan(&buildstate->accum); + while ((list = ginGetBAEntry(&buildstate->accum, + &attnum, &key, &category, &nlist)) != NULL) + { + /* information about the key */ + Form_pg_attribute attr = TupleDescAttr(tdesc, (attnum - 1)); + + /* GIN tuple and tuple length */ + GinTuple *tup; + Size tuplen; + + /* there could be many entries, so be willing to abort here */ + CHECK_FOR_INTERRUPTS(); + + tup = _gin_build_tuple(buildstate, attnum, category, + key, attr->attlen, attr->attbyval, + list, nlist, &tuplen); + + tuplesort_putgintuple(buildstate->bs_worker_sort, tup, tuplen); + + pfree(tup); + } + + MemoryContextReset(buildstate->tmpCtx); + ginInitBA(&buildstate->accum); +} + /* * ginBuildCallbackParallel * Callback for the parallel index build. @@ -498,6 +540,11 @@ ginBuildCallback(Relation index, ItemPointer tid, Datum *values, * The disadvantage is increased disk space usage, possibly up to 2x, if * no entries get combined at the worker level. * + * To detect a wraparound (which can happen with sync scans), we remember the + * last TID seen by each worker - if the next TID seen by the worker is lower, + * the scan must have wrapped around. We handle that by flushing the current + * buildstate to the tuplesort, so that we don't end up with wide TID lists. + * * XXX It would be possible to partition the data into multiple tuplesorts * per worker (by hashing) - we don't need the data produced by workers * to be perfectly sorted, and we could even live with multiple entries @@ -514,6 +561,16 @@ ginBuildCallbackParallel(Relation index, ItemPointer tid, Datum *values, oldCtx = MemoryContextSwitchTo(buildstate->tmpCtx); + /* scan wrapped around - flush accumulated entries and start anew */ + if (ItemPointerCompare(tid, &buildstate->tid) < 0) + { + elog(LOG, "calling ginFlushBuildState"); + ginFlushBuildState(buildstate, index); + } + + /* remember the TID we're about to process */ + memcpy(&buildstate->tid, tid, sizeof(ItemPointerData)); + for (i = 0; i < buildstate->ginstate.origTupdesc->natts; i++) ginHeapTupleBulkInsert(buildstate, (OffsetNumber) (i + 1), values[i], isnull[i], tid); @@ -532,40 +589,7 @@ ginBuildCallbackParallel(Relation index, ItemPointer tid, Datum *values, * maintenance command. */ if (buildstate->accum.allocatedMemory >= (Size) work_mem * 1024L) - { - ItemPointerData *list; - Datum key; - GinNullCategory category; - uint32 nlist; - OffsetNumber attnum; - TupleDesc tdesc = RelationGetDescr(index); - - ginBeginBAScan(&buildstate->accum); - while ((list = ginGetBAEntry(&buildstate->accum, - &attnum, &key, &category, &nlist)) != NULL) - { - /* information about the index key */ - Form_pg_attribute attr = TupleDescAttr(tdesc, (attnum - 1)); - - /* GIN tuple and tuple length that we'll use for tuplesort */ - GinTuple *tup; - Size tuplen; - - /* there could be many entries, so be willing to abort here */ - CHECK_FOR_INTERRUPTS(); - - tup = _gin_build_tuple(buildstate, attnum, category, - key, attr->attlen, attr->attbyval, - list, nlist, &tuplen); - - tuplesort_putgintuple(buildstate->bs_worker_sort, tup, tuplen); - - pfree(tup); - } - - MemoryContextReset(buildstate->tmpCtx); - ginInitBA(&buildstate->accum); - } + ginFlushBuildState(buildstate, index); MemoryContextSwitchTo(oldCtx); } @@ -602,6 +626,7 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) buildstate.bs_numtuples = 0; buildstate.bs_reltuples = 0; buildstate.bs_leader = NULL; + memset(&buildstate.tid, 0, sizeof(ItemPointerData)); /* initialize the meta page */ MetaBuffer = GinNewBuffer(index); @@ -1231,8 +1256,8 @@ GinBufferInit(Relation index) * with too many TIDs. and 64kB seems more than enough. But maybe this * should be tied to maintenance_work_mem or something like that? * - * XXX This is not enough to prevent repeated merges after a wraparound - * of the parallel scan, but it should be enough to make the merges cheap + * XXX This is not enough to prevent repeated merges after a wraparound of + * the parallel scan, but it should be enough to make the merges cheap * because it quickly reaches the end of the second list and can just * memcpy the rest without walking it item by item. */ @@ -1964,39 +1989,7 @@ _gin_parallel_scan_and_build(GinBuildState *state, ginBuildCallbackParallel, state, scan); /* write remaining accumulated entries */ - { - ItemPointerData *list; - Datum key; - GinNullCategory category; - uint32 nlist; - OffsetNumber attnum; - TupleDesc tdesc = RelationGetDescr(index); - - ginBeginBAScan(&state->accum); - while ((list = ginGetBAEntry(&state->accum, - &attnum, &key, &category, &nlist)) != NULL) - { - /* information about the key */ - Form_pg_attribute attr = TupleDescAttr(tdesc, (attnum - 1)); - - GinTuple *tup; - Size len; - - /* there could be many entries, so be willing to abort here */ - CHECK_FOR_INTERRUPTS(); - - tup = _gin_build_tuple(state, attnum, category, - key, attr->attlen, attr->attbyval, - list, nlist, &len); - - tuplesort_putgintuple(state->bs_worker_sort, tup, len); - - pfree(tup); - } - - MemoryContextReset(state->tmpCtx); - ginInitBA(&state->accum); - } + ginFlushBuildState(state, index); /* * Do the first phase of in-worker processing - sort the data produced by @@ -2081,6 +2074,7 @@ _gin_parallel_build_main(dsm_segment *seg, shm_toc *toc) buildstate.indtuples = 0; /* XXX Shouldn't this initialize the other fields too, like ginbuild()? */ memset(&buildstate.buildStats, 0, sizeof(GinStatsData)); + memset(&buildstate.tid, 0, sizeof(ItemPointerData)); /* * create a temporary memory context that is used to hold data not yet -- 2.40.1