From dfe964ae6b9d8bbb58143ed3ebd74ccb44cb340e Mon Sep 17 00:00:00 2001 From: Tomas Vondra Date: Sun, 26 Oct 2025 21:23:37 +0100 Subject: [PATCH v3 2/3] Split TID lists during parallel GIN build When building intermediate TID lists during parallel GIN builds, split the sorted lists into smaller chunks, to limit the amount of memory needed when merging the chunks later. The leader may need to keep in memory up to one chunk per worker, and possibly one extra chunk (before evicting some of the data). We limit the chunk size so that memory usage does not exceed MaxAllocSize (1GB). This is desirable even with huge allocations allowed. Larger chunks do not improve performance, so the increased memory usage is useless. Report by Greg Smith, investigation and fix by me. Backpatched to 18, where parallel GIN builds were introduced. Reported-by: Gregory Smith Discussion: https://postgr.es/m/CAHLJuCWDwn-PE2BMZE4Kux7x5wWt_6RoWtA0mUQffEDLeZ6sfA@mail.gmail.com Backpatch-through: 18 --- src/backend/access/gin/gininsert.c | 48 +++++++++++++++++++++++------- 1 file changed, 37 insertions(+), 11 deletions(-) diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index 2355b96b351..d15e9a0cb0b 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -152,7 +152,9 @@ typedef struct * only in the leader process. */ GinLeader *bs_leader; - int bs_worker_id; + + /* number of participating workers (including leader) */ + int bs_num_workers; /* used to pass information from workers to leader */ double bs_numtuples; @@ -483,6 +485,11 @@ ginBuildCallback(Relation index, ItemPointer tid, Datum *values, /* * ginFlushBuildState * Write all data from BuildAccumulator into the tuplesort. + * + * The number of TIDs written to the tuplesort at once is limited, to reduce + * the amount of memory needed when merging the intermediate results later. 
+ * The leader will see up to two chunks per worker, so calculate the limit to + * not need more than MaxAllocSize overall. */ static void ginFlushBuildState(GinBuildState *buildstate, Relation index) @@ -493,6 +500,11 @@ ginFlushBuildState(GinBuildState *buildstate, Relation index) uint32 nlist; OffsetNumber attnum; TupleDesc tdesc = RelationGetDescr(index); + uint32 maxlen; + + /* maximum number of TIDs per chunk (two chunks per worker) */ + maxlen = MaxAllocSize / sizeof(ItemPointerData); + maxlen /= (2 * buildstate->bs_num_workers); ginBeginBAScan(&buildstate->accum); while ((list = ginGetBAEntry(&buildstate->accum, @@ -501,20 +513,31 @@ ginFlushBuildState(GinBuildState *buildstate, Relation index) /* information about the key */ CompactAttribute *attr = TupleDescCompactAttr(tdesc, (attnum - 1)); - /* GIN tuple and tuple length */ - GinTuple *tup; - Size tuplen; + /* start of the chunk */ + uint32 offset = 0; - /* there could be many entries, so be willing to abort here */ - CHECK_FOR_INTERRUPTS(); + /* split the entry into smaller chunk with up to maxlen items */ + while (offset < nlist) + { + /* GIN tuple and tuple length */ + GinTuple *tup; + Size tuplen; + uint32 len = Min(maxlen, nlist - offset); - tup = _gin_build_tuple(attnum, category, - key, attr->attlen, attr->attbyval, - list, nlist, &tuplen); + /* there could be many entries, so be willing to abort here */ + CHECK_FOR_INTERRUPTS(); + + tup = _gin_build_tuple(attnum, category, + key, attr->attlen, attr->attbyval, + &list[offset], len, + &tuplen); + + offset += maxlen; - tuplesort_putgintuple(buildstate->bs_worker_sort, tup, tuplen); + tuplesort_putgintuple(buildstate->bs_worker_sort, tup, tuplen); - pfree(tup); + pfree(tup); + } } MemoryContextReset(buildstate->tmpCtx); @@ -2018,6 +2041,9 @@ _gin_parallel_scan_and_build(GinBuildState *state, /* remember how much space is allowed for the accumulated entries */ state->work_mem = (sortmem / 2); + /* remember how many workers participate in the build */ 
+ state->bs_num_workers = ginshared->scantuplesortstates; + /* Begin "partial" tuplesort */ state->bs_sortstate = tuplesort_begin_index_gin(heap, index, state->work_mem, -- 2.51.0