From dfe964ae6b9d8bbb58143ed3ebd74ccb44cb340e Mon Sep 17 00:00:00 2001 From: Tomas Vondra Date: Sun, 26 Oct 2025 21:23:37 +0100 Subject: [PATCH v3 2/3] Split TID lists during parallel GIN build When building intermediate TID lists during parallel GIN builds, split the sorted lists into smaller chunks, to limit the amount of memory needed when merging the chunks later. The leader may need to keep in memory up to one chunk per worker, and possibly one extra chunk (before evicting some of the data). We limit the chunk size so that memory usage does not exceed MaxAllocSize (1GB). This is desirable even with huge allocations allowed. Larger chunks do not improve performance, so the increased memory usage is useless. Report by Greg Smith, investigation and fix by me. Backpatched to 18, where parallel GIN builds were introduced. Reported-by: Gregory Smith Discussion: https://postgr.es/m/CAHLJuCWDwn-PE2BMZE4Kux7x5wWt_6RoWtA0mUQffEDLeZ6sfA@mail.gmail.com Backpatch-through: 18 --- src/backend/access/gin/gininsert.c | 48 +++++++++++++++++++++++------- 1 file changed, 37 insertions(+), 11 deletions(-) diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index 2355b96b351..d15e9a0cb0b 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -152,7 +152,9 @@ typedef struct * only in the leader process. */ GinLeader *bs_leader; - int bs_worker_id; + + /* number of participating workers (including leader) */ + int bs_num_workers; /* used to pass information from workers to leader */ double bs_numtuples; @@ -483,6 +485,11 @@ ginBuildCallback(Relation index, ItemPointer tid, Datum *values, /* * ginFlushBuildState * Write all data from BuildAccumulator into the tuplesort. + * + * The number of TIDs written to the tuplesort at once is limited, to reduce + * the amount of memory needed when merging the intermediate results later. 
+ * The leader will see up to two chunks per worker, so calculate the limit to + * not need more than MaxAllocSize overall. */ static void ginFlushBuildState(GinBuildState *buildstate, Relation index) @@ -493,6 +500,11 @@ ginFlushBuildState(GinBuildState *buildstate, Relation index) uint32 nlist; OffsetNumber attnum; TupleDesc tdesc = RelationGetDescr(index); + uint32 maxlen; + + /* maximum number of TIDs per chunk (two chunks per worker) */ + maxlen = MaxAllocSize / sizeof(ItemPointerData); + maxlen /= (2 * buildstate->bs_num_workers); ginBeginBAScan(&buildstate->accum); while ((list = ginGetBAEntry(&buildstate->accum, @@ -501,20 +513,31 @@ ginFlushBuildState(GinBuildState *buildstate, Relation index) /* information about the key */ CompactAttribute *attr = TupleDescCompactAttr(tdesc, (attnum - 1)); - /* GIN tuple and tuple length */ - GinTuple *tup; - Size tuplen; + /* start of the chunk */ + uint32 offset = 0; - /* there could be many entries, so be willing to abort here */ - CHECK_FOR_INTERRUPTS(); + /* split the entry into smaller chunk with up to maxlen items */ + while (offset < nlist) + { + /* GIN tuple and tuple length */ + GinTuple *tup; + Size tuplen; + uint32 len = Min(maxlen, nlist - offset); - tup = _gin_build_tuple(attnum, category, - key, attr->attlen, attr->attbyval, - list, nlist, &tuplen); + /* there could be many entries, so be willing to abort here */ + CHECK_FOR_INTERRUPTS(); + + tup = _gin_build_tuple(attnum, category, + key, attr->attlen, attr->attbyval, + &list[offset], len, + &tuplen); + + offset += maxlen; - tuplesort_putgintuple(buildstate->bs_worker_sort, tup, tuplen); + tuplesort_putgintuple(buildstate->bs_worker_sort, tup, tuplen); - pfree(tup); + pfree(tup); + } } MemoryContextReset(buildstate->tmpCtx); @@ -2018,6 +2041,9 @@ _gin_parallel_scan_and_build(GinBuildState *state, /* remember how much space is allowed for the accumulated entries */ state->work_mem = (sortmem / 2); + /* remember how many workers participate in the build */ 
+ state->bs_num_workers = ginshared->scantuplesortstates; + /* Begin "partial" tuplesort */ state->bs_sortstate = tuplesort_begin_index_gin(heap, index, state->work_mem, -- 2.51.0