From b1f9e315cf40b75929431236e200fbdf5a0068c4 Mon Sep 17 00:00:00 2001
From: Jehan-Guillaume de Rorthais <jgdr@dalibo.com>
Date: Mon, 27 Mar 2023 15:54:39 +0200
Subject: [PATCH] Allocate hash batches related BufFile in a dedicated context

---
 src/backend/executor/nodeHash.c     | 43 +++++++++++++++++++++++++----
 src/backend/executor/nodeHashjoin.c | 18 ++++++++----
 src/include/executor/hashjoin.h     | 15 ++++++++--
 src/include/executor/nodeHashjoin.h |  2 +-
 4 files changed, 64 insertions(+), 14 deletions(-)

diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c
index 748c9b0024..3da83ac22a 100644
--- a/src/backend/executor/nodeHash.c
+++ b/src/backend/executor/nodeHash.c
@@ -484,7 +484,7 @@ ExecHashTableCreate(HashState *state, List *hashOperators, List *hashCollations,
 	 *
 	 * The hashtable control block is just palloc'd from the executor's
 	 * per-query memory context.  Everything else should be kept inside the
-	 * subsidiary hashCxt or batchCxt.
+	 * subsidiary hashCxt, batchCxt or fileCxt.
 	 */
 	hashtable = palloc_object(HashJoinTableData);
 	hashtable->nbuckets = nbuckets;
@@ -538,6 +538,10 @@ ExecHashTableCreate(HashState *state, List *hashOperators, List *hashCollations,
 												"HashBatchContext",
 												ALLOCSET_DEFAULT_SIZES);
 
+	hashtable->fileCxt = AllocSetContextCreate(CurrentMemoryContext,
+								 "HashBatchFiles",
+								 ALLOCSET_DEFAULT_SIZES);
+
 	/* Allocate data that will live for the life of the hashjoin */
 
 	oldcxt = MemoryContextSwitchTo(hashtable->hashCxt);
@@ -570,15 +574,21 @@ ExecHashTableCreate(HashState *state, List *hashOperators, List *hashCollations,
 
 	if (nbatch > 1 && hashtable->parallel_state == NULL)
 	{
+		MemoryContext oldctx;
+
 		/*
 		 * allocate and initialize the file arrays in hashCxt (not needed for
 		 * parallel case which uses shared tuplestores instead of raw files)
 		 */
+		oldctx = MemoryContextSwitchTo(hashtable->fileCxt);
+
 		hashtable->innerBatchFile = palloc0_array(BufFile *, nbatch);
 		hashtable->outerBatchFile = palloc0_array(BufFile *, nbatch);
 		/* The files will not be opened until needed... */
 		/* ... but make sure we have temp tablespaces established for them */
 		PrepareTempTablespaces();
+
+		MemoryContextSwitchTo(oldctx);
 	}
 
 	MemoryContextSwitchTo(oldcxt);
@@ -929,12 +939,18 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
 	nbatch = oldnbatch * 2;
 	Assert(nbatch > 1);
 
+	elog(WARNING, "increasing number of batches from %d to %d", oldnbatch, nbatch);
+
+	elog(LOG, "ExecHashIncreaseNumBatches ======= context stats start =======");
+	MemoryContextStats(TopMemoryContext);
+
+
 #ifdef HJDEBUG
 	printf("Hashjoin %p: increasing nbatch to %d because space = %zu\n",
 		   hashtable, nbatch, hashtable->spaceUsed);
 #endif
 
-	oldcxt = MemoryContextSwitchTo(hashtable->hashCxt);
+	oldcxt = MemoryContextSwitchTo(hashtable->fileCxt);
 
 	if (hashtable->innerBatchFile == NULL)
 	{
@@ -1022,9 +1038,11 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
 			{
 				/* dump it out */
 				Assert(batchno > curbatch);
+
 				ExecHashJoinSaveTuple(HJTUPLE_MINTUPLE(hashTuple),
 									  hashTuple->hashvalue,
-									  &hashtable->innerBatchFile[batchno]);
+									  &hashtable->innerBatchFile[batchno],
+									  hashtable->fileCxt);
 
 				hashtable->spaceUsed -= hashTupleSize;
 				nfreed++;
@@ -1042,6 +1060,13 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
 		oldchunks = nextchunk;
 	}
 
+	if (hashtable->fileCxt->mem_allocated > hashtable->spaceAllowed)
+		elog(NOTICE,
+			 "Growing number of hash batch to %d is exhausting allowed memory (%ld > %ld)",
+			 nbatch,
+			 hashtable->fileCxt->mem_allocated,
+			 hashtable->spaceAllowed);
+
 #ifdef HJDEBUG
 	printf("Hashjoin %p: freed %ld of %ld tuples, space now %zu\n",
 		   hashtable, nfreed, ninmemory, hashtable->spaceUsed);
@@ -1063,6 +1088,9 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
 			   hashtable);
 #endif
 	}
+
+	elog(LOG, "ExecHashIncreaseNumBatches ======= context stats end =======");
+	MemoryContextStats(TopMemoryContext);
 }
 
 /*
@@ -1681,9 +1709,11 @@ ExecHashTableInsert(HashJoinTable hashtable,
 		 * put the tuple into a temp file for later batches
 		 */
 		Assert(batchno > hashtable->curbatch);
+
 		ExecHashJoinSaveTuple(tuple,
 							  hashvalue,
-							  &hashtable->innerBatchFile[batchno]);
+							  &hashtable->innerBatchFile[batchno],
+							  hashtable->fileCxt);
 	}
 
 	if (shouldFree)
@@ -2534,8 +2564,11 @@ ExecHashRemoveNextSkewBucket(HashJoinTable hashtable)
 		{
 			/* Put the tuple into a temp file for later batches */
 			Assert(batchno > hashtable->curbatch);
+
 			ExecHashJoinSaveTuple(tuple, hashvalue,
-								  &hashtable->innerBatchFile[batchno]);
+								  &hashtable->innerBatchFile[batchno],
+								  hashtable->fileCxt);
+
 			pfree(hashTuple);
 			hashtable->spaceUsed -= tupleSize;
 			hashtable->spaceUsedSkew -= tupleSize;
diff --git a/src/backend/executor/nodeHashjoin.c b/src/backend/executor/nodeHashjoin.c
index f189fb4d28..6055abde49 100644
--- a/src/backend/executor/nodeHashjoin.c
+++ b/src/backend/executor/nodeHashjoin.c
@@ -432,8 +432,10 @@ ExecHashJoinImpl(PlanState *pstate, bool parallel)
 					 */
 					Assert(parallel_state == NULL);
 					Assert(batchno > hashtable->curbatch);
+
 					ExecHashJoinSaveTuple(mintuple, hashvalue,
-										  &hashtable->outerBatchFile[batchno]);
+										  &hashtable->outerBatchFile[batchno],
+										  hashtable->fileCxt);
 
 					if (shouldFree)
 						heap_free_minimal_tuple(mintuple);
@@ -1234,21 +1236,27 @@ ExecParallelHashJoinNewBatch(HashJoinState *hjstate)
  * The data recorded in the file for each tuple is its hash value,
  * then the tuple in MinimalTuple format.
  *
- * Note: it is important always to call this in the regular executor
- * context, not in a shorter-lived context; else the temp file buffers
- * will get messed up.
+ * Note: it is important always to call this in the HashBatchFiles context,
+ * not in a shorter-lived context; else the temp file buffers will get messed
+ * up.
  */
 void
 ExecHashJoinSaveTuple(MinimalTuple tuple, uint32 hashvalue,
-					  BufFile **fileptr)
+					  BufFile **fileptr, MemoryContext filecxt)
 {
 	BufFile    *file = *fileptr;
 
 	if (file == NULL)
 	{
+		MemoryContext oldctx;
+
+		oldctx = MemoryContextSwitchTo(filecxt);
+
 		/* First write to this batch file, so open it. */
 		file = BufFileCreateTemp(false);
 		*fileptr = file;
+
+		MemoryContextSwitchTo(oldctx);
 	}
 
 	BufFileWrite(file, &hashvalue, sizeof(uint32));
diff --git a/src/include/executor/hashjoin.h b/src/include/executor/hashjoin.h
index acb7592ca0..d759235d7f 100644
--- a/src/include/executor/hashjoin.h
+++ b/src/include/executor/hashjoin.h
@@ -25,10 +25,14 @@
  *
  * Each active hashjoin has a HashJoinTable control block, which is
  * palloc'd in the executor's per-query context.  All other storage needed
- * for the hashjoin is kept in private memory contexts, two for each hashjoin.
+ * for the hashjoin is kept in private memory contexts, three for each
+ * hashjoin:
+ * - HashTableContext (hashCxt): the control block associated to the hash table
+ * - HashBatchContext (batchCxt): storages for batches
+ * - HashBatchFiles (fileCxt): storage for temp files buffers
+ *
  * This makes it easy and fast to release the storage when we don't need it
- * anymore.  (Exception: data associated with the temp files lives in the
- * per-query context too, since we always call buffile.c in that context.)
+ * anymore.
  *
  * The hashtable contexts are made children of the per-query context, ensuring
  * that they will be discarded at end of statement even if the join is
@@ -39,6 +43,10 @@
  * "hashCxt", while storage that is only wanted for the current batch is
  * allocated in the "batchCxt".  By resetting the batchCxt at the end of
  * each batch, we free all the per-batch storage reliably and without tedium.
+ * Note that data associated with the temp files lives in the "fileCxt" context
+ * which lives during the entire join as temp files might need to survives
+ * batches. These files are explicitly destroyed by calling BufFileClose()
+ * when the code is done with them.
  *
  * During first scan of inner relation, we get its tuples from executor.
  * If nbatch > 1 then tuples that don't belong in first batch get saved
@@ -348,6 +356,7 @@ typedef struct HashJoinTableData
 
 	MemoryContext hashCxt;		/* context for whole-hash-join storage */
 	MemoryContext batchCxt;		/* context for this-batch-only storage */
+	MemoryContext fileCxt;		/* context for the BufFile related storage */
 
 	/* used for dense allocation of tuples (into linked chunks) */
 	HashMemoryChunk chunks;		/* one list for the whole batch */
diff --git a/src/include/executor/nodeHashjoin.h b/src/include/executor/nodeHashjoin.h
index d367070883..a8f9ae1989 100644
--- a/src/include/executor/nodeHashjoin.h
+++ b/src/include/executor/nodeHashjoin.h
@@ -29,6 +29,6 @@ extern void ExecHashJoinInitializeWorker(HashJoinState *state,
 										 ParallelWorkerContext *pwcxt);
 
 extern void ExecHashJoinSaveTuple(MinimalTuple tuple, uint32 hashvalue,
-								  BufFile **fileptr);
+								  BufFile **fileptr, MemoryContext filecxt);
 
 #endif							/* NODEHASHJOIN_H */
-- 
2.39.2