From 7e5d82b65006523702221bb1c7c9e1d079781cc9 Mon Sep 17 00:00:00 2001
From: Melanie Plageman
Date: Sun, 29 Dec 2019 18:56:42 -0800
Subject: [PATCH v5 1/5] Implement Adaptive Hashjoin

Serial Hashloop Fallback:

"Chunk" the inner file into arbitrary partitions of work_mem size, offset along tuple bounds, while loading the batch into the hashtable. Note that this makes it impossible to increase nbatches during the loading of batches after initial hashtable creation. In preparation for this chunking, separate "advance batch" and "load batch".

Implement outer tuple batch rewinding per chunk of the inner batch. This would be a simple rewind and replay of the outer side for each chunk of the inner side if it weren't for LOJ: we must wait to emit NULL-extended tuples for LOJ until all chunks of the inner side have been processed.

To do this without incurring additional memory pressure, use a temporary BufFile to capture the match status of each outer-side tuple, with one bit per tuple. Since, for parallel-oblivious hashjoin, the outer-side tuples are encountered in a deterministic order, synchronizing the outer tuple match status file with the outer tuples in the batch file--to decide which ones to emit NULL-extended--is easy and can be done with a simple counter. In the non-hashloop-fallback scenario (including batch 0), this file is not created, and unmatched outer tuples are emitted as they are encountered.

Parallel Hashloop Fallback:

During initial allocation of the hashtable, each time the number of batches is increased, a new variable in the ParallelHashJoinState, batch_increases, is incremented. In PHJ_GROW_BATCHES_DECIDING, if pstate->batch_increases >= 2, parallel_hashloop_fallback will be enabled for qualifying batches. From then on, if a batch is still too large to fit into the space_allowed, parallel_hashloop_fallback is set on that batch. It will not be allowed to divide further and, during execution, the fallback strategy will be used.

For a batch which has parallel_hashloop_fallback set, tuples inserted into the batch's inner and outer batch files carry an additional piece of metadata (besides the hashvalue). For the inner side, this additional metadata is the chunk number; for the outer side, it is the tuple identifier--needed when rescanning the outer-side batch file for each chunk of the inner side.

During execution of a parallel hashjoin batch which needs to fall back, each worker will create an "outer match status file" containing a bitmap that tracks which outer tuples have matched an inner tuple. All bits in the worker's outer match status file are initially unset. During probing, the worker sets the corresponding bit (the bit at the index of the tuple identifier) in the outer match status bitmap for an outer tuple which matches any inner tuple.

Workers probing a fallback batch will wait until all workers have finished probing, so that an elected worker can read and combine the outer match status files into a single bitmap and use it to emit unmatched outer tuples after all chunks of the inner side have been processed.
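As a rough illustration of the serial fallback's bookkeeping described above, here is a minimal, self-contained sketch of the one-bit-per-tuple match-status scheme. It uses an in-memory array where the patch uses a temporary BufFile, and all names (set_match, is_match, MAX_TUPLES) are illustrative rather than taken from the patch:

    #include <stdio.h>

    #define MAX_TUPLES 64

    /* one bit of match status per outer tuple, initially all unset */
    static unsigned char match_bits[(MAX_TUPLES + 7) / 8];

    /* Record that the outer tuple with 1-based counter n matched. */
    static void
    set_match(long n)
    {
        match_bits[(n - 1) / 8] |= (unsigned char) (1 << ((n - 1) % 8));
    }

    /* After the last inner chunk: did tuple n match any inner tuple? */
    static int
    is_match(long n)
    {
        return (match_bits[(n - 1) / 8] >> ((n - 1) % 8)) & 1;
    }

    int
    main(void)
    {
        set_match(1);
        set_match(10);
        for (long n = 1; n <= 12; n++)
            if (!is_match(n))
                printf("tuple %ld unmatched: emit NULL-extended\n", n);
        return 0;
    }

Because the outer batch file is replayed in the same order for every inner chunk, the same counter value identifies the same tuple on every pass, which is what lets a flat bit index stand in for a per-tuple key.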
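Similarly, the parallel fallback's final combining step reduces to a byte-wise OR of the per-worker bitmaps. The following is a hedged in-memory analogue of what the elected worker does when combining the outer match status files; the function name and arguments are hypothetical, and in the patch the inputs are per-worker temporary files read through buffile.c rather than byte arrays:

    #include <stddef.h>
    #include <string.h>

    /* OR together nworkers per-worker match-status bitmaps. */
    static void
    combine_match_bitmaps(unsigned char *combined,
                          unsigned char *const *worker_bits,
                          int nworkers, size_t nbytes)
    {
        memset(combined, 0, nbytes);
        for (int w = 0; w < nworkers; w++)
            for (size_t i = 0; i < nbytes; i++)
                combined[i] |= worker_bits[w][i];
    }

Any outer tuple whose bit is still zero in the combined bitmap after all workers' files have been merged is unmatched and is emitted NULL-extended.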
--- src/backend/executor/Makefile | 1 + src/backend/executor/adaptiveHashjoin.c | 349 +++++ src/backend/executor/nodeHash.c | 127 +- src/backend/executor/nodeHashjoin.c | 1202 +++++++++++----- src/backend/postmaster/pgstat.c | 21 + src/backend/storage/file/buffile.c | 65 + src/backend/storage/ipc/barrier.c | 85 ++ src/backend/utils/sort/sharedtuplestore.c | 133 ++ src/include/executor/adaptiveHashjoin.h | 9 + src/include/executor/hashjoin.h | 28 +- src/include/executor/nodeHash.h | 5 +- src/include/executor/tuptable.h | 3 +- src/include/nodes/execnodes.h | 17 + src/include/pgstat.h | 8 + src/include/storage/barrier.h | 1 + src/include/storage/buffile.h | 3 + src/include/storage/lwlock.h | 1 + src/include/utils/sharedtuplestore.h | 22 + src/test/regress/expected/adaptive_hj.out | 1233 +++++++++++++++++ .../regress/expected/parallel_adaptive_hj.out | 343 +++++ src/test/regress/parallel_schedule | 2 +- src/test/regress/post_schedule | 8 + src/test/regress/pre_schedule | 120 ++ src/test/regress/serial_schedule | 2 + src/test/regress/sql/adaptive_hj.sql | 240 ++++ src/test/regress/sql/parallel_adaptive_hj.sql | 182 +++ 26 files changed, 3841 insertions(+), 369 deletions(-) create mode 100644 src/backend/executor/adaptiveHashjoin.c create mode 100644 src/include/executor/adaptiveHashjoin.h create mode 100644 src/test/regress/expected/adaptive_hj.out create mode 100644 src/test/regress/expected/parallel_adaptive_hj.out create mode 100644 src/test/regress/post_schedule create mode 100644 src/test/regress/pre_schedule create mode 100644 src/test/regress/sql/adaptive_hj.sql create mode 100644 src/test/regress/sql/parallel_adaptive_hj.sql diff --git a/src/backend/executor/Makefile b/src/backend/executor/Makefile index a983800e4b..54799d7644 100644 --- a/src/backend/executor/Makefile +++ b/src/backend/executor/Makefile @@ -46,6 +46,7 @@ OBJS = \ nodeGroup.o \ nodeHash.o \ nodeHashjoin.o \ + adaptiveHashjoin.o \ nodeIndexonlyscan.o \ nodeIndexscan.o \ nodeLimit.o \ diff --git a/src/backend/executor/adaptiveHashjoin.c b/src/backend/executor/adaptiveHashjoin.c new file mode 100644 index 0000000000..dff5b38d38 --- /dev/null +++ b/src/backend/executor/adaptiveHashjoin.c @@ -0,0 +1,349 @@ +#include "postgres.h" + +#include "access/htup_details.h" +#include "access/parallel.h" +#include "executor/executor.h" +#include "executor/hashjoin.h" +#include "executor/nodeHash.h" +#include "executor/nodeHashjoin.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "utils/memutils.h" +#include "utils/sharedtuplestore.h" + +#include "executor/adaptiveHashjoin.h" + + + + +bool +ExecParallelHashJoinNewChunk(HashJoinState *hjstate, bool advance_from_probing) +{ + HashJoinTable hashtable; + int batchno; + ParallelHashJoinBatch *phj_batch; + SharedTuplestoreAccessor *outer_tuples; + SharedTuplestoreAccessor *inner_tuples; + Barrier *chunk_barrier; + + hashtable = hjstate->hj_HashTable; + batchno = hashtable->curbatch; + phj_batch = hashtable->batches[batchno].shared; + outer_tuples = hashtable->batches[batchno].outer_tuples; + inner_tuples = hashtable->batches[batchno].inner_tuples; + + /* + * This chunk_barrier is initialized in the ELECTING phase when this + * worker attached to the batch in ExecParallelHashJoinNewBatch() + */ + chunk_barrier = &hashtable->batches[batchno].shared->chunk_barrier; + + /* + * If this worker just came from probing (from HJ_SCAN_BUCKET) we need to + * advance the chunk number here. Otherwise this worker isn't attached yet + * to the chunk barrier. 
+ */ + if (advance_from_probing) + { + /* + * The current chunk number can't be incremented if *any* worker isn't + * done yet (otherwise they might access the wrong data structure!) + */ + if (BarrierArriveAndWait(chunk_barrier, + WAIT_EVENT_HASH_CHUNK_PROBING)) + phj_batch->current_chunk_num++; + + /* Once the barrier is advanced we'll be in the DONE phase */ + } + else + BarrierAttach(chunk_barrier); + + /* + * The outer side is exhausted and either 1) the current chunk of the + * inner side is exhausted and it is time to advance the chunk 2) the last + * chunk of the inner side is exhausted and it is time to advance the + * batch + */ + switch (BarrierPhase(chunk_barrier)) + { + /* + * TODO: remove this phase and coordinate access to hashtable + * above goto and after incrementing current_chunk_num + */ + case PHJ_CHUNK_ELECTING: + phj_chunk_electing: + BarrierArriveAndWait(chunk_barrier, + WAIT_EVENT_HASH_CHUNK_ELECTING); + /* Fall through. */ + + case PHJ_CHUNK_LOADING: + /* Start (or join in) loading the next chunk of inner tuples. */ + sts_begin_parallel_scan(inner_tuples); + + MinimalTuple tuple; + tupleMetadata metadata; + + while ((tuple = sts_parallel_scan_next(inner_tuples, &metadata))) + { + if (metadata.tupleid != phj_batch->current_chunk_num) + continue; + + ExecForceStoreMinimalTuple(tuple, + hjstate->hj_HashTupleSlot, + false); + + ExecParallelHashTableInsertCurrentBatch( + hashtable, + hjstate->hj_HashTupleSlot, + metadata.hashvalue); + } + sts_end_parallel_scan(inner_tuples); + BarrierArriveAndWait(chunk_barrier, + WAIT_EVENT_HASH_CHUNK_LOADING); + /* Fall through. */ + + case PHJ_CHUNK_PROBING: + sts_begin_parallel_scan(outer_tuples); + return true; + + case PHJ_CHUNK_DONE: + + BarrierArriveAndWait(chunk_barrier, WAIT_EVENT_HASH_CHUNK_DONE); + + if (phj_batch->current_chunk_num > phj_batch->total_num_chunks) + { + BarrierDetach(chunk_barrier); + return false; + } + + /* + * Otherwise it is time for the next chunk. One worker should + * reset the hashtable + */ + if (BarrierArriveExplicitAndWait(chunk_barrier, PHJ_CHUNK_ELECTING, WAIT_EVENT_HASH_ADVANCE_CHUNK)) + { + /* + * rewind/reset outer tuplestore and rewind outer match status + * files + */ + sts_reinitialize(outer_tuples); + + /* + * reset inner's hashtable and recycle the existing bucket + * array. + */ + dsa_pointer_atomic *buckets = (dsa_pointer_atomic *) + dsa_get_address(hashtable->area, phj_batch->buckets); + + for (size_t i = 0; i < hashtable->nbuckets; ++i) + dsa_pointer_atomic_write(&buckets[i], InvalidDsaPointer); + + /* + * TODO: this will unfortunately rescan all inner tuples in + * the batch for each chunk + */ + + /* + * should be able to save the block in the file which starts + * the next chunk instead + */ + sts_reinitialize(inner_tuples); + } + goto phj_chunk_electing; + + case PHJ_CHUNK_FINAL: + BarrierDetach(chunk_barrier); + return false; + + default: + elog(ERROR, "unexpected chunk phase %d. pid %i. batch %i.", + BarrierPhase(chunk_barrier), MyProcPid, batchno); + } + + return false; +} + + +/* + * Choose a batch to work on, and attach to it. Returns true if successful, + * false if there are no more batches. + */ +bool +ExecParallelHashJoinNewBatch(HashJoinState *hjstate) +{ + HashJoinTable hashtable = hjstate->hj_HashTable; + int start_batchno; + int batchno; + + /* + * If we started up so late that the batch tracking array has been freed + * already by ExecHashTableDetach(), then we are finished. See also + * ExecParallelHashEnsureBatchAccessors(). 
+ */ + if (hashtable->batches == NULL) + return false; + + /* + * For hashloop fallback only. Only the elected worker who was chosen to + * combine the outer match status bitmaps should reach here. This worker + * must do some final cleanup and then detach from the batch. + */ + if (hjstate->combined_bitmap != NULL) + { + BufFileClose(hjstate->combined_bitmap); + hjstate->combined_bitmap = NULL; + hashtable->batches[hashtable->curbatch].done = true; + ExecHashTableDetachBatch(hashtable); + } + + /* + * If we were already attached to a batch, remember not to bother checking + * it again, and detach from it (possibly freeing the hash table if we are + * last to detach). curbatch is set when the batch_barrier phase is either + * PHJ_BATCH_LOADING or PHJ_BATCH_CHUNKING (note that the + * PHJ_BATCH_LOADING case will fall through to the PHJ_BATCH_CHUNKING + * case). The PHJ_BATCH_CHUNKING case returns to the caller. So when this + * function is reentered with a curbatch >= 0 then we must be done + * probing. + */ + if (hashtable->curbatch >= 0) + { + ParallelHashJoinBatchAccessor *accessor = hashtable->batches + hashtable->curbatch; + ParallelHashJoinBatch *batch = accessor->shared; + + /* + * End the parallel scan on the outer tuples before we arrive at the + * next barrier so that the last worker to arrive at that barrier can + * reinitialize the SharedTuplestore for another parallel scan. + */ + + if (!batch->parallel_hashloop_fallback) + BarrierArriveAndWait(&batch->batch_barrier, + WAIT_EVENT_HASH_BATCH_PROBING); + else + { + sts_close_outer_match_status_file(accessor->outer_tuples); + + /* + * If all workers (including this one) have finished probing the + * batch, one worker is elected to combine the outer match status + * files from all workers that were attached to this batch into a + * single bitmap. Using that bitmap, it then loops through the + * outer batch file again and emits the unmatched tuples. + */ + + if (BarrierArriveAndWait(&batch->batch_barrier, + WAIT_EVENT_HASH_BATCH_PROBING)) + { + hjstate->combined_bitmap = sts_combine_outer_match_status_files(accessor->outer_tuples); + hjstate->last_worker = true; + return true; + } + } + + /* the elected combining worker should not reach here */ + hashtable->batches[hashtable->curbatch].done = true; + ExecHashTableDetachBatch(hashtable); + } + + /* + * Search for a batch that isn't done. We use an atomic counter to start + * our search at a different batch in every participant when there are + * more batches than participants. + */ + batchno = start_batchno = + pg_atomic_fetch_add_u32(&hashtable->parallel_state->distributor, 1) % + hashtable->nbatch; + + do + { + if (!hashtable->batches[batchno].done) + { + Barrier *batch_barrier = + &hashtable->batches[batchno].shared->batch_barrier; + + switch (BarrierAttach(batch_barrier)) + { + case PHJ_BATCH_ELECTING: + /* One backend allocates the hash table. */ + if (BarrierArriveAndWait(batch_barrier, + WAIT_EVENT_HASH_BATCH_ELECTING)) + { + ExecParallelHashTableAlloc(hashtable, batchno); + Barrier *chunk_barrier = + &hashtable->batches[batchno].shared->chunk_barrier; + + BarrierInit(chunk_barrier, 0); + hashtable->batches[batchno].shared->current_chunk_num = 1; + } + /* Fall through. */ + + case PHJ_BATCH_ALLOCATING: + /* Wait for allocation to complete. */ + BarrierArriveAndWait(batch_barrier, + WAIT_EVENT_HASH_BATCH_ALLOCATING); + /* Fall through.
*/ + + case PHJ_BATCH_CHUNKING: + + /* + * This batch is ready to probe. Return control to + * caller. We stay attached to batch_barrier so that the + * hash table stays alive until everyone's finished + * probing it, but no participant is allowed to wait at + * this barrier again (or else a deadlock could occur). + * All attached participants must eventually call + * BarrierArriveAndDetach() so that the final phase + * PHJ_BATCH_DONE can be reached. + */ + ExecParallelHashTableSetCurrentBatch(hashtable, batchno); + + if (batchno == 0) + sts_begin_parallel_scan(hashtable->batches[batchno].outer_tuples); + + /* + * Create an outer match status file for this batch, for + * this worker. The file must be readable by all of the + * workers, but it is written only by this worker. + */ + if (hashtable->batches[batchno].shared->parallel_hashloop_fallback) + sts_make_outer_match_status_file(hashtable->batches[batchno].outer_tuples); + + return true; + + case PHJ_BATCH_OUTER_MATCH_STATUS_PROCESSING: + + /* + * The batch isn't done but this worker can't contribute + * anything to it so it might as well be done from this + * worker's perspective. (Only one worker can do work in + * this phase). + */ + + /* Fall through. */ + + case PHJ_BATCH_DONE: + + /* + * Already done. Detach and go around again (if any + * remain). + */ + BarrierDetach(batch_barrier); + + hashtable->batches[batchno].done = true; + hashtable->curbatch = -1; + break; + + default: + elog(ERROR, "unexpected batch phase %d. pid %i. batchno %i.", + BarrierPhase(batch_barrier), MyProcPid, batchno); + } + } + batchno = (batchno + 1) % hashtable->nbatch; + } while (batchno != start_batchno); + + return false; +} diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c index b6d5084908..c5420b169e 100644 --- a/src/backend/executor/nodeHash.c +++ b/src/backend/executor/nodeHash.c @@ -588,7 +588,7 @@ ExecHashTableCreate(HashState *state, List *hashOperators, List *hashCollations, * Attach to the build barrier. The corresponding detach operation is * in ExecHashTableDetach. Note that we won't attach to the * batch_barrier for batch 0 yet. We'll attach later and start it out - * in PHJ_BATCH_PROBING phase, because batch 0 is allocated up front + * in PHJ_BATCH_CHUNKING phase, because batch 0 is allocated up front * and then loaded while hashing (the standard hybrid hash join * algorithm), and we'll coordinate that using build_barrier. */ @@ -1061,6 +1061,9 @@ ExecParallelHashIncreaseNumBatches(HashJoinTable hashtable) int i; Assert(BarrierPhase(&pstate->build_barrier) == PHJ_BUILD_HASHING_INNER); + LWLockAcquire(&pstate->lock, LW_EXCLUSIVE); + pstate->batch_increases++; + LWLockRelease(&pstate->lock); /* * It's unlikely, but we need to be prepared for new participants to show @@ -1216,11 +1219,17 @@ ExecParallelHashIncreaseNumBatches(HashJoinTable hashtable) { bool space_exhausted = false; bool extreme_skew_detected = false; + bool excessive_batch_num_increases = false; /* Make sure that we have the current dimensions and buckets. */ ExecParallelHashEnsureBatchAccessors(hashtable); ExecParallelHashTableSetCurrentBatch(hashtable, 0); + LWLockAcquire(&pstate->lock, LW_EXCLUSIVE); + if (pstate->batch_increases >= 2) + excessive_batch_num_increases = true; + LWLockRelease(&pstate->lock); + /* Are any of the new generation of batches exhausted?
*/ for (i = 0; i < hashtable->nbatch; ++i) { @@ -1233,6 +1242,36 @@ ExecParallelHashIncreaseNumBatches(HashJoinTable hashtable) space_exhausted = true; + /* + * Only once we've increased the number of batches many + * times overall should we start setting some batches to + * use the fallback strategy. Batches that are still too + * big then have parallel_hashloop_fallback set, and we + * had better not repartition again (growth should be + * disabled) so that we don't overwrite this value. Note + * that work_mem-sized chunks are still marked in batches + * even when we don't fall back, which is useful for + * seeing how many chunks a batch reaches before the + * fallback flag is set. + */ + if (excessive_batch_num_increases == true) + batch->parallel_hashloop_fallback = true; + /* * Did this batch receive ALL of the tuples from its * parent batch? That would indicate that further @@ -1248,6 +1287,8 @@ ExecParallelHashIncreaseNumBatches(HashJoinTable hashtable) /* Don't keep growing if it's not helping or we'd overflow. */ if (extreme_skew_detected || hashtable->nbatch >= INT_MAX / 2) pstate->growth = PHJ_GROWTH_DISABLED; + else if (excessive_batch_num_increases && space_exhausted) + pstate->growth = PHJ_GROWTH_DISABLED; else if (space_exhausted) pstate->growth = PHJ_GROWTH_NEED_MORE_BATCHES; else @@ -1315,9 +1356,27 @@ ExecParallelHashRepartitionFirst(HashJoinTable hashtable) MAXALIGN(HJTUPLE_OVERHEAD + tuple->t_len); /* It belongs in a later batch. */ + ParallelHashJoinBatch *phj_batch = hashtable->batches[batchno].shared; + + LWLockAcquire(&phj_batch->lock, LW_EXCLUSIVE); + /* TODO: should I check batch estimated size here at all? */ + if (phj_batch->parallel_hashloop_fallback == true && (phj_batch->estimated_chunk_size + tuple_size > hashtable->parallel_state->space_allowed)) + { + phj_batch->total_num_chunks++; + phj_batch->estimated_chunk_size = tuple_size; + } + else + phj_batch->estimated_chunk_size += tuple_size; + + tupleMetadata metadata; + + metadata.hashvalue = hashTuple->hashvalue; + metadata.tupleid = phj_batch->total_num_chunks; + LWLockRelease(&phj_batch->lock); + hashtable->batches[batchno].estimated_size += tuple_size; sts_puttuple(hashtable->batches[batchno].inner_tuples, - &hashTuple->hashvalue, tuple); + &metadata, tuple); } /* Count this tuple. */ @@ -1369,12 +1428,15 @@ ExecParallelHashRepartitionRest(HashJoinTable hashtable) /* Scan one partition from the previous generation. */ sts_begin_parallel_scan(old_inner_tuples[i]); - while ((tuple = sts_parallel_scan_next(old_inner_tuples[i], &hashvalue))) + tupleMetadata metadata; + + while ((tuple = sts_parallel_scan_next(old_inner_tuples[i], &metadata))) { size_t tuple_size = MAXALIGN(HJTUPLE_OVERHEAD + tuple->t_len); int bucketno; int batchno; + hashvalue = metadata.hashvalue; /* Decide which partition it goes to in the new generation. */ ExecHashGetBucketAndBatch(hashtable, hashvalue, &bucketno, &batchno); @@ -1383,10 +1445,27 @@ ExecParallelHashRepartitionRest(HashJoinTable hashtable) ++hashtable->batches[batchno].ntuples; ++hashtable->batches[i].old_ntuples; + ParallelHashJoinBatch *phj_batch = hashtable->batches[batchno].shared; + + LWLockAcquire(&phj_batch->lock, LW_EXCLUSIVE); + /* TODO: should I check batch estimated size here at all?
*/ + if (phj_batch->parallel_hashloop_fallback == true && (phj_batch->estimated_chunk_size + tuple_size > pstate->space_allowed)) + { + phj_batch->total_num_chunks++; + phj_batch->estimated_chunk_size = tuple_size; + } + else + phj_batch->estimated_chunk_size += tuple_size; + metadata.tupleid = phj_batch->total_num_chunks; + LWLockRelease(&phj_batch->lock); /* Store the tuple in its new batch. */ sts_puttuple(hashtable->batches[batchno].inner_tuples, - &hashvalue, tuple); + &metadata, tuple); + /* + * TODO: should I zero out metadata here to make sure old values + * aren't reused? + */ CHECK_FOR_INTERRUPTS(); } sts_end_parallel_scan(old_inner_tuples[i]); @@ -1719,6 +1798,7 @@ retry: size_t tuple_size = MAXALIGN(HJTUPLE_OVERHEAD + tuple->t_len); Assert(batchno > 0); + ParallelHashJoinState *pstate = hashtable->parallel_state; /* Try to preallocate space in the batch if necessary. */ if (hashtable->batches[batchno].preallocated < tuple_size) @@ -1729,7 +1809,31 @@ Assert(hashtable->batches[batchno].preallocated >= tuple_size); hashtable->batches[batchno].preallocated -= tuple_size; - sts_puttuple(hashtable->batches[batchno].inner_tuples, &hashvalue, + ParallelHashJoinBatch *phj_batch = hashtable->batches[batchno].shared; + + LWLockAcquire(&phj_batch->lock, LW_EXCLUSIVE); + + /* TODO: should batch estimated size be considered here? */ + + /* + * TODO: should this be done in + * ExecParallelHashTableInsertCurrentBatch instead? + */ + if (phj_batch->parallel_hashloop_fallback == true && (phj_batch->estimated_chunk_size + tuple_size > pstate->space_allowed)) + { + phj_batch->total_num_chunks++; + phj_batch->estimated_chunk_size = tuple_size; + } + else + phj_batch->estimated_chunk_size += tuple_size; + + tupleMetadata metadata; + + metadata.hashvalue = hashvalue; + metadata.tupleid = phj_batch->total_num_chunks; + LWLockRelease(&phj_batch->lock); + + sts_puttuple(hashtable->batches[batchno].inner_tuples, &metadata, tuple); } ++hashtable->batches[batchno].ntuples; @@ -2936,6 +3040,13 @@ ExecParallelHashJoinSetUpBatches(HashJoinTable hashtable, int nbatch) ParallelHashJoinBatch *shared = NthParallelHashJoinBatch(batches, i); char name[MAXPGPATH]; + shared->parallel_hashloop_fallback = false; + LWLockInitialize(&shared->lock, + LWTRANCHE_PARALLEL_HASH_JOIN_BATCH); + shared->current_chunk_num = 0; + shared->total_num_chunks = 1; + shared->estimated_chunk_size = 0; + /* * All members of shared were zero-initialized. We just need to set * up the Barrier. */ @@ -2945,7 +3056,7 @@ { /* Batch 0 doesn't need to be loaded.
*/ BarrierAttach(&shared->batch_barrier); - while (BarrierPhase(&shared->batch_barrier) < PHJ_BATCH_PROBING) + while (BarrierPhase(&shared->batch_barrier) < PHJ_BATCH_CHUNKING) BarrierArriveAndWait(&shared->batch_barrier, 0); BarrierDetach(&shared->batch_barrier); } @@ -2959,7 +3070,7 @@ ExecParallelHashJoinSetUpBatches(HashJoinTable hashtable, int nbatch) sts_initialize(ParallelHashJoinBatchInner(shared), pstate->nparticipants, ParallelWorkerNumber + 1, - sizeof(uint32), + sizeof(tupleMetadata), SHARED_TUPLESTORE_SINGLE_PASS, &pstate->fileset, name); @@ -2969,7 +3080,7 @@ ExecParallelHashJoinSetUpBatches(HashJoinTable hashtable, int nbatch) pstate->nparticipants), pstate->nparticipants, ParallelWorkerNumber + 1, - sizeof(uint32), + sizeof(tupleMetadata), SHARED_TUPLESTORE_SINGLE_PASS, &pstate->fileset, name); diff --git a/src/backend/executor/nodeHashjoin.c b/src/backend/executor/nodeHashjoin.c index c901a80923..565b0c289f 100644 --- a/src/backend/executor/nodeHashjoin.c +++ b/src/backend/executor/nodeHashjoin.c @@ -81,11 +81,11 @@ * PHJ_BATCH_ELECTING -- initial state * PHJ_BATCH_ALLOCATING -- one allocates buckets * PHJ_BATCH_LOADING -- all load the hash table from disk - * PHJ_BATCH_PROBING -- all probe + * PHJ_BATCH_CHUNKING -- all probe * PHJ_BATCH_DONE -- end * * Batch 0 is a special case, because it starts out in phase - * PHJ_BATCH_PROBING; populating batch 0's hash table is done during + * PHJ_BATCH_CHUNKING; populating batch 0's hash table is done during * PHJ_BUILD_HASHING_INNER so we can skip loading. * * Initially we try to plan for a single-batch hash join using the combined @@ -98,7 +98,7 @@ * already arrived. Practically, that means that we never return a tuple * while attached to a barrier, unless the barrier has reached its final * state. In the slightly special case of the per-batch barrier, we return - * tuples while in PHJ_BATCH_PROBING phase, but that's OK because we use + * tuples while in PHJ_BATCH_CHUNKING phase, but that's OK because we use * BarrierArriveAndDetach() to advance it to PHJ_BATCH_DONE without waiting. 
* *------------------------------------------------------------------------- @@ -117,6 +117,8 @@ #include "utils/memutils.h" #include "utils/sharedtuplestore.h" +#include "executor/adaptiveHashjoin.h" + /* * States of the ExecHashJoin state machine @@ -124,9 +126,11 @@ #define HJ_BUILD_HASHTABLE 1 #define HJ_NEED_NEW_OUTER 2 #define HJ_SCAN_BUCKET 3 -#define HJ_FILL_OUTER_TUPLE 4 -#define HJ_FILL_INNER_TUPLES 5 -#define HJ_NEED_NEW_BATCH 6 +#define HJ_FILL_INNER_TUPLES 4 +#define HJ_NEED_NEW_BATCH 5 +#define HJ_NEED_NEW_INNER_CHUNK 6 +#define HJ_ADAPTIVE_EMIT_UNMATCHED_OUTER_INIT 7 +#define HJ_ADAPTIVE_EMIT_UNMATCHED_OUTER 8 /* Returns true if doing null-fill on outer relation */ #define HJ_FILL_OUTER(hjstate) ((hjstate)->hj_NullInnerTupleSlot != NULL) @@ -143,10 +147,15 @@ static TupleTableSlot *ExecHashJoinGetSavedTuple(HashJoinState *hjstate, BufFile *file, uint32 *hashvalue, TupleTableSlot *tupleSlot); -static bool ExecHashJoinNewBatch(HashJoinState *hjstate); -static bool ExecParallelHashJoinNewBatch(HashJoinState *hjstate); + +static bool ExecHashJoinAdvanceBatch(HashJoinState *hjstate); +static bool ExecHashJoinLoadInnerBatch(HashJoinState *hjstate); static void ExecParallelHashJoinPartitionOuter(HashJoinState *node); +static TupleTableSlot *emitUnmatchedOuterTuple(ExprState *otherqual, + ExprContext *econtext, + HashJoinState *hjstate); + /* ---------------------------------------------------------------- * ExecHashJoinImpl @@ -161,8 +170,15 @@ static void ExecParallelHashJoinPartitionOuter(HashJoinState *node); * the other one is "outer". * ---------------------------------------------------------------- */ -static pg_attribute_always_inline TupleTableSlot * -ExecHashJoinImpl(PlanState *pstate, bool parallel) + +/* ---------------------------------------------------------------- + * ExecHashJoin + * + * Parallel-oblivious version. + * ---------------------------------------------------------------- + */ +static TupleTableSlot * /* return: a tuple or NULL */ +ExecHashJoin(PlanState *pstate) { HashJoinState *node = castNode(HashJoinState, pstate); PlanState *outerNode; @@ -174,7 +190,8 @@ ExecHashJoinImpl(PlanState *pstate, bool parallel) TupleTableSlot *outerTupleSlot; uint32 hashvalue; int batchno; - ParallelHashJoinState *parallel_state; + + BufFile *outerFileForAdaptiveRead; /* * get information from HashJoin node @@ -185,7 +202,6 @@ ExecHashJoinImpl(PlanState *pstate, bool parallel) outerNode = outerPlanState(node); hashtable = node->hj_HashTable; econtext = node->js.ps.ps_ExprContext; - parallel_state = hashNode->parallel_state; /* * Reset per-tuple memory context to free any expression evaluation @@ -243,18 +259,6 @@ ExecHashJoinImpl(PlanState *pstate, bool parallel) /* no chance to not build the hash table */ node->hj_FirstOuterTupleSlot = NULL; } - else if (parallel) - { - /* - * The empty-outer optimization is not implemented for - * shared hash tables, because no one participant can - * determine that there are no outer tuples, and it's not - * yet clear that it's worth the synchronization overhead - * of reaching consensus to figure that out. So we have - * to build the hash table. - */ - node->hj_FirstOuterTupleSlot = NULL; - } else if (HJ_FILL_OUTER(node) || (outerNode->plan->startup_cost < hashNode->ps.plan->total_cost && !node->hj_OuterNotEmpty)) @@ -271,17 +275,533 @@ ExecHashJoinImpl(PlanState *pstate, bool parallel) else node->hj_FirstOuterTupleSlot = NULL; - /* - * Create the hash table. 
If using Parallel Hash, then - * whoever gets here first will create the hash table and any - * later arrivals will merely attach to it. - */ + /* Create the hash table. */ hashtable = ExecHashTableCreate(hashNode, node->hj_HashOperators, node->hj_Collations, HJ_FILL_INNER(node)); node->hj_HashTable = hashtable; + /* Execute the Hash node, to build the hash table. */ + hashNode->hashtable = hashtable; + (void) MultiExecProcNode((PlanState *) hashNode); + + /* + * If the inner relation is completely empty, and we're not + * doing a left outer join, we can quit without scanning the + * outer relation. + */ + if (hashtable->totalTuples == 0 && !HJ_FILL_OUTER(node)) + return NULL; + + /* + * need to remember whether nbatch has increased since we + * began scanning the outer relation + */ + hashtable->nbatch_outstart = hashtable->nbatch; + + /* + * Reset OuterNotEmpty for scan. (It's OK if we fetched a + * tuple above, because ExecHashJoinOuterGetTuple will + * immediately set it again.) + */ + node->hj_OuterNotEmpty = false; + + node->hj_JoinState = HJ_NEED_NEW_OUTER; + + /* FALL THRU */ + + case HJ_NEED_NEW_OUTER: + + /* + * We don't have an outer tuple, try to get the next one + */ + outerTupleSlot = + ExecHashJoinOuterGetTuple(outerNode, node, &hashvalue); + + if (TupIsNull(outerTupleSlot)) + { + /* + * end of batch, or maybe whole join. for hashloop + * fallback, all we know is outer batch is exhausted. + * inner could have more chunks + */ + if (HJ_FILL_INNER(node)) + { + /* set up to scan for unmatched inner tuples */ + ExecPrepHashTableForUnmatched(node); + node->hj_JoinState = HJ_FILL_INNER_TUPLES; + break; + } + node->hj_JoinState = HJ_NEED_NEW_INNER_CHUNK; + break; + } + + econtext->ecxt_outertuple = outerTupleSlot; + + /* + * Find the corresponding bucket for this tuple in the main + * hash table or skew hash table. + */ + node->hj_CurHashValue = hashvalue; + ExecHashGetBucketAndBatch(hashtable, hashvalue, + &node->hj_CurBucketNo, &batchno); + node->hj_CurSkewBucketNo = ExecHashGetSkewBucket(hashtable, + hashvalue); + node->hj_CurTuple = NULL; + + /* + * for the hashloop fallback case, only initialize + * hj_MatchedOuter to false during the first chunk. otherwise, + * we will be resetting hj_MatchedOuter to false for an outer + * tuple that has already matched an inner tuple. also, + * hj_MatchedOuter should be set to false for batch 0. there + * are no chunks for batch 0, and node->hj_InnerFirstChunk + * isn't set to true until HJ_NEED_NEW_BATCH, so need to + * handle batch 0 explicitly + */ + + if (!node->hashloop_fallback || hashtable->curbatch == 0 || node->hj_InnerFirstChunk) + node->hj_MatchedOuter = false; + + /* + * The tuple might not belong to the current batch (where + * "current batch" includes the skew buckets if any). + */ + if (batchno != hashtable->curbatch && + node->hj_CurSkewBucketNo == INVALID_SKEW_BUCKET_NO) + { + bool shouldFree; + MinimalTuple mintuple = ExecFetchSlotMinimalTuple(outerTupleSlot, + &shouldFree); + + /* + * Need to postpone this outer tuple to a later batch. + * Save it in the corresponding outer-batch file. 
+ */ + Assert(batchno > hashtable->curbatch); + ExecHashJoinSaveTuple(mintuple, hashvalue, + &hashtable->outerBatchFile[batchno]); + + if (shouldFree) + heap_free_minimal_tuple(mintuple); + + /* Loop around, staying in HJ_NEED_NEW_OUTER state */ + continue; + } + + if (node->hashloop_fallback) + { + /* first tuple of new batch */ + if (node->hj_OuterMatchStatusesFile == NULL) + { + node->hj_OuterTupleCount = 0; + node->hj_OuterMatchStatusesFile = BufFileCreateTemp(false); + } + + /* for fallback case, always increment tuple count */ + node->hj_OuterTupleCount++; + + /* Use the next byte on every 8th tuple */ + if ((node->hj_OuterTupleCount - 1) % 8 == 0) + { + /* + * first chunk of new batch, so write and initialize + * enough bytes in the outer tuple match status file + * to capture all tuples' match statuses + */ + if (node->hj_InnerFirstChunk) + { + node->hj_OuterCurrentByte = 0; + BufFileWrite(node->hj_OuterMatchStatusesFile, &node->hj_OuterCurrentByte, 1); + } + /* otherwise, just read the next byte */ + else + BufFileRead(node->hj_OuterMatchStatusesFile, &node->hj_OuterCurrentByte, 1); + } + } + + /* OK, let's scan the bucket for matches */ + node->hj_JoinState = HJ_SCAN_BUCKET; + + /* FALL THRU */ + + case HJ_SCAN_BUCKET: + + /* + * Scan the selected hash bucket for matches to current outer + */ + if (!ExecScanHashBucket(node, econtext)) + { + /* + * The current outer tuple has run out of matches, so + * check whether to emit a dummy outer-join tuple. + * Whether we emit one or not, the next state is + * NEED_NEW_OUTER. + */ + node->hj_JoinState = HJ_NEED_NEW_OUTER; + if (!node->hashloop_fallback || node->hj_HashTable->curbatch == 0) + { + TupleTableSlot *slot = emitUnmatchedOuterTuple(otherqual, econtext, node); + + if (slot != NULL) + return slot; + } + continue; + } + + if (joinqual != NULL && !ExecQual(joinqual, econtext)) + { + InstrCountFiltered1(node, 1); + break; + } + + /* + * We've got a match, but still need to test non-hashed quals. + * ExecScanHashBucket already set up all the state needed to + * call ExecQual. + * + * If we pass the qual, then save state for next call and have + * ExecProject form the projection, store it in the tuple + * table, and return the slot. + * + * Only the joinquals determine tuple match status, but all + * quals must pass to actually return the tuple. + */ + + node->hj_MatchedOuter = true; + + /* + * This is really only needed if HJ_FILL_INNER(node), + * but we'll avoid the branch and just set it always. + */ + HeapTupleHeaderSetMatch(HJTUPLE_MINTUPLE(node->hj_CurTuple)); + + /* In an antijoin, we never return a matched tuple */ + if (node->js.jointype == JOIN_ANTI) + { + node->hj_JoinState = HJ_NEED_NEW_OUTER; + continue; + } + + /* + * If we only need to join to the first matching inner tuple, + * then consider returning this one, but after that, continue + * with next outer tuple. 
*/ + /* TODO: is semi-join correct for AHJ */ + if (node->js.single_match) + node->hj_JoinState = HJ_NEED_NEW_OUTER; + + /* + * Set the match bit for this outer tuple in the match status + * file + */ + if (node->hj_OuterMatchStatusesFile != NULL) + { + Assert(node->hashloop_fallback == true); + int byte_to_set = (node->hj_OuterTupleCount - 1) / 8; + int bit_to_set_in_byte = (node->hj_OuterTupleCount - 1) % 8; + + BufFileSeek(node->hj_OuterMatchStatusesFile, 0, byte_to_set, SEEK_SET); + + node->hj_OuterCurrentByte = node->hj_OuterCurrentByte | (1 << bit_to_set_in_byte); + + BufFileWrite(node->hj_OuterMatchStatusesFile, &node->hj_OuterCurrentByte, 1); + } + + if (otherqual == NULL || ExecQual(otherqual, econtext)) + return ExecProject(node->js.ps.ps_ProjInfo); + InstrCountFiltered2(node, 1); + break; + + case HJ_FILL_INNER_TUPLES: + + /* + * We have finished a batch, but we are doing right/full join, + * so any unmatched inner tuples in the hashtable have to be + * emitted before we continue to the next batch. + */ + if (!ExecScanHashTableForUnmatched(node, econtext)) + { + /* no more unmatched tuples */ + node->hj_JoinState = HJ_NEED_NEW_INNER_CHUNK; + continue; + } + + /* + * Generate a fake join tuple with nulls for the outer tuple, + * and return it if it passes the non-join quals. + */ + econtext->ecxt_outertuple = node->hj_NullOuterTupleSlot; + + if (otherqual == NULL || ExecQual(otherqual, econtext)) + return ExecProject(node->js.ps.ps_ProjInfo); + InstrCountFiltered2(node, 1); + break; + + case HJ_NEED_NEW_BATCH: + + /* + * Try to advance to the next batch. Done if there are no + * more. For batches after batch 0 for which + * hashloop_fallback is true, if the inner side is exhausted, + * we need to consider emitting unmatched outer tuples. We + * should never get here with hashloop_fallback false and + * hj_InnerExhausted true, but it is clearer to check + * hashloop_fallback explicitly. + */ + if (node->hashloop_fallback && HJ_FILL_OUTER(node) && node->hj_InnerExhausted) + { + /* + * For hashloop fallback, outer tuples are not emitted + * until directly before advancing the batch (after all + * inner chunks have been processed). + * node->hashloop_fallback should be true because it is + * not reset to false until advancing the batches. + */ + node->hj_InnerExhausted = false; + node->hj_JoinState = HJ_ADAPTIVE_EMIT_UNMATCHED_OUTER_INIT; + break; + } + + if (!ExecHashJoinAdvanceBatch(node)) + return NULL; + + /* + * TODO: find a better way to decide whether to load the + * inner batch again than checking for the outer batch file. + * We need to load inner again if it is an inner or left + * outer join and there are outer tuples in the batch, OR if + * it is a ROJ and there are inner tuples in the batch (we + * should never have no tuples in either batch file...). + */ + if (BufFileRewindIfExists(node->hj_HashTable->outerBatchFile[node->hj_HashTable->curbatch]) != NULL || + (node->hj_HashTable->innerBatchFile[node->hj_HashTable->curbatch] != NULL && HJ_FILL_INNER(node))) + ExecHashJoinLoadInnerBatch(node); /* TODO: should I ever + * load inner when outer + * file is not present?
*/ + + node->hj_JoinState = HJ_NEED_NEW_OUTER; + break; + + case HJ_NEED_NEW_INNER_CHUNK: + + if (!node->hashloop_fallback) + { + node->hj_JoinState = HJ_NEED_NEW_BATCH; + break; + } + + /* + * It is the hashloop fallback case and there are no more + * chunks; the inner side is exhausted, so we must advance + * the batches. + */ + if (node->hj_InnerPageOffset == 0L) + { + node->hj_InnerExhausted = true; + node->hj_JoinState = HJ_NEED_NEW_BATCH; + break; + } + + /* + * This is the hashloop fallback case and we have more chunks + * in inner. curbatch > 0. Rewind outer batch file (if + * present) so that we can start reading it. Rewind outer + * match statuses file if present so that we can set match + * bits as needed. Reset the tuple count and load the next + * chunk of inner. Then proceed to get a new outer tuple from + * our rewound outer batch file. + */ + node->hj_JoinState = HJ_NEED_NEW_OUTER; + + /* + * TODO: find a better way to decide whether to load the + * inner batch again than checking for the outer batch file. + * We need to load inner again if it is an inner or left + * outer join and there are outer tuples in the batch, OR if + * it is a ROJ and there are inner tuples in the batch (we + * should never have no tuples in either batch file...). + * If outer is not null, or if it is a ROJ and inner is not + * null, we must rewind the outer match status file and load + * inner. + */ + if (BufFileRewindIfExists(node->hj_HashTable->outerBatchFile[node->hj_HashTable->curbatch]) != NULL || + (node->hj_HashTable->innerBatchFile[node->hj_HashTable->curbatch] != NULL && HJ_FILL_INNER(node))) + { + BufFileRewindIfExists(node->hj_OuterMatchStatusesFile); + node->hj_OuterTupleCount = 0; + ExecHashJoinLoadInnerBatch(node); + } + break; + + case HJ_ADAPTIVE_EMIT_UNMATCHED_OUTER_INIT: + + node->hj_OuterTupleCount = 0; + BufFileRewindIfExists(node->hj_OuterMatchStatusesFile); + + /* + * TODO: is it okay to use the hashtable to get the outer + * batch file here?
+ */ + outerFileForAdaptiveRead = hashtable->outerBatchFile[hashtable->curbatch]; + if (outerFileForAdaptiveRead == NULL) /* TODO: could this + * happen */ + { + node->hj_JoinState = HJ_NEED_NEW_BATCH; + break; + } + BufFileRewindIfExists(outerFileForAdaptiveRead); + + node->hj_JoinState = HJ_ADAPTIVE_EMIT_UNMATCHED_OUTER; + /* fall through */ + + case HJ_ADAPTIVE_EMIT_UNMATCHED_OUTER: + + outerFileForAdaptiveRead = hashtable->outerBatchFile[hashtable->curbatch]; + + while (true) + { + uint32 unmatchedOuterHashvalue; + TupleTableSlot *slot = ExecHashJoinGetSavedTuple(node, + outerFileForAdaptiveRead, + &unmatchedOuterHashvalue, + node->hj_OuterTupleSlot); + + node->hj_OuterTupleCount++; + + if (slot == NULL) + { + node->hj_JoinState = HJ_NEED_NEW_BATCH; + break; + } + + unsigned char bit = (node->hj_OuterTupleCount - 1) % 8; + + /* need to read the next byte */ + if (bit == 0) + BufFileRead(node->hj_OuterMatchStatusesFile, &node->hj_OuterCurrentByte, 1); + + /* if the match bit is set for this tuple, continue */ + if ((node->hj_OuterCurrentByte >> bit) & 1) + continue; + + /* if it is not a match then emit it NULL-extended */ + econtext->ecxt_outertuple = slot; + econtext->ecxt_innertuple = node->hj_NullInnerTupleSlot; + return ExecProject(node->js.ps.ps_ProjInfo); + } + /* came here from HJ_NEED_NEW_BATCH, so go back there */ + node->hj_JoinState = HJ_NEED_NEW_BATCH; + break; + + default: + elog(ERROR, "unrecognized hashjoin state: %d", + (int) node->hj_JoinState); + } + } +} + +/* ---------------------------------------------------------------- + * ExecParallelHashJoin + * + * Parallel-aware version. + * ---------------------------------------------------------------- + */ +static TupleTableSlot * /* return: a tuple or NULL */ +ExecParallelHashJoin(PlanState *pstate) +{ + HashJoinState *node = castNode(HashJoinState, pstate); + PlanState *outerNode; + HashState *hashNode; + ExprState *joinqual; + ExprState *otherqual; + ExprContext *econtext; + HashJoinTable hashtable; + TupleTableSlot *outerTupleSlot; + uint32 hashvalue; + int batchno; + ParallelHashJoinState *parallel_state; + + /* + * get information from HashJoin node + */ + joinqual = node->js.joinqual; + otherqual = node->js.ps.qual; + hashNode = (HashState *) innerPlanState(node); + outerNode = outerPlanState(node); + hashtable = node->hj_HashTable; + econtext = node->js.ps.ps_ExprContext; + parallel_state = hashNode->parallel_state; + + bool advance_from_probing = false; + + /* + * Reset per-tuple memory context to free any expression evaluation + * storage allocated in the previous tuple cycle. + */ + ResetExprContext(econtext); + + /* + * run the hash join state machine + */ + for (;;) + { + SharedTuplestoreAccessor *outer_acc; + + /* + * It's possible to iterate this loop many times before returning a + * tuple, in some pathological cases such as needing to move much of + * the current batch to a later batch. So let's check for interrupts + * each time through. + */ + CHECK_FOR_INTERRUPTS(); + + switch (node->hj_JoinState) + { + case HJ_BUILD_HASHTABLE: + + /* + * First time through: build hash table for inner relation. + */ + Assert(hashtable == NULL); + /* volatile int mybp = 0; while (mybp == 0); */ + + /* + * The empty-outer optimization is not implemented for shared + * hash tables, because no one participant can determine that + * there are no outer tuples, and it's not yet clear that it's + * worth the synchronization overhead of reaching consensus to + * figure that out. So we have to build the hash table. 
+ */ + node->hj_FirstOuterTupleSlot = NULL; + + /* + * Create the hash table. If using Parallel Hash, then + * whoever gets here first will create the hash table and any + * later arrivals will merely attach to it. + */ + node->hj_HashTable = hashtable = ExecHashTableCreate(hashNode, + node->hj_HashOperators, + node->hj_Collations, + HJ_FILL_INNER(node)); + /* * Execute the Hash node, to build the hash table. If using * Parallel Hash, then we'll try to help hashing unless we @@ -311,66 +831,59 @@ ExecHashJoinImpl(PlanState *pstate, bool parallel) */ node->hj_OuterNotEmpty = false; - if (parallel) - { - Barrier *build_barrier; - - build_barrier = ¶llel_state->build_barrier; - Assert(BarrierPhase(build_barrier) == PHJ_BUILD_HASHING_OUTER || - BarrierPhase(build_barrier) == PHJ_BUILD_DONE); - if (BarrierPhase(build_barrier) == PHJ_BUILD_HASHING_OUTER) - { - /* - * If multi-batch, we need to hash the outer relation - * up front. - */ - if (hashtable->nbatch > 1) - ExecParallelHashJoinPartitionOuter(node); - BarrierArriveAndWait(build_barrier, - WAIT_EVENT_HASH_BUILD_HASHING_OUTER); - } - Assert(BarrierPhase(build_barrier) == PHJ_BUILD_DONE); - - /* Each backend should now select a batch to work on. */ - hashtable->curbatch = -1; - node->hj_JoinState = HJ_NEED_NEW_BATCH; + Barrier *build_barrier; - continue; + build_barrier = ¶llel_state->build_barrier; + Assert(BarrierPhase(build_barrier) == PHJ_BUILD_HASHING_OUTER || + BarrierPhase(build_barrier) == PHJ_BUILD_DONE); + if (BarrierPhase(build_barrier) == PHJ_BUILD_HASHING_OUTER) + { + /* + * If multi-batch, we need to hash the outer relation up + * front. + */ + if (hashtable->nbatch > 1) + ExecParallelHashJoinPartitionOuter(node); + BarrierArriveAndWait(build_barrier, + WAIT_EVENT_HASH_BUILD_HASHING_OUTER); } - else - node->hj_JoinState = HJ_NEED_NEW_OUTER; + Assert(BarrierPhase(build_barrier) == PHJ_BUILD_DONE); - /* FALL THRU */ + /* Each backend should now select a batch to work on. */ + hashtable->curbatch = -1; + node->hj_JoinState = HJ_NEED_NEW_BATCH; + + continue; case HJ_NEED_NEW_OUTER: /* * We don't have an outer tuple, try to get the next one */ - if (parallel) - outerTupleSlot = - ExecParallelHashJoinOuterGetTuple(outerNode, node, - &hashvalue); - else - outerTupleSlot = - ExecHashJoinOuterGetTuple(outerNode, node, &hashvalue); + outerTupleSlot = + ExecParallelHashJoinOuterGetTuple(outerNode, node, + &hashvalue); if (TupIsNull(outerTupleSlot)) { - /* end of batch, or maybe whole join */ + /* + * end of batch, or maybe whole join. for hashloop + * fallback, all we know is outer batch is exhausted. + * inner could have more chunks + */ if (HJ_FILL_INNER(node)) { /* set up to scan for unmatched inner tuples */ ExecPrepHashTableForUnmatched(node); node->hj_JoinState = HJ_FILL_INNER_TUPLES; + break; } - else - node->hj_JoinState = HJ_NEED_NEW_BATCH; - continue; + advance_from_probing = true; + node->hj_JoinState = HJ_NEED_NEW_INNER_CHUNK; + break; } econtext->ecxt_outertuple = outerTupleSlot; - node->hj_MatchedOuter = false; /* * Find the corresponding bucket for this tuple in the main @@ -384,33 +897,18 @@ ExecHashJoinImpl(PlanState *pstate, bool parallel) node->hj_CurTuple = NULL; /* - * The tuple might not belong to the current batch (where - * "current batch" includes the skew buckets if any). + * for the hashloop fallback case, only initialize + * hj_MatchedOuter to false during the first chunk. otherwise, + * we will be resetting hj_MatchedOuter to false for an outer + * tuple that has already matched an inner tuple. 
also, + * hj_MatchedOuter should be set to false for batch 0. there + * are no chunks for batch 0 */ - if (batchno != hashtable->curbatch && - node->hj_CurSkewBucketNo == INVALID_SKEW_BUCKET_NO) - { - bool shouldFree; - MinimalTuple mintuple = ExecFetchSlotMinimalTuple(outerTupleSlot, - &shouldFree); - /* - * Need to postpone this outer tuple to a later batch. - * Save it in the corresponding outer-batch file. - */ - Assert(parallel_state == NULL); - Assert(batchno > hashtable->curbatch); - ExecHashJoinSaveTuple(mintuple, hashvalue, - &hashtable->outerBatchFile[batchno]); - - if (shouldFree) - heap_free_minimal_tuple(mintuple); - - /* Loop around, staying in HJ_NEED_NEW_OUTER state */ - continue; - } + ParallelHashJoinBatch *phj_batch = node->hj_HashTable->batches[node->hj_HashTable->curbatch].shared; - /* OK, let's scan the bucket for matches */ + if (!phj_batch->parallel_hashloop_fallback || phj_batch->current_chunk_num == 1) + node->hj_MatchedOuter = false; node->hj_JoinState = HJ_SCAN_BUCKET; /* FALL THRU */ @@ -420,23 +918,25 @@ ExecHashJoinImpl(PlanState *pstate, bool parallel) /* * Scan the selected hash bucket for matches to current outer */ - if (parallel) - { - if (!ExecParallelScanHashBucket(node, econtext)) - { - /* out of matches; check for possible outer-join fill */ - node->hj_JoinState = HJ_FILL_OUTER_TUPLE; - continue; - } - } - else + phj_batch = node->hj_HashTable->batches[node->hj_HashTable->curbatch].shared; + + if (!ExecParallelScanHashBucket(node, econtext)) { - if (!ExecScanHashBucket(node, econtext)) + /* + * The current outer tuple has run out of matches, so + * check whether to emit a dummy outer-join tuple. + * Whether we emit one or not, the next state is + * NEED_NEW_OUTER. + */ + node->hj_JoinState = HJ_NEED_NEW_OUTER; + if (!phj_batch->parallel_hashloop_fallback) { - /* out of matches; check for possible outer-join fill */ - node->hj_JoinState = HJ_FILL_OUTER_TUPLE; - continue; + TupleTableSlot *slot = emitUnmatchedOuterTuple(otherqual, econtext, node); + + if (slot != NULL) + return slot; } + continue; } /* @@ -451,77 +951,55 @@ ExecHashJoinImpl(PlanState *pstate, bool parallel) * Only the joinquals determine tuple match status, but all * quals must pass to actually return the tuple. */ - if (joinqual == NULL || ExecQual(joinqual, econtext)) + if (joinqual != NULL && !ExecQual(joinqual, econtext)) { - node->hj_MatchedOuter = true; - - if (parallel) - { - /* - * Full/right outer joins are currently not supported - * for parallel joins, so we don't need to set the - * match bit. Experiments show that it's worth - * avoiding the shared memory traffic on large - * systems. - */ - Assert(!HJ_FILL_INNER(node)); - } - else - { - /* - * This is really only needed if HJ_FILL_INNER(node), - * but we'll avoid the branch and just set it always. - */ - HeapTupleHeaderSetMatch(HJTUPLE_MINTUPLE(node->hj_CurTuple)); - } - - /* In an antijoin, we never return a matched tuple */ - if (node->js.jointype == JOIN_ANTI) - { - node->hj_JoinState = HJ_NEED_NEW_OUTER; - continue; - } + InstrCountFiltered1(node, 1); + break; + } - /* - * If we only need to join to the first matching inner - * tuple, then consider returning this one, but after that - * continue with next outer tuple. - */ - if (node->js.single_match) - node->hj_JoinState = HJ_NEED_NEW_OUTER; + node->hj_MatchedOuter = true; + /* + * Full/right outer joins are currently not supported + * for parallel joins, so we don't need to set the + * match bit. 
Experiments show that it's worth + * avoiding the shared memory traffic on large + * systems. + */ + Assert(!HJ_FILL_INNER(node)); - if (otherqual == NULL || ExecQual(otherqual, econtext)) - return ExecProject(node->js.ps.ps_ProjInfo); - else - InstrCountFiltered2(node, 1); + /* + * TODO: how does this interact with PAHJ -- do I need to set + * matchbit? + */ + /* In an antijoin, we never return a matched tuple */ + if (node->js.jointype == JOIN_ANTI) + { + node->hj_JoinState = HJ_NEED_NEW_OUTER; + continue; } - else - InstrCountFiltered1(node, 1); - break; - - case HJ_FILL_OUTER_TUPLE: /* - * The current outer tuple has run out of matches, so check - * whether to emit a dummy outer-join tuple. Whether we emit - * one or not, the next state is NEED_NEW_OUTER. + * If we only need to join to the first matching inner tuple, + * then consider returning this one, but after that continue + * with next outer tuple. */ - node->hj_JoinState = HJ_NEED_NEW_OUTER; + if (node->js.single_match) + node->hj_JoinState = HJ_NEED_NEW_OUTER; - if (!node->hj_MatchedOuter && - HJ_FILL_OUTER(node)) + /* + * Set the match bit for this outer tuple in the match status + * file + */ + if (phj_batch->parallel_hashloop_fallback) { - /* - * Generate a fake join tuple with nulls for the inner - * tuple, and return it if it passes the non-join quals. - */ - econtext->ecxt_innertuple = node->hj_NullInnerTupleSlot; + sts_set_outer_match_status(hashtable->batches[hashtable->curbatch].outer_tuples, + econtext->ecxt_outertuple->tuplenum); - if (otherqual == NULL || ExecQual(otherqual, econtext)) - return ExecProject(node->js.ps.ps_ProjInfo); - else - InstrCountFiltered2(node, 1); } + if (otherqual == NULL || ExecQual(otherqual, econtext)) + return ExecProject(node->js.ps.ps_ProjInfo); + else + InstrCountFiltered2(node, 1); break; case HJ_FILL_INNER_TUPLES: @@ -534,7 +1012,8 @@ ExecHashJoinImpl(PlanState *pstate, bool parallel) if (!ExecScanHashTableForUnmatched(node, econtext)) { /* no more unmatched tuples */ - node->hj_JoinState = HJ_NEED_NEW_BATCH; + advance_from_probing = true; + node->hj_JoinState = HJ_NEED_NEW_INNER_CHUNK; continue; } @@ -552,22 +1031,108 @@ ExecHashJoinImpl(PlanState *pstate, bool parallel) case HJ_NEED_NEW_BATCH: + phj_batch = hashtable->batches[hashtable->curbatch].shared; + /* * Try to advance to next batch. Done if there are no more. */ - if (parallel) + if (!ExecParallelHashJoinNewBatch(node)) + return NULL; /* end of parallel-aware join */ + + if (node->last_worker + && HJ_FILL_OUTER(node) && phj_batch->parallel_hashloop_fallback) { - if (!ExecParallelHashJoinNewBatch(node)) - return NULL; /* end of parallel-aware join */ + node->last_worker = false; + node->hj_JoinState = HJ_ADAPTIVE_EMIT_UNMATCHED_OUTER_INIT; + break; } - else + if (node->hj_HashTable->curbatch == 0) { - if (!ExecHashJoinNewBatch(node)) - return NULL; /* end of parallel-oblivious join */ + node->hj_JoinState = HJ_NEED_NEW_OUTER; + break; } - node->hj_JoinState = HJ_NEED_NEW_OUTER; + advance_from_probing = false; + node->hj_JoinState = HJ_NEED_NEW_INNER_CHUNK; + /* FALL THRU */ + + case HJ_NEED_NEW_INNER_CHUNK: + + if (hashtable->curbatch == -1 || hashtable->curbatch == 0) + + /* + * If we're not attached to a batch at all then we need to + * go to HJ_NEED_NEW_BATCH. Also batch 0 doesn't have more + * than 1 chunk. 
+ */ + node->hj_JoinState = HJ_NEED_NEW_BATCH; + else if (!ExecParallelHashJoinNewChunk(node, advance_from_probing)) + /* If there's no next chunk then go to the next batch */ + node->hj_JoinState = HJ_NEED_NEW_BATCH; + else + node->hj_JoinState = HJ_NEED_NEW_OUTER; break; + case HJ_ADAPTIVE_EMIT_UNMATCHED_OUTER_INIT: + + outer_acc = hashtable->batches[hashtable->curbatch].outer_tuples; + sts_reinitialize(outer_acc); + sts_begin_parallel_scan(outer_acc); + + node->hj_JoinState = HJ_ADAPTIVE_EMIT_UNMATCHED_OUTER; + /* FALL THRU */ + + case HJ_ADAPTIVE_EMIT_UNMATCHED_OUTER: + + Assert(node->combined_bitmap != NULL); + + outer_acc = node->hj_HashTable->batches[node->hj_HashTable->curbatch].outer_tuples; + + MinimalTuple tuple; + + do + { + tupleMetadata metadata; + + if ((tuple = sts_parallel_scan_next(outer_acc, &metadata)) == NULL) + break; + + int bytenum = metadata.tupleid / 8; + unsigned char bit = metadata.tupleid % 8; + unsigned char byte_to_check = 0; + + /* seek to byte to check */ + if (BufFileSeek(node->combined_bitmap, 0, bytenum, SEEK_SET)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not rewind shared outer temporary file: %m"))); + /* read byte containing ntuple bit */ + if (BufFileRead(node->combined_bitmap, &byte_to_check, 1) == 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read byte in outer match status bitmap: %m."))); + /* if bit is set */ + bool match = ((byte_to_check) >> bit) & 1; + + if (!match) + break; + } while (1); + + if (tuple == NULL) + { + sts_end_parallel_scan(outer_acc); + node->hj_JoinState = HJ_NEED_NEW_BATCH; + break; + } + + /* Emit the unmatched tuple */ + ExecForceStoreMinimalTuple(tuple, + econtext->ecxt_outertuple, + false); + econtext->ecxt_innertuple = node->hj_NullInnerTupleSlot; + + return ExecProject(node->js.ps.ps_ProjInfo); + + default: elog(ERROR, "unrecognized hashjoin state: %d", (int) node->hj_JoinState); @@ -575,38 +1140,6 @@ ExecHashJoinImpl(PlanState *pstate, bool parallel) } } -/* ---------------------------------------------------------------- - * ExecHashJoin - * - * Parallel-oblivious version. - * ---------------------------------------------------------------- - */ -static TupleTableSlot * /* return: a tuple or NULL */ -ExecHashJoin(PlanState *pstate) -{ - /* - * On sufficiently smart compilers this should be inlined with the - * parallel-aware branches removed. - */ - return ExecHashJoinImpl(pstate, false); -} - -/* ---------------------------------------------------------------- - * ExecParallelHashJoin - * - * Parallel-aware version. - * ---------------------------------------------------------------- - */ -static TupleTableSlot * /* return: a tuple or NULL */ -ExecParallelHashJoin(PlanState *pstate) -{ - /* - * On sufficiently smart compilers this should be inlined with the - * parallel-oblivious branches removed. 
- */ - return ExecHashJoinImpl(pstate, true); -} - /* ---------------------------------------------------------------- * ExecInitHashJoin * @@ -641,6 +1174,18 @@ ExecInitHashJoin(HashJoin *node, EState *estate, int eflags) hjstate->js.ps.ExecProcNode = ExecHashJoin; hjstate->js.jointype = node->join.jointype; + hjstate->hashloop_fallback = false; + hjstate->hj_InnerPageOffset = 0L; + hjstate->hj_InnerFirstChunk = false; + hjstate->hj_OuterCurrentByte = 0; + + hjstate->hj_OuterMatchStatusesFile = NULL; + hjstate->hj_OuterTupleCount = 0; + hjstate->hj_InnerExhausted = false; + + hjstate->last_worker = false; + hjstate->combined_bitmap = NULL; + /* * Miscellaneous initialization * @@ -792,6 +1337,30 @@ ExecEndHashJoin(HashJoinState *node) ExecEndNode(innerPlanState(node)); } + +static TupleTableSlot * +emitUnmatchedOuterTuple(ExprState *otherqual, ExprContext *econtext, HashJoinState *hjstate) +{ + if (hjstate->hj_MatchedOuter) + return NULL; + + if (!HJ_FILL_OUTER(hjstate)) + return NULL; + + econtext->ecxt_innertuple = hjstate->hj_NullInnerTupleSlot; + + /* + * Generate a fake join tuple with nulls for the inner tuple, and return + * it if it passes the non-join quals. + */ + + if (otherqual == NULL || ExecQual(otherqual, econtext)) + return ExecProject(hjstate->js.ps.ps_ProjInfo); + + InstrCountFiltered2(hjstate, 1); + return NULL; +} + /* * ExecHashJoinOuterGetTuple * @@ -919,13 +1488,20 @@ ExecParallelHashJoinOuterGetTuple(PlanState *outerNode, { MinimalTuple tuple; + tupleMetadata metadata; + int tupleid; + tuple = sts_parallel_scan_next(hashtable->batches[curbatch].outer_tuples, - hashvalue); + &metadata); if (tuple != NULL) { + /* where is this hashvalue being used? */ + *hashvalue = metadata.hashvalue; + tupleid = metadata.tupleid; ExecForceStoreMinimalTuple(tuple, hjstate->hj_OuterTupleSlot, false); + hjstate->hj_OuterTupleSlot->tuplenum = tupleid; slot = hjstate->hj_OuterTupleSlot; return slot; } @@ -938,20 +1514,17 @@ ExecParallelHashJoinOuterGetTuple(PlanState *outerNode, } /* - * ExecHashJoinNewBatch + * ExecHashJoinAdvanceBatch * switch to a new hashjoin batch * * Returns true if successful, false if there are no more batches. 
 */
 static bool
-ExecHashJoinNewBatch(HashJoinState *hjstate)
+ExecHashJoinAdvanceBatch(HashJoinState *hjstate)
 {
 	HashJoinTable hashtable = hjstate->hj_HashTable;
 	int			nbatch;
 	int			curbatch;
-	BufFile    *innerFile;
-	TupleTableSlot *slot;
-	uint32		hashvalue;
 
 	nbatch = hashtable->nbatch;
 	curbatch = hashtable->curbatch;
 
@@ -1026,10 +1599,36 @@ ExecHashJoinNewBatch(HashJoinState *hjstate)
 			curbatch++;
 	}
 
+	hjstate->hj_InnerPageOffset = 0L;
+	hjstate->hj_InnerFirstChunk = true;
+	hjstate->hashloop_fallback = false; /* new batch, so start it off false */
+	if (hjstate->hj_OuterMatchStatusesFile != NULL)
+		BufFileClose(hjstate->hj_OuterMatchStatusesFile);
+	hjstate->hj_OuterMatchStatusesFile = NULL;
 	if (curbatch >= nbatch)
 		return false;			/* no more batches */
 
 	hashtable->curbatch = curbatch;
+	return true;
+}
+
+/*
+ * Returns true if there are more chunks left, false otherwise
+ */
+static bool
+ExecHashJoinLoadInnerBatch(HashJoinState *hjstate)
+{
+	HashJoinTable hashtable = hjstate->hj_HashTable;
+	int			curbatch = hashtable->curbatch;
+	BufFile    *innerFile;
+	TupleTableSlot *slot;
+	uint32		hashvalue;
+
+	off_t		tup_start_offset;
+	off_t		chunk_start_offset;
+	off_t		tup_end_offset;
+	int64		current_saved_size;
+	int			current_fileno;
 
 	/*
 	 * Reload the hash table with the new inner batch (which could be empty)
@@ -1038,171 +1637,60 @@ ExecHashJoinNewBatch(HashJoinState *hjstate)
 	innerFile = hashtable->innerBatchFile[curbatch];
 
+	/* Reset this even if the inner file is not null */
+	hjstate->hj_InnerFirstChunk = hjstate->hj_InnerPageOffset == 0L;
+
 	if (innerFile != NULL)
 	{
-		if (BufFileSeek(innerFile, 0, 0L, SEEK_SET))
+		/* TODO: should fileno always be 0? */
+		if (BufFileSeek(innerFile, 0, hjstate->hj_InnerPageOffset, SEEK_SET))
 			ereport(ERROR,
 					(errcode_for_file_access(),
 					 errmsg("could not rewind hash-join temporary file: %m")));
 
+		chunk_start_offset = hjstate->hj_InnerPageOffset;
+		tup_end_offset = hjstate->hj_InnerPageOffset;
 		while ((slot = ExecHashJoinGetSavedTuple(hjstate,
 												 innerFile,
 												 &hashvalue,
 												 hjstate->hj_HashTupleSlot)))
 		{
+			/* next tuple's start is last tuple's end */
+			tup_start_offset = tup_end_offset;
+			/* after we got the tuple, figure out what the offset is */
+			BufFileTell(innerFile, &current_fileno, &tup_end_offset);
+			current_saved_size = tup_end_offset - chunk_start_offset;
+
+			/*
+			 * If loading this tuple would grow the current chunk past the
+			 * work_mem limit, remember the tuple's start offset so that the
+			 * next call resumes there, and mark this batch as falling back
+			 * to hashloop processing.
+			 */
+			if (current_saved_size > work_mem)
+			{
+				hjstate->hj_InnerPageOffset = tup_start_offset;
+				hjstate->hashloop_fallback = true;
+				return true;
+			}
+			hjstate->hj_InnerPageOffset = tup_end_offset;
+
 			/*
-			 * NOTE: some tuples may be sent to future batches.  Also, it is
-			 * possible for hashtable->nbatch to be increased here!
+			 * NOTE: some tuples may be sent to future batches.  With the
+			 * current hashloop patch, however, it is not possible for
+			 * hashtable->nbatch to be increased here.
 			 */
 			ExecHashTableInsert(hashtable, slot, hashvalue);
 		}
 
+		/* this is the end of the file */
+		hjstate->hj_InnerPageOffset = 0L;
+
 		/*
-		 * after we build the hash table, the inner batch file is no longer
+		 * after we processed all chunks, the inner batch file is no longer
 		 * needed
 		 */
 		BufFileClose(innerFile);
 		hashtable->innerBatchFile[curbatch] = NULL;
 	}
 
-	/*
-	 * Rewind outer batch file (if present), so that we can start reading it.
-	 */
-	if (hashtable->outerBatchFile[curbatch] != NULL)
-	{
-		if (BufFileSeek(hashtable->outerBatchFile[curbatch], 0, 0L, SEEK_SET))
-			ereport(ERROR,
-					(errcode_for_file_access(),
-					 errmsg("could not rewind hash-join temporary file: %m")));
-	}
-
-	return true;
-}
-
-/*
- * Choose a batch to work on, and attach to it.
Returns true if successful, - * false if there are no more batches. - */ -static bool -ExecParallelHashJoinNewBatch(HashJoinState *hjstate) -{ - HashJoinTable hashtable = hjstate->hj_HashTable; - int start_batchno; - int batchno; - - /* - * If we started up so late that the batch tracking array has been freed - * already by ExecHashTableDetach(), then we are finished. See also - * ExecParallelHashEnsureBatchAccessors(). - */ - if (hashtable->batches == NULL) - return false; - - /* - * If we were already attached to a batch, remember not to bother checking - * it again, and detach from it (possibly freeing the hash table if we are - * last to detach). - */ - if (hashtable->curbatch >= 0) - { - hashtable->batches[hashtable->curbatch].done = true; - ExecHashTableDetachBatch(hashtable); - } - - /* - * Search for a batch that isn't done. We use an atomic counter to start - * our search at a different batch in every participant when there are - * more batches than participants. - */ - batchno = start_batchno = - pg_atomic_fetch_add_u32(&hashtable->parallel_state->distributor, 1) % - hashtable->nbatch; - do - { - uint32 hashvalue; - MinimalTuple tuple; - TupleTableSlot *slot; - - if (!hashtable->batches[batchno].done) - { - SharedTuplestoreAccessor *inner_tuples; - Barrier *batch_barrier = - &hashtable->batches[batchno].shared->batch_barrier; - - switch (BarrierAttach(batch_barrier)) - { - case PHJ_BATCH_ELECTING: - - /* One backend allocates the hash table. */ - if (BarrierArriveAndWait(batch_barrier, - WAIT_EVENT_HASH_BATCH_ELECTING)) - ExecParallelHashTableAlloc(hashtable, batchno); - /* Fall through. */ - - case PHJ_BATCH_ALLOCATING: - /* Wait for allocation to complete. */ - BarrierArriveAndWait(batch_barrier, - WAIT_EVENT_HASH_BATCH_ALLOCATING); - /* Fall through. */ - - case PHJ_BATCH_LOADING: - /* Start (or join in) loading tuples. */ - ExecParallelHashTableSetCurrentBatch(hashtable, batchno); - inner_tuples = hashtable->batches[batchno].inner_tuples; - sts_begin_parallel_scan(inner_tuples); - while ((tuple = sts_parallel_scan_next(inner_tuples, - &hashvalue))) - { - ExecForceStoreMinimalTuple(tuple, - hjstate->hj_HashTupleSlot, - false); - slot = hjstate->hj_HashTupleSlot; - ExecParallelHashTableInsertCurrentBatch(hashtable, slot, - hashvalue); - } - sts_end_parallel_scan(inner_tuples); - BarrierArriveAndWait(batch_barrier, - WAIT_EVENT_HASH_BATCH_LOADING); - /* Fall through. */ - - case PHJ_BATCH_PROBING: - - /* - * This batch is ready to probe. Return control to - * caller. We stay attached to batch_barrier so that the - * hash table stays alive until everyone's finished - * probing it, but no participant is allowed to wait at - * this barrier again (or else a deadlock could occur). - * All attached participants must eventually call - * BarrierArriveAndDetach() so that the final phase - * PHJ_BATCH_DONE can be reached. - */ - ExecParallelHashTableSetCurrentBatch(hashtable, batchno); - sts_begin_parallel_scan(hashtable->batches[batchno].outer_tuples); - return true; - - case PHJ_BATCH_DONE: - - /* - * Already done. Detach and go around again (if any - * remain). - */ - BarrierDetach(batch_barrier); - hashtable->batches[batchno].done = true; - hashtable->curbatch = -1; - break; - - default: - elog(ERROR, "unexpected batch phase %d", - BarrierPhase(batch_barrier)); - } - } - batchno = (batchno + 1) % hashtable->nbatch; - } while (batchno != start_batchno); - return false; } + /* * ExecHashJoinSaveTuple * save a tuple to a batch file. 
@@ -1396,6 +1884,8 @@ ExecParallelHashJoinPartitionOuter(HashJoinState *hjstate) /* Execute outer plan, writing all tuples to shared tuplestores. */ for (;;) { + tupleMetadata metadata; + slot = ExecProcNode(outerState); if (TupIsNull(slot)) break; @@ -1413,8 +1903,11 @@ ExecParallelHashJoinPartitionOuter(HashJoinState *hjstate) ExecHashGetBucketAndBatch(hashtable, hashvalue, &bucketno, &batchno); - sts_puttuple(hashtable->batches[batchno].outer_tuples, - &hashvalue, mintup); + metadata.hashvalue = hashvalue; + SharedTuplestoreAccessor *accessor = hashtable->batches[batchno].outer_tuples; + + metadata.tupleid = sts_increment_tuplenum(accessor); + sts_puttuple(accessor, &metadata, mintup); if (shouldFree) heap_free_minimal_tuple(mintup); @@ -1463,6 +1956,7 @@ ExecHashJoinInitializeDSM(HashJoinState *state, ParallelContext *pcxt) * and space_allowed. */ pstate->nbatch = 0; + pstate->batch_increases = 0; pstate->space_allowed = 0; pstate->batches = InvalidDsaPointer; pstate->old_batches = InvalidDsaPointer; @@ -1502,7 +1996,7 @@ ExecHashJoinReInitializeDSM(HashJoinState *state, ParallelContext *cxt) /* * It would be possible to reuse the shared hash table in single-batch * cases by resetting and then fast-forwarding build_barrier to - * PHJ_BUILD_DONE and batch 0's batch_barrier to PHJ_BATCH_PROBING, but + * PHJ_BUILD_DONE and batch 0's batch_barrier to PHJ_BATCH_CHUNKING, but * currently shared hash tables are already freed by now (by the last * participant to detach from the batch). We could consider keeping it * around for single-batch joins. We'd also need to adjust diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index 7169509a79..eeddf0009c 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -3767,6 +3767,9 @@ pgstat_get_wait_ipc(WaitEventIPC w) case WAIT_EVENT_HASH_BATCH_LOADING: event_name = "Hash/Batch/Loading"; break; + case WAIT_EVENT_HASH_BATCH_PROBING: + event_name = "Hash/Batch/Probing"; + break; case WAIT_EVENT_HASH_BUILD_ALLOCATING: event_name = "Hash/Build/Allocating"; break; @@ -3779,6 +3782,9 @@ pgstat_get_wait_ipc(WaitEventIPC w) case WAIT_EVENT_HASH_BUILD_HASHING_OUTER: event_name = "Hash/Build/HashingOuter"; break; + case WAIT_EVENT_HASH_BUILD_CREATE_OUTER_MATCH_STATUS_BITMAP_FILES: + event_name = "Hash/Build/CreateOuterMatchStatusBitmapFiles"; + break; case WAIT_EVENT_HASH_GROW_BATCHES_ALLOCATING: event_name = "Hash/GrowBatches/Allocating"; break; @@ -3803,6 +3809,21 @@ pgstat_get_wait_ipc(WaitEventIPC w) case WAIT_EVENT_HASH_GROW_BUCKETS_REINSERTING: event_name = "Hash/GrowBuckets/Reinserting"; break; + case WAIT_EVENT_HASH_CHUNK_ELECTING: + event_name = "Hash/Chunk/Electing"; + break; + case WAIT_EVENT_HASH_CHUNK_LOADING: + event_name = "Hash/Chunk/Loading"; + break; + case WAIT_EVENT_HASH_CHUNK_PROBING: + event_name = "Hash/Chunk/Probing"; + break; + case WAIT_EVENT_HASH_CHUNK_DONE: + event_name = "Hash/Chunk/Done"; + break; + case WAIT_EVENT_HASH_ADVANCE_CHUNK: + event_name = "Hash/Chunk/Final"; + break; case WAIT_EVENT_LOGICAL_SYNC_DATA: event_name = "LogicalSyncData"; break; diff --git a/src/backend/storage/file/buffile.c b/src/backend/storage/file/buffile.c index 35e8f12e62..cb49329d3f 100644 --- a/src/backend/storage/file/buffile.c +++ b/src/backend/storage/file/buffile.c @@ -269,6 +269,57 @@ BufFileCreateShared(SharedFileSet *fileset, const char *name) return file; } +/* + * Open a shared file created by any backend if it exists, otherwise return NULL + */ +BufFile * 
+BufFileOpenSharedIfExists(SharedFileSet *fileset, const char *name) +{ + BufFile *file; + char segment_name[MAXPGPATH]; + Size capacity = 16; + File *files; + int nfiles = 0; + + files = palloc(sizeof(File) * capacity); + + /* + * We don't know how many segments there are, so we'll probe the + * filesystem to find out. + */ + for (;;) + { + /* See if we need to expand our file segment array. */ + if (nfiles + 1 > capacity) + { + capacity *= 2; + files = repalloc(files, sizeof(File) * capacity); + } + /* Try to load a segment. */ + SharedSegmentName(segment_name, name, nfiles); + files[nfiles] = SharedFileSetOpen(fileset, segment_name); + if (files[nfiles] <= 0) + break; + ++nfiles; + + CHECK_FOR_INTERRUPTS(); + } + + /* + * If we didn't find any files at all, then no BufFile exists with this + * name. + */ + if (nfiles == 0) + return NULL; + file = makeBufFileCommon(nfiles); + file->files = files; + file->readOnly = true; /* Can't write to files opened this way */ + file->fileset = fileset; + file->name = pstrdup(name); + + return file; +} + /* * Open a file that was previously created in another backend (or this one) * with BufFileCreateShared in the same SharedFileSet using the same name. @@ -843,3 +894,17 @@ BufFileAppend(BufFile *target, BufFile *source) return startBlock; } + +BufFile * +BufFileRewindIfExists(BufFile *bufFile) +{ + if (bufFile != NULL) + { + if (BufFileSeek(bufFile, 0, 0L, SEEK_SET)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not rewind hash-join temporary file: %m"))); + return bufFile; + } + return NULL; +} diff --git a/src/backend/storage/ipc/barrier.c b/src/backend/storage/ipc/barrier.c index 3e200e02cc..58455dda1c 100644 --- a/src/backend/storage/ipc/barrier.c +++ b/src/backend/storage/ipc/barrier.c @@ -195,6 +195,91 @@ BarrierArriveAndWait(Barrier *barrier, uint32 wait_event_info) return elected; } +/* + * Arrive at this barrier, wait for all other attached participants to arrive + * too and then return. Sets the current phase to next_phase. The caller must + * be attached. + * + * While waiting, pg_stat_activity shows a wait_event_type and wait_event + * controlled by the wait_event_info passed in, which should be a value from + * one of the WaitEventXXX enums defined in pgstat.h. + * + * Return true in one arbitrarily chosen participant. Return false in all + * others. The return code can be used to elect one participant to execute a + * phase of work that must be done serially while other participants wait. + */ +bool +BarrierArriveExplicitAndWait(Barrier *barrier, int next_phase, uint32 wait_event_info) +{ + bool release = false; + bool elected; + int start_phase; + + SpinLockAcquire(&barrier->mutex); + start_phase = barrier->phase; + ++barrier->arrived; + if (barrier->arrived == barrier->participants) + { + release = true; + barrier->arrived = 0; + barrier->phase = next_phase; + barrier->elected = next_phase; + } + SpinLockRelease(&barrier->mutex); + + /* + * If we were the last expected participant to arrive, we can release our + * peers and return true to indicate that this backend has been elected to + * perform any serial work. + */ + if (release) + { + ConditionVariableBroadcast(&barrier->condition_variable); + + return true; + } + + /* + * Otherwise we have to wait for the last participant to arrive and + * advance the phase. 
+	 */
+	elected = false;
+	ConditionVariablePrepareToSleep(&barrier->condition_variable);
+	for (;;)
+	{
+		/*
+		 * We know that phase must either be start_phase, indicating that we
+		 * need to keep waiting, or next_phase, indicating that the last
+		 * participant that we were waiting for has either arrived or detached
+		 * so that the next phase has begun.  The phase cannot advance any
+		 * further than that without this backend's participation, because
+		 * this backend is attached.
+		 */
+		SpinLockAcquire(&barrier->mutex);
+		Assert(barrier->phase == start_phase || barrier->phase == next_phase);
+		release = barrier->phase == next_phase;
+		if (release && barrier->elected != next_phase)
+		{
+			/*
+			 * Usually the backend that arrives last and releases the other
+			 * backends is elected to return true (see above), so that it can
+			 * begin processing serial work while it has a CPU timeslice.
+			 * However, if the barrier advanced because someone detached, then
+			 * one of the backends that is awoken will need to be elected.
+			 */
+			barrier->elected = barrier->phase;
+			elected = true;
+		}
+		SpinLockRelease(&barrier->mutex);
+		if (release)
+			break;
+		ConditionVariableSleep(&barrier->condition_variable, wait_event_info);
+	}
+	ConditionVariableCancelSleep();
+
+	return elected;
+}
+
 /*
  * Arrive at this barrier, but detach rather than waiting.  Returns true if
  * the caller was the last to detach.
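/*
 * Illustrative sketch only, not part of the patch: how a caller might use
 * BarrierArriveExplicitAndWait() to move a chunk barrier directly to a
 * named phase.  finish_chunk() is a hypothetical function invented for this
 * example; PHJ_CHUNK_FINAL and WAIT_EVENT_HASH_CHUNK_DONE are the constants
 * this patch defines below.  Exactly one attached worker sees true returned
 * and performs the serial step while its peers are released into the new
 * phase.
 */
#include "postgres.h"
#include "executor/hashjoin.h"
#include "pgstat.h"
#include "storage/barrier.h"

static void
finish_chunk(Barrier *chunk_barrier)
{
	if (BarrierArriveExplicitAndWait(chunk_barrier, PHJ_CHUNK_FINAL,
									 WAIT_EVENT_HASH_CHUNK_DONE))
	{
		/* elected worker: do the serial per-chunk cleanup here */
	}
}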
diff --git a/src/backend/utils/sort/sharedtuplestore.c b/src/backend/utils/sort/sharedtuplestore.c
index c3ab494a45..3cd2ec2e2e 100644
--- a/src/backend/utils/sort/sharedtuplestore.c
+++ b/src/backend/utils/sort/sharedtuplestore.c
@@ -60,6 +60,8 @@ typedef struct SharedTuplestoreParticipant
 struct SharedTuplestore
 {
 	int			nparticipants;	/* Number of participants that can write. */
+	pg_atomic_uint32 ntuples;	/* TODO: does this belong elsewhere? */
 	int			flags;			/* Flag bits from SHARED_TUPLESTORE_XXX */
 	size_t		meta_data_size; /* Size of per-tuple header. */
 	char		name[NAMEDATALEN];	/* A name for this tuplestore. */
@@ -92,10 +94,15 @@ struct SharedTuplestoreAccessor
 	BlockNumber write_page;		/* The next page to write to. */
 	char	   *write_pointer;	/* Current write pointer within chunk. */
 	char	   *write_end;		/* One past the end of the current chunk. */
+
+	/* Bitmap of matched outer tuples (currently only used for hashjoin). */
+	BufFile    *outer_match_status_file;
 };
 
 static void sts_filename(char *name, SharedTuplestoreAccessor *accessor, int participant);
+static void sts_bitmap_filename(char *name, SharedTuplestoreAccessor *accessor, int participant);
 
 /*
  * Return the amount of shared memory required to hold SharedTuplestore for a
@@ -137,6 +144,7 @@ sts_initialize(SharedTuplestore *sts, int participants,
 	Assert(my_participant_number < participants);
 
 	sts->nparticipants = participants;
+	pg_atomic_init_u32(&sts->ntuples, 1);
 	sts->meta_data_size = meta_data_size;
 	sts->flags = flags;
@@ -166,6 +174,7 @@ sts_initialize(SharedTuplestore *sts, int participants,
 	accessor->sts = sts;
 	accessor->fileset = fileset;
 	accessor->context = CurrentMemoryContext;
+	accessor->outer_match_status_file = NULL;
 
 	return accessor;
 }
@@ -343,6 +352,7 @@ sts_puttuple(SharedTuplestoreAccessor *accessor, void *meta_data,
 			sts_flush_chunk(accessor);
 		}
 
+		/* TODO: exercise this code with a test (over-sized tuple) */
 		/* It may still not be enough in the case of a gigantic tuple. */
 		if (accessor->write_pointer + size >= accessor->write_end)
 		{
@@ -621,6 +631,129 @@ sts_parallel_scan_next(SharedTuplestoreAccessor *accessor, void *meta_data)
 	return NULL;
 }
 
+/* TODO: fix signedness */
+int
+sts_increment_tuplenum(SharedTuplestoreAccessor *accessor)
+{
+	return pg_atomic_fetch_add_u32(&accessor->sts->ntuples, 1);
+}
+
+void
+sts_make_outer_match_status_file(SharedTuplestoreAccessor *accessor)
+{
+	uint32		tuplenum = pg_atomic_read_u32(&accessor->sts->ntuples);
+
+	/* don't make the outer match status file if there are no tuples */
+	if (tuplenum == 0)
+		return;
+
+	char		name[MAXPGPATH];
+
+	sts_bitmap_filename(name, accessor, accessor->participant);
+
+	accessor->outer_match_status_file = BufFileCreateShared(accessor->fileset, name);
+
+	/* TODO: check this math. tuplenumber will be too high. */
+	uint32		num_to_write = tuplenum / 8 + 1;
+
+	/* zero-fill the bitmap: one bit per tuple, rounded up to whole bytes */
+	unsigned char *zero_bytes = palloc0(num_to_write);
+
+	BufFileWrite(accessor->outer_match_status_file, zero_bytes, num_to_write);
+	pfree(zero_bytes);
+
+	if (BufFileSeek(accessor->outer_match_status_file, 0, 0L, SEEK_SET))
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not rewind hash-join temporary file: %m")));
+}
+
+void
+sts_set_outer_match_status(SharedTuplestoreAccessor *accessor, uint32 tuplenum)
+{
+	BufFile    *parallel_outer_matchstatuses = accessor->outer_match_status_file;
+	unsigned char current_outer_byte;
+
+	BufFileSeek(parallel_outer_matchstatuses, 0, tuplenum / 8, SEEK_SET);
+	BufFileRead(parallel_outer_matchstatuses, &current_outer_byte, 1);
+
+	current_outer_byte |= 1U << (tuplenum % 8);
+
+	if (BufFileSeek(parallel_outer_matchstatuses, 0, -1, SEEK_CUR) != 0)
+		elog(ERROR, "could not reposition outer match status file (pid %d)", MyProcPid);
+	BufFileWrite(parallel_outer_matchstatuses, &current_outer_byte, 1);
+}
+
+void
+sts_close_outer_match_status_file(SharedTuplestoreAccessor *accessor)
+{
+	BufFileClose(accessor->outer_match_status_file);
+}
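/*
 * Illustrative sketch only, not part of the patch: the combine step below
 * merges the per-worker bitmaps with a byte-wise OR, so a tuple counts as
 * matched if any participant matched it.  For example, 0x26 (worker 0)
 * OR 0x41 (worker 1) yields 0x67.  combine_bytes() is a hypothetical helper
 * operating on in-memory bytes; the real code streams the same operation
 * through BufFiles.
 */
#include <stdint.h>

static uint8_t
combine_bytes(const uint8_t *per_worker_bytes, int nworkers)
{
	uint8_t		combined = 0;

	for (int i = 0; i < nworkers; i++)
		combined |= per_worker_bytes[i];	/* any worker's match survives */
	return combined;
}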
+
+BufFile *
+sts_combine_outer_match_status_files(SharedTuplestoreAccessor *accessor)
+{
+	/*
+	 * TODO: this tries to close an outer match status file for each
+	 * participant in the tuplestore.  Technically, only participants in the
+	 * barrier could have outer match status files; however, all but one
+	 * participant continue on and detach from the barrier, so we won't have
+	 * a reliable way to close only the files of those attached to the
+	 * barrier.
+	 */
+	BufFile   **statuses = palloc(sizeof(BufFile *) * accessor->sts->nparticipants);
+
+	/*
+	 * Open the bitmap shared BufFile from each participant.  TODO: explain
+	 * why some of these files can be NULL.
+	 */
+	int			statuses_length = 0;
+
+	for (int i = 0; i < accessor->sts->nparticipants; i++)
+	{
+		char		bitmap_filename[MAXPGPATH];
+
+		sts_bitmap_filename(bitmap_filename, accessor, i);
+		BufFile    *file = BufFileOpenSharedIfExists(accessor->fileset, bitmap_filename);
+
+		if (file != NULL)
+			statuses[statuses_length++] = file;
+	}
+
+	BufFile    *combined_bitmap_file = BufFileCreateTemp(false);
+
+	/* TODO: make this a while-not-EOF loop */
+	for (int64 cur = 0; cur < BufFileSize(statuses[0]); cur++)
+	{
+		unsigned char combined_byte = 0;
+
+		for (int i = 0; i < statuses_length; i++)
+		{
+			unsigned char read_byte;
+
+			BufFileRead(statuses[i], &read_byte, 1);
+			combined_byte |= read_byte;
+		}
+
+		BufFileWrite(combined_bitmap_file, &combined_byte, 1);
+	}
+
+	if (BufFileSeek(combined_bitmap_file, 0, 0L, SEEK_SET))
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not rewind hash-join temporary file: %m")));
+
+	for (int i = 0; i < statuses_length; i++)
+		BufFileClose(statuses[i]);
+	pfree(statuses);
+
+	return combined_bitmap_file;
+}
+
+
+static void
+sts_bitmap_filename(char *name, SharedTuplestoreAccessor *accessor, int participant)
+{
+	snprintf(name, MAXPGPATH, "%s.p%d.bitmap", accessor->sts->name, participant);
+}
+
 /*
  * Create the name used for the BufFile that a given participant will write.
  */
diff --git a/src/include/executor/adaptiveHashjoin.h b/src/include/executor/adaptiveHashjoin.h
new file mode 100644
index 0000000000..030a04c5c0
--- /dev/null
+++ b/src/include/executor/adaptiveHashjoin.h
@@ -0,0 +1,9 @@
+#ifndef ADAPTIVE_HASHJOIN_H
+#define ADAPTIVE_HASHJOIN_H
+
+
+extern bool ExecParallelHashJoinNewChunk(HashJoinState *hjstate, bool advance_from_probing);
+extern bool ExecParallelHashJoinNewBatch(HashJoinState *hjstate);
+
+
+#endif							/* ADAPTIVE_HASHJOIN_H */
diff --git a/src/include/executor/hashjoin.h b/src/include/executor/hashjoin.h
index 79b634e8ed..3e4f4bd574 100644
--- a/src/include/executor/hashjoin.h
+++ b/src/include/executor/hashjoin.h
@@ -148,11 +148,27 @@ typedef struct HashMemoryChunkData *HashMemoryChunk;
 * followed by variable-sized objects, they are arranged in contiguous memory
 * but not accessed directly as an array.
 */
+/*
+ * TODO: maybe remove lock from ParallelHashJoinBatch and use pstate->lock
+ * and the PHJBatchAccessor to coordinate access to the PHJ batch, similar
+ * to other users of that lock.
+ */
 typedef struct ParallelHashJoinBatch
 {
 	dsa_pointer buckets;		/* array of hash table buckets */
 	Barrier		batch_barrier;	/* synchronization for joining this batch */
 
+	/* Parallel Adaptive Hash Join members */
+
+	/*
+	 * after finishing the build phase, parallel_hashloop_fallback cannot
+	 * change, and does not require a lock to read
+	 */
+	bool		parallel_hashloop_fallback;
+	int			total_num_chunks;
+	int			current_chunk_num;
+	size_t		estimated_chunk_size;
+	Barrier		chunk_barrier;
+	LWLock		lock;
+
 	dsa_pointer chunks;			/* chunks of tuples loaded */
 	size_t		size;			/* size of buckets + chunks in memory */
 	size_t		estimated_size; /* size of buckets + chunks while writing */
@@ -243,6 +259,8 @@ typedef struct ParallelHashJoinState
 	int			nparticipants;
 	size_t		space_allowed;
 	size_t		total_tuples;	/* total number of inner tuples */
+	int			batch_increases;	/* TODO: make this an atomic so I don't
+									 * need the lock to increment it?
*/ LWLock lock; /* lock protecting the above */ Barrier build_barrier; /* synchronization for the build phases */ @@ -263,10 +281,16 @@ typedef struct ParallelHashJoinState /* The phases for probing each batch, used by for batch_barrier. */ #define PHJ_BATCH_ELECTING 0 #define PHJ_BATCH_ALLOCATING 1 -#define PHJ_BATCH_LOADING 2 -#define PHJ_BATCH_PROBING 3 +#define PHJ_BATCH_CHUNKING 2 +#define PHJ_BATCH_OUTER_MATCH_STATUS_PROCESSING 3 #define PHJ_BATCH_DONE 4 +#define PHJ_CHUNK_ELECTING 0 +#define PHJ_CHUNK_LOADING 1 +#define PHJ_CHUNK_PROBING 2 +#define PHJ_CHUNK_DONE 3 +#define PHJ_CHUNK_FINAL 4 + /* The phases of batch growth while hashing, for grow_batches_barrier. */ #define PHJ_GROW_BATCHES_ELECTING 0 #define PHJ_GROW_BATCHES_ALLOCATING 1 diff --git a/src/include/executor/nodeHash.h b/src/include/executor/nodeHash.h index 1336fde6b4..dfc221e6a1 100644 --- a/src/include/executor/nodeHash.h +++ b/src/include/executor/nodeHash.h @@ -40,9 +40,8 @@ extern void ExecHashTableInsert(HashJoinTable hashtable, extern void ExecParallelHashTableInsert(HashJoinTable hashtable, TupleTableSlot *slot, uint32 hashvalue); -extern void ExecParallelHashTableInsertCurrentBatch(HashJoinTable hashtable, - TupleTableSlot *slot, - uint32 hashvalue); +extern void + ExecParallelHashTableInsertCurrentBatch(HashJoinTable hashtable, TupleTableSlot *slot, uint32 hashvalue); extern bool ExecHashGetHashValue(HashJoinTable hashtable, ExprContext *econtext, List *hashkeys, diff --git a/src/include/executor/tuptable.h b/src/include/executor/tuptable.h index f7df70b5ab..9497b10972 100644 --- a/src/include/executor/tuptable.h +++ b/src/include/executor/tuptable.h @@ -129,6 +129,7 @@ typedef struct TupleTableSlot MemoryContext tts_mcxt; /* slot itself is in this context */ ItemPointerData tts_tid; /* stored tuple's tid */ Oid tts_tableOid; /* table oid of tuple */ + uint32 tuplenum; } TupleTableSlot; /* routines for a TupleTableSlot implementation */ @@ -425,7 +426,7 @@ static inline TupleTableSlot * ExecClearTuple(TupleTableSlot *slot) { slot->tts_ops->clear(slot); - + slot->tuplenum = 0; return slot; } diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 5d5b38b879..93fe6dddb2 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -14,6 +14,7 @@ #ifndef EXECNODES_H #define EXECNODES_H +#include #include "access/tupconvert.h" #include "executor/instrument.h" #include "fmgr.h" @@ -1952,6 +1953,22 @@ typedef struct HashJoinState int hj_JoinState; bool hj_MatchedOuter; bool hj_OuterNotEmpty; + + /* hashloop fallback */ + bool hashloop_fallback; + /* hashloop fallback inner side */ + bool hj_InnerFirstChunk; + bool hj_InnerExhausted; + off_t hj_InnerPageOffset; + + /* hashloop fallback outer side */ + unsigned char hj_OuterCurrentByte; + BufFile *hj_OuterMatchStatusesFile; /* serial AHJ */ + int64 hj_OuterTupleCount; + + /* parallel hashloop fallback outer side */ + bool last_worker; + BufFile *combined_bitmap; } HashJoinState; diff --git a/src/include/pgstat.h b/src/include/pgstat.h index aecb6013f0..340086a7e7 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -815,6 +815,7 @@ typedef enum * it is waiting for a notification from another process. * ---------- */ +/* TODO: add WAIT_EVENT_HASH_BUILD_CREATE_OUTER_MATCH_STATUS_BITMAP_FILES? 
 */
 typedef enum
 {
 	WAIT_EVENT_BGWORKER_SHUTDOWN = PG_WAIT_IPC,
@@ -827,10 +828,12 @@ typedef enum
 	WAIT_EVENT_HASH_BATCH_ALLOCATING,
 	WAIT_EVENT_HASH_BATCH_ELECTING,
 	WAIT_EVENT_HASH_BATCH_LOADING,
+	WAIT_EVENT_HASH_BATCH_PROBING,
 	WAIT_EVENT_HASH_BUILD_ALLOCATING,
 	WAIT_EVENT_HASH_BUILD_ELECTING,
 	WAIT_EVENT_HASH_BUILD_HASHING_INNER,
 	WAIT_EVENT_HASH_BUILD_HASHING_OUTER,
+	WAIT_EVENT_HASH_BUILD_CREATE_OUTER_MATCH_STATUS_BITMAP_FILES,
 	WAIT_EVENT_HASH_GROW_BATCHES_ALLOCATING,
 	WAIT_EVENT_HASH_GROW_BATCHES_DECIDING,
 	WAIT_EVENT_HASH_GROW_BATCHES_ELECTING,
@@ -839,6 +842,11 @@ typedef enum
 	WAIT_EVENT_HASH_GROW_BUCKETS_ALLOCATING,
 	WAIT_EVENT_HASH_GROW_BUCKETS_ELECTING,
 	WAIT_EVENT_HASH_GROW_BUCKETS_REINSERTING,
+	WAIT_EVENT_HASH_CHUNK_ELECTING,
+	WAIT_EVENT_HASH_CHUNK_LOADING,
+	WAIT_EVENT_HASH_CHUNK_PROBING,
+	WAIT_EVENT_HASH_CHUNK_DONE,
+	WAIT_EVENT_HASH_ADVANCE_CHUNK,
 	WAIT_EVENT_LOGICAL_SYNC_DATA,
 	WAIT_EVENT_LOGICAL_SYNC_STATE_CHANGE,
 	WAIT_EVENT_MQ_INTERNAL,
diff --git a/src/include/storage/barrier.h b/src/include/storage/barrier.h
index d71927cc2f..a3c867024c 100644
--- a/src/include/storage/barrier.h
+++ b/src/include/storage/barrier.h
@@ -36,6 +36,7 @@ typedef struct Barrier
 
 extern void BarrierInit(Barrier *barrier, int num_workers);
 extern bool BarrierArriveAndWait(Barrier *barrier, uint32 wait_event_info);
+extern bool BarrierArriveExplicitAndWait(Barrier *barrier, int next_phase, uint32 wait_event_info);
 extern bool BarrierArriveAndDetach(Barrier *barrier);
 extern int	BarrierAttach(Barrier *barrier);
 extern bool BarrierDetach(Barrier *barrier);
diff --git a/src/include/storage/buffile.h b/src/include/storage/buffile.h
index 60433f35b4..f790f7e121 100644
--- a/src/include/storage/buffile.h
+++ b/src/include/storage/buffile.h
@@ -48,7 +48,10 @@ extern long BufFileAppend(BufFile *target, BufFile *source);
 
 extern BufFile *BufFileCreateShared(SharedFileSet *fileset, const char *name);
 extern void BufFileExportShared(BufFile *file);
+extern BufFile *BufFileOpenSharedIfExists(SharedFileSet *fileset, const char *name);
 extern BufFile *BufFileOpenShared(SharedFileSet *fileset, const char *name);
 extern void BufFileDeleteShared(SharedFileSet *fileset, const char *name);
 
+extern BufFile *BufFileRewindIfExists(BufFile *bufFile);
+
 #endif							/* BUFFILE_H */
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index 8fda8e4f78..793f660eb4 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -212,6 +212,7 @@ typedef enum BuiltinTrancheIds
 	LWTRANCHE_LOCK_MANAGER,
 	LWTRANCHE_PREDICATE_LOCK_MANAGER,
 	LWTRANCHE_PARALLEL_HASH_JOIN,
+	LWTRANCHE_PARALLEL_HASH_JOIN_BATCH,
 	LWTRANCHE_PARALLEL_QUERY_DSA,
 	LWTRANCHE_SESSION_DSA,
 	LWTRANCHE_SESSION_RECORD_TABLE,
diff --git a/src/include/utils/sharedtuplestore.h b/src/include/utils/sharedtuplestore.h
index 9754504cc5..6152ac163d 100644
--- a/src/include/utils/sharedtuplestore.h
+++ b/src/include/utils/sharedtuplestore.h
@@ -22,6 +22,19 @@ typedef struct SharedTuplestore SharedTuplestore;
 struct SharedTuplestoreAccessor;
 typedef struct SharedTuplestoreAccessor SharedTuplestoreAccessor;
 
+struct tupleMetadata;
+typedef struct tupleMetadata tupleMetadata;
+
+/* TODO: conflicting types for tupleid with accessor->sts->ntuples (uint32) */
+/* TODO: use a union for tupleid (uint32) (make this a uint64) and chunk number (int) */
+struct tupleMetadata
+{
+	uint32		hashvalue;
+	int			tupleid;		/* tuple id on outer side and chunk number
+								 * for inner side */
+}			__attribute__((packed));
+
+/* TODO: make sure I can get rid of packed now that using sizeof(struct) */
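/*
 * Sketch only, not part of the patch: the layout the TODOs above
 * contemplate.  The second field means "tuple id" when the tuplestore holds
 * outer tuples and "chunk number" when it holds inner tuples, so a union
 * names both readings; the uint64 width is the TODO's own suggestion and an
 * assumption here.
 */
struct tupleMetadata
{
	uint32		hashvalue;
	union
	{
		uint64		tupleid;	/* outer side */
		int			chunknum;	/* inner side */
	}			u;
};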
 
 /*
  * A flag indicating that the tuplestore will only be scanned once, so backing
@@ -58,4 +71,13 @@ extern void sts_puttuple(SharedTuplestoreAccessor *accessor,
 
 extern MinimalTuple sts_parallel_scan_next(SharedTuplestoreAccessor *accessor,
 										   void *meta_data);
+
+extern int	sts_increment_tuplenum(SharedTuplestoreAccessor *accessor);
+
+extern void sts_make_outer_match_status_file(SharedTuplestoreAccessor *accessor);
+extern void sts_set_outer_match_status(SharedTuplestoreAccessor *accessor, uint32 tuplenum);
+extern void sts_close_outer_match_status_file(SharedTuplestoreAccessor *accessor);
+extern BufFile *sts_combine_outer_match_status_files(SharedTuplestoreAccessor *accessor);
+
+
 #endif							/* SHAREDTUPLESTORE_H */
diff --git a/src/test/regress/expected/adaptive_hj.out b/src/test/regress/expected/adaptive_hj.out
new file mode 100644
index 0000000000..fe24acd255
--- /dev/null
+++ b/src/test/regress/expected/adaptive_hj.out
@@ -0,0 +1,1233 @@
+-- TODO: remove some of these tests and make the test file faster
+create schema adaptive_hj;
+set search_path=adaptive_hj;
+drop table if exists t1;
+NOTICE:  table "t1" does not exist, skipping
+drop table if exists t2;
+NOTICE:  table "t2" does not exist, skipping
+create table t1(a int);
+create table t2(b int);
+-- serial setup
+set work_mem=64;
+set enable_mergejoin to off;
+-- TODO: make this function general
+create or replace function explain_multi_batch() returns setof text language plpgsql as
+$$
+declare ln text;
+begin
+    for ln in
+        explain (analyze, summary off, timing off, costs off)
+        select count(*) from t1 left outer join t2 on a = b
+    loop
+        ln := regexp_replace(ln, 'Memory Usage: \S*', 'Memory Usage: xxx');
+        return next ln;
+    end loop;
+end;
+$$;
+-- Serial_Test_1 reset
+-- TODO: refactor into procedure or change to drop table
+update pg_class set reltuples = 0, relpages = 0 where relname = 't2';
+update pg_class set reltuples = 0, relpages = 0 where relname = 't1';
+delete from pg_statistic where starelid = 't2'::regclass;
+delete from pg_statistic where starelid = 't1'::regclass;
+-- Serial_Test_1 setup
+truncate table t1;
+insert into t1 values(1),(2);
+insert into t1 select i from generate_series(1,10)i;
+insert into t1 select 2 from generate_series(1,5)i;
+truncate table t2;
+insert into t2 values(2),(3),(11);
+insert into t2 select i from generate_series(2,10)i;
+insert into t2 select 2 from generate_series(2,7)i;
+-- Serial_Test_1.1
+-- TODO: automate the checking for expected number of chunks (explain option?)
+-- spills in 4 batches +-- batch 1 falls back with 2 chunks with no unmatched tuples +-- batch 2 falls back with 2 chunks with 2 unmatched tuples emitted at EOB +-- batch 3 falls back with 5 chunks with no unmatched tuples +-- batch 4 does not fall back with no unmatched tuples +select * from explain_multi_batch(); + explain_multi_batch +------------------------------------------------------------ + Aggregate (actual rows=1 loops=1) + -> Hash Left Join (actual rows=67 loops=1) + Hash Cond: (t1.a = t2.b) + -> Seq Scan on t1 (actual rows=17 loops=1) + -> Hash (actual rows=18 loops=1) + Buckets: 2048 Batches: 4 Memory Usage: xxx + -> Seq Scan on t2 (actual rows=18 loops=1) +(7 rows) + +select * from t1 left outer join t2 on a = b order by b, a; + a | b +----+---- + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 3 | 3 + 3 | 3 + 4 | 4 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 + 10 | 10 + 1 | + 1 | +(67 rows) + +select * from t1, t2 where a = b order by b; + a | b +----+---- + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 3 | 3 + 3 | 3 + 4 | 4 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 + 10 | 10 +(65 rows) + +select * from t1 right outer join t2 on a = b order by a, b; + a | b +----+---- + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 3 | 3 + 3 | 3 + 4 | 4 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 + 10 | 10 + | 11 +(66 rows) + +select * from t1 full outer join t2 on a = b order by b, a; + a | b +----+---- + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 3 | 3 + 3 | 3 + 4 | 4 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 + 10 | 10 + | 11 + 1 | + 1 | +(68 rows) + +-- Serial_Test_1.2 setup +analyze t1; analyze t2; +-- Serial_Test_1.2 +-- doesn't spill (happens to do a hash right join) +select * from explain_multi_batch(); + explain_multi_batch +------------------------------------------------------------ + Aggregate (actual rows=1 loops=1) + -> Hash Right Join (actual rows=67 loops=1) + Hash Cond: (t2.b = t1.a) + -> Seq Scan on t2 
(actual rows=18 loops=1) + -> Hash (actual rows=17 loops=1) + Buckets: 1024 Batches: 1 Memory Usage: xxx + -> Seq Scan on t1 (actual rows=17 loops=1) +(7 rows) + +-- Serial_Test_2 reset +update pg_class set reltuples = 0, relpages = 0 where relname = 't2'; +update pg_class set reltuples = 0, relpages = 0 where relname = 't1'; +delete from pg_statistic where starelid = 't2'::regclass; +delete from pg_statistic where starelid = 't1'::regclass; +-- Serial_Test_2 setup: +truncate table t1; +insert into t1 values (1),(2),(2),(3); +truncate table t2; +insert into t2 values(2),(2),(3),(3),(4); +-- Serial_Test_2.1 +-- spills in 4 batches +-- batch 1 falls back with 2 chunks with no unmatched tuples +-- batch 2 does not fall back with 1 unmatched tuple +-- batch 3 does not fall back with no unmatched tuples +-- batch 4 does not fall back with no unmatched tuples +select * from explain_multi_batch(); + explain_multi_batch +------------------------------------------------------------ + Aggregate (actual rows=1 loops=1) + -> Hash Left Join (actual rows=7 loops=1) + Hash Cond: (t1.a = t2.b) + -> Seq Scan on t1 (actual rows=4 loops=1) + -> Hash (actual rows=5 loops=1) + Buckets: 2048 Batches: 4 Memory Usage: xxx + -> Seq Scan on t2 (actual rows=5 loops=1) +(7 rows) + +select * from t1 left outer join t2 on a = b order by b, a; + a | b +---+--- + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 3 | 3 + 3 | 3 + 1 | +(7 rows) + +select * from t1 right outer join t2 on a = b order by a, b; + a | b +---+--- + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 3 | 3 + 3 | 3 + | 4 +(7 rows) + +-- TODO: check coverage for emitting ummatched inner tuples +-- Serial_Test_2.1.a +-- results checking for inner join +select * from t1 left outer join t2 on a = b order by b, a; + a | b +---+--- + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 3 | 3 + 3 | 3 + 1 | +(7 rows) + +select * from t1, t2 where a = b order by b; + a | b +---+--- + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 3 | 3 + 3 | 3 +(6 rows) + +select * from t1 right outer join t2 on a = b order by a, b; + a | b +---+--- + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 3 | 3 + 3 | 3 + | 4 +(7 rows) + +select * from t1 full outer join t2 on a = b order by b, a; + a | b +---+--- + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 3 | 3 + 3 | 3 + | 4 + 1 | +(8 rows) + +select * from t1, t2 where a = b order by b; + a | b +---+--- + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 3 | 3 + 3 | 3 +(6 rows) + +-- Serial_Test_2.2 +analyze t1; analyze t2; +-- doesn't spill (happens to do a hash right join) +select * from explain_multi_batch(); + explain_multi_batch +------------------------------------------------------------ + Aggregate (actual rows=1 loops=1) + -> Hash Right Join (actual rows=7 loops=1) + Hash Cond: (t2.b = t1.a) + -> Seq Scan on t2 (actual rows=5 loops=1) + -> Hash (actual rows=4 loops=1) + Buckets: 1024 Batches: 1 Memory Usage: xxx + -> Seq Scan on t1 (actual rows=4 loops=1) +(7 rows) + +-- Serial_Test_3 reset +update pg_class set reltuples = 0, relpages = 0 where relname = 't2'; +update pg_class set reltuples = 0, relpages = 0 where relname = 't1'; +delete from pg_statistic where starelid = 't2'::regclass; +delete from pg_statistic where starelid = 't1'::regclass; +-- Serial_Test_3 setup: +truncate table t1; +insert into t1 values(1),(1); +insert into t1 select 2 from generate_series(1,7)i; +insert into t1 select i from generate_series(3,10)i; +truncate table t2; +insert into t2 select 2 from generate_series(1,7)i; +insert into t2 values(3),(3); +insert into t2 select i from generate_series(5,9)i; +-- Serial_Test_3.1 +-- spills in 4 batches +-- batch 1 
falls back with 2 chunks with 1 unmatched tuple +-- batch 2 does not fall back with 2 unmatched tuples +-- batch 3 falls back with 4 chunks with 1 unmatched tuple +-- batch 4 does not fall back with no unmatched tuples +select * from explain_multi_batch(); + explain_multi_batch +------------------------------------------------------------ + Aggregate (actual rows=1 loops=1) + -> Hash Left Join (actual rows=60 loops=1) + Hash Cond: (t1.a = t2.b) + -> Seq Scan on t1 (actual rows=17 loops=1) + -> Hash (actual rows=14 loops=1) + Buckets: 2048 Batches: 4 Memory Usage: xxx + -> Seq Scan on t2 (actual rows=14 loops=1) +(7 rows) + +select * from t1 left outer join t2 on a = b order by b, a; + a | b +----+--- + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 3 | 3 + 3 | 3 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 + 1 | + 1 | + 4 | + 10 | +(60 rows) + +select * from t1, t2 where a = b order by b; + a | b +---+--- + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 3 | 3 + 3 | 3 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 +(56 rows) + +select * from t1 right outer join t2 on a = b order by a, b; + a | b +---+--- + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 3 | 3 + 3 | 3 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 +(56 rows) + +select * from t1 full outer join t2 on a = b order by b, a; + a | b +----+--- + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 3 | 3 + 3 | 3 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 + 1 | + 1 | + 4 | + 10 | +(60 rows) + +select * from t1, t2 where a = b order by b; + a | b +---+--- + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 3 | 3 + 3 | 3 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 +(56 rows) + +-- Serial_Test_3.2 +-- swap join order +select * from t2 left outer join t1 on a = b order by a, b; + b | a +---+--- + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 
+ 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 3 | 3 + 3 | 3 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 +(56 rows) + +select * from t2, t1 where a = b order by a; + b | a +---+--- + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 3 | 3 + 3 | 3 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 +(56 rows) + +select * from t2 right outer join t1 on a = b order by b, a; + b | a +---+---- + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 3 | 3 + 3 | 3 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 + | 1 + | 1 + | 4 + | 10 +(60 rows) + +select * from t2 full outer join t1 on a = b order by a, b; + b | a +---+---- + | 1 + | 1 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 2 | 2 + 3 | 3 + 3 | 3 + | 4 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 + | 10 +(60 rows) + +-- Serial_Test_3.3 setup +analyze t1; analyze t2; +-- Serial_Test_3.3 +-- doesn't spill +select * from explain_multi_batch(); + explain_multi_batch +------------------------------------------------------------ + Aggregate (actual rows=1 loops=1) + -> Hash Left Join (actual rows=60 loops=1) + Hash Cond: (t1.a = t2.b) + -> Seq Scan on t1 (actual rows=17 loops=1) + -> Hash (actual rows=14 loops=1) + Buckets: 1024 Batches: 1 Memory Usage: xxx + -> Seq Scan on t2 (actual rows=14 loops=1) +(7 rows) + +-- Serial_Test_4 setup +drop table t1; +create table t1(b int); +insert into t1 select i from generate_series(1,111)i; +insert into t1 select 2 from generate_series(1,180)i; +analyze t1; +drop table t2; +create table t2(a int); +insert into t2 select i from generate_series(20,25000)i; +insert into t2 select 2 from generate_series(1,100)i; +analyze t2; +update pg_class + set reltuples = 10, relpages = pg_relation_size('t2') / 8192 + where relname = 't2'; +-- Serial_Test_4.1 +-- spills in 32 batches +--batch 0 does not fall back with 1 unmatched outer tuple (15) +--batch 1 falls back with 396 chunks. 
+--batch 2 falls back with 402 chunks with 1 unmatched outer tuple (1)
+--batch 3 falls back with 389 chunks with 1 unmatched outer tuple (8)
+--batch 4 falls back with 409 chunks with no unmatched outer tuples
+--batch 5 falls back with 366 chunks with 1 unmatched outer tuple (4)
+--batch 6 falls back with 407 chunks with 1 unmatched outer tuple (11)
+--batch 7 falls back with 382 chunks with 1 unmatched outer tuple (10)
+--batch 8 falls back with 413 chunks with no unmatched outer tuples
+--batch 9 falls back with 371 chunks with 1 unmatched outer tuple (3)
+--batch 10 falls back with 389 chunks with no unmatched outer tuples
+--batch 11 falls back with 408 chunks with no unmatched outer tuples
+--batch 12 falls back with 387 chunks with no unmatched outer tuples
+--batch 13 falls back with 402 chunks with 1 unmatched outer tuple (18)
+--batch 14 falls back with 369 chunks with 1 unmatched outer tuple (9)
+--batch 15 falls back with 387 chunks with no unmatched outer tuples
+--batch 16 falls back with 365 chunks with no unmatched outer tuples
+--batch 17 falls back with 403 chunks with 2 unmatched outer tuples (14,19)
+--batch 18 falls back with 375 chunks with no unmatched outer tuples
+--batch 19 falls back with 384 chunks with no unmatched outer tuples
+--batch 20 falls back with 377 chunks with 1 unmatched outer tuple (12)
+--batch 22 falls back with 401 chunks with no unmatched outer tuples
+--batch 23 falls back with 396 chunks with no unmatched outer tuples
+--batch 24 falls back with 387 chunks with 1 unmatched outer tuple (5)
+--batch 25 falls back with 399 chunks with 1 unmatched outer tuple (7)
+--batch 26 falls back with 387 chunks.
+--batch 27 falls back with 442 chunks.
+--batch 28 falls back with 385 chunks with 1 unmatched outer tuple (17)
+--batch 29 falls back with 375 chunks.
+--batch 30 falls back with 404 chunks with 1 unmatched outer tuple (6) +--batch 31 falls back with 396 chunks with 2 unmatched outer tuples (13,16) +select * from explain_multi_batch(); + explain_multi_batch +---------------------------------------------------------------------------------------------- + Aggregate (actual rows=1 loops=1) + -> Hash Left Join (actual rows=18210 loops=1) + Hash Cond: (t1.b = t2.a) + -> Seq Scan on t1 (actual rows=291 loops=1) + -> Hash (actual rows=25081 loops=1) + Buckets: 2048 (originally 1024) Batches: 32 (originally 1) Memory Usage: xxx + -> Seq Scan on t2 (actual rows=25081 loops=1) +(7 rows) + +select count(*) from t1 left outer join t2 on a = b; + count +------- + 18210 +(1 row) + +select count(a) from t1 left outer join t2 on a = b; + count +------- + 18192 +(1 row) + +select count(*) from t1, t2 where a = b; + count +------- + 18192 +(1 row) + +-- used to give wrong results because there is a whole batch of outer which is +-- empty and so the inner doesn't emit unmatched tuples with ROJ +select count(*) from t1 right outer join t2 on a = b; + count +------- + 43081 +(1 row) + +select count(*) from t1 full outer join t2 on a = b; + count +------- + 43099 +(1 row) + +-- Test_6 non-negligible amount of data test case +-- TODO: doesn't finish with my code when it is set to be serial +-- it does finish when it is parallel -- the serial version is either simply too +-- slow or has a bug -- I tried it with less data and it did finish, so it must +-- just be really slow +-- inner join shouldn't even need to make the unmatched files +-- it finishes eventually if I decrease data amount +--drop table simple; +--create table simple as + -- select generate_series(1, 20000) AS id, 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'; +--alter table simple set (parallel_workers = 2); +--analyze simple; +-- +--drop table extremely_skewed; +--create table extremely_skewed (id int, t text); +--alter table extremely_skewed set (autovacuum_enabled = 'false'); +--alter table extremely_skewed set (parallel_workers = 2); +--analyze extremely_skewed; +--insert into extremely_skewed +-- select 42 as id, 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa' +-- from generate_series(1, 20000); +--update pg_class +-- set reltuples = 2, relpages = pg_relation_size('extremely_skewed') / 8192 +-- where relname = 'extremely_skewed'; +--set work_mem=64; +--set enable_mergejoin to off; +--explain (analyze, costs off, timing off) + --select * from simple r join extremely_skewed s using (id); +--select * from explain_multi_batch(); +drop table t1; +drop table t2; +drop function explain_multi_batch(); +reset enable_mergejoin; +reset work_mem; +reset search_path; +drop schema adaptive_hj; diff --git a/src/test/regress/expected/parallel_adaptive_hj.out b/src/test/regress/expected/parallel_adaptive_hj.out new file mode 100644 index 0000000000..e5e7f9aa4f --- /dev/null +++ b/src/test/regress/expected/parallel_adaptive_hj.out @@ -0,0 +1,343 @@ +create schema parallel_adaptive_hj; +set search_path=parallel_adaptive_hj; +-- TODO: anti-semi-join and semi-join tests +-- TODO: check if test2 and 3 are different at all +-- TODO: add test for parallel-oblivious parallel hash join +-- TODO: make this function general +create or replace function explain_parallel_multi_batch() returns setof text language plpgsql as +$$ +declare ln text; +begin + for ln in + explain (analyze, summary off, timing off, costs off) + select count(*) from t1 left outer join t2 on a = b + loop + ln := regexp_replace(ln, 'Memory Usage: \S*', 'Memory Usage: 
xxx'); + return next ln; + end loop; +end; +$$; +-- parallel setup +set enable_nestloop to off; +set enable_mergejoin to off; +set min_parallel_table_scan_size = 0; +set parallel_setup_cost = 0; +set enable_parallel_hash = on; +set enable_hashjoin = on; +set max_parallel_workers_per_gather = 1; +set work_mem = 64; +-- Parallel_Test_1 setup +drop table if exists t1; +NOTICE: table "t1" does not exist, skipping +create table t1(a int); +insert into t1 select i from generate_series(1,11)i; +insert into t1 select 2 from generate_series(1,18)i; +analyze t1; +drop table if exists t2; +NOTICE: table "t2" does not exist, skipping +create table t2(b int); +insert into t2 select i from generate_series(4,2500)i; +insert into t2 select 2 from generate_series(1,10)i; +analyze t2; +alter table t2 set (autovacuum_enabled = 'false'); +update pg_class + set reltuples = 10, relpages = pg_relation_size('t2') / 8192 + where relname = 't2'; +-- Parallel_Test_1.1 +-- spills in 4 batches +-- 1 resize of nbatches +-- no batch falls back +select * from explain_parallel_multi_batch(); + explain_parallel_multi_batch +--------------------------------------------------------------------------------------------------------- + Finalize Aggregate (actual rows=1 loops=1) + -> Gather (actual rows=2 loops=1) + Workers Planned: 1 + Workers Launched: 1 + -> Partial Aggregate (actual rows=1 loops=2) + -> Parallel Hash Left Join (actual rows=100 loops=2) + Hash Cond: (t1.a = t2.b) + -> Parallel Seq Scan on t1 (actual rows=29 loops=1) + -> Parallel Hash (actual rows=1254 loops=2) + Buckets: 1024 (originally 1024) Batches: 4 (originally 1) Memory Usage: xxx + -> Parallel Seq Scan on t2 (actual rows=2507 loops=1) +(11 rows) + +-- need an aggregate to exercise the code but still want to know if we are +-- emitting the right unmatched outer tuples +select count(a) from t1 left outer join t2 on a = b; + count +------- + 200 +(1 row) + +select count(*) from t1 left outer join t2 on a = b; + count +------- + 200 +(1 row) + +-- Parallel_Test_1.1.a +-- results checking for inner join +-- doesn't fall back +select count(*) from t1, t2 where a = b; + count +------- + 198 +(1 row) + +-- Parallel_Test_1.1.b +-- results checking for right outer join +-- doesn't exercise the fallback code but just checking results +select count(*) from t1 right outer join t2 on a = b; + count +------- + 2687 +(1 row) + +-- Parallel_Test_1.1.c +-- results checking for full outer join +select count(*) from t1 full outer join t2 on a = b; + count +------- + 2689 +(1 row) + +-- Parallel_Test_1.2 +-- spill and doesn't have to resize nbatches +analyze t2; +select * from explain_parallel_multi_batch(); + explain_parallel_multi_batch +---------------------------------------------------------------------------------- + Finalize Aggregate (actual rows=1 loops=1) + -> Gather (actual rows=2 loops=1) + Workers Planned: 1 + Workers Launched: 1 + -> Partial Aggregate (actual rows=1 loops=2) + -> Parallel Hash Left Join (actual rows=100 loops=2) + Hash Cond: (t1.a = t2.b) + -> Parallel Seq Scan on t1 (actual rows=29 loops=1) + -> Parallel Hash (actual rows=1254 loops=2) + Buckets: 2048 Batches: 4 Memory Usage: xxx + -> Parallel Seq Scan on t2 (actual rows=2507 loops=1) +(11 rows) + +select count(a) from t1 left outer join t2 on a = b; + count +------- + 200 +(1 row) + +-- Parallel_Test_1.3 +-- doesn't spill +-- does resize nbuckets +set work_mem = '4MB'; +select * from explain_parallel_multi_batch(); + explain_parallel_multi_batch 
+---------------------------------------------------------------------------------- + Finalize Aggregate (actual rows=1 loops=1) + -> Gather (actual rows=2 loops=1) + Workers Planned: 1 + Workers Launched: 1 + -> Partial Aggregate (actual rows=1 loops=2) + -> Parallel Hash Left Join (actual rows=100 loops=2) + Hash Cond: (t1.a = t2.b) + -> Parallel Seq Scan on t1 (actual rows=29 loops=1) + -> Parallel Hash (actual rows=1254 loops=2) + Buckets: 4096 Batches: 1 Memory Usage: xxx + -> Parallel Seq Scan on t2 (actual rows=2507 loops=1) +(11 rows) + +select count(a) from t1 left outer join t2 on a = b; + count +------- + 200 +(1 row) + +set work_mem = 64; +-- Parallel_Test_3 +-- big example +drop table if exists t2; +create table t2(b int); +insert into t2 select i from generate_series(20,25000)i; +insert into t2 select 2 from generate_series(1,100)i; +analyze t2; +update pg_class + set reltuples = 10, relpages = pg_relation_size('t2') / 8192 + where relname = 't2'; +drop table if exists t1; +create table t1(a int); +insert into t1 select i from generate_series(1,111)i; +insert into t1 select 2 from generate_series(1,180)i; +analyze t1; +select * from explain_parallel_multi_batch(); + explain_parallel_multi_batch +---------------------------------------------------------------------------------------------------------- + Finalize Aggregate (actual rows=1 loops=1) + -> Gather (actual rows=2 loops=1) + Workers Planned: 1 + Workers Launched: 1 + -> Partial Aggregate (actual rows=1 loops=2) + -> Parallel Hash Left Join (actual rows=9105 loops=2) + Hash Cond: (t1.a = t2.b) + -> Parallel Seq Scan on t1 (actual rows=146 loops=2) + -> Parallel Hash (actual rows=12540 loops=2) + Buckets: 1024 (originally 1024) Batches: 16 (originally 1) Memory Usage: xxx + -> Parallel Seq Scan on t2 (actual rows=12540 loops=2) +(11 rows) + +select count(*) from t1 left outer join t2 on a = b; + count +------- + 18210 +(1 row) + +-- TODO: check what each of these is exercising -- chunk num, etc and write that +-- down +-- also, note that this example did reveal with ROJ that it wasn't working, so +-- maybe keep that but it is not parallel +-- make sure the plans make sense for the code we are writing +select count(*) from t1 left outer join t2 on a = b; + count +------- + 18210 +(1 row) + +select count(*) from t1, t2 where a = b; + count +------- + 18192 +(1 row) + +select count(*) from t1 right outer join t2 on a = b; + count +------- + 43081 +(1 row) + +select count(*) from t1 full outer join t2 on a = b; + count +------- + 43099 +(1 row) + +-- Parallel_Test_4 +-- spill and resize nbatches 2x +drop table if exists t2; +create table t2(b int); +insert into t2 select i from generate_series(4,1000)i; +insert into t2 select 2 from generate_series(1,4000)i; +analyze t2; +alter table t2 set (autovacuum_enabled = 'false'); +update pg_class +set reltuples = 10, relpages = pg_relation_size('t2') / 8192 +where relname = 't2'; +drop table if exists t1; +create table t1(a int); +insert into t1 select i from generate_series(1,11)i; +insert into t1 select 2 from generate_series(1,18)i; +insert into t1 values(500); +analyze t1; +select * from explain_parallel_multi_batch(); + explain_parallel_multi_batch +---------------------------------------------------------------------------------------------------------- + Finalize Aggregate (actual rows=1 loops=1) + -> Gather (actual rows=2 loops=1) + Workers Planned: 1 + Workers Launched: 1 + -> Partial Aggregate (actual rows=1 loops=2) + -> Parallel Hash Left Join (actual rows=38006 loops=2) 
+ Hash Cond: (t1.a = t2.b) + -> Parallel Seq Scan on t1 (actual rows=15 loops=2) + -> Parallel Hash (actual rows=2498 loops=2) + Buckets: 1024 (originally 1024) Batches: 16 (originally 1) Memory Usage: xxx + -> Parallel Seq Scan on t2 (actual rows=2498 loops=2) +(11 rows) + +select count(*) from t1 left outer join t2 on a = b; + count +------- + 76011 +(1 row) + +select count(*) from t1, t2 where a = b; + count +------- + 76009 +(1 row) + +select count(*) from t1 right outer join t2 on a = b; + count +------- + 76997 +(1 row) + +select count(*) from t1 full outer join t2 on a = b; + count +------- + 76999 +(1 row) + +select count(a) from t1 left outer join t2 on a = b; + count +------- + 76011 +(1 row) + +-- Parallel_Test_5 +-- revealed race condition because two workers are working on a chunked batch +-- only 2 unmatched tuples +drop table if exists t2; +create table t2(b int); +insert into t2 select i%1111 from generate_series(200,10000)i; +delete from t2 where b = 115; +delete from t2 where b = 200; +insert into t2 select 2 from generate_series(1,4000); +analyze t2; +alter table t2 set (autovacuum_enabled = 'false'); +update pg_class + set reltuples = 10, relpages = pg_relation_size('t2') / 8192 + where relname = 't2'; +drop table if exists t1; +create table t1(a int); +insert into t1 select i from generate_series(1,111)i; +insert into t1 values(115); +insert into t1 values(200); +insert into t1 select 2 from generate_series(1,180)i; +analyze t1; +select * from explain_parallel_multi_batch(); + explain_parallel_multi_batch +---------------------------------------------------------------------------------------------------------- + Finalize Aggregate (actual rows=1 loops=1) + -> Gather (actual rows=2 loops=1) + Workers Planned: 1 + Workers Launched: 1 + -> Partial Aggregate (actual rows=1 loops=2) + -> Parallel Hash Left Join (actual rows=363166 loops=2) + Hash Cond: (t1.a = t2.b) + -> Parallel Seq Scan on t1 (actual rows=146 loops=2) + -> Parallel Hash (actual rows=6892 loops=2) + Buckets: 1024 (originally 1024) Batches: 32 (originally 1) Memory Usage: xxx + -> Parallel Seq Scan on t2 (actual rows=6892 loops=2) +(11 rows) + +select count(*) from t1 left outer join t2 on a = b; + count +-------- + 726331 +(1 row) + +-- without count(*), can't reproduce desired plan so can't rely on results +select count(*) from t1 left outer join t2 on a = b; + count +-------- + 726331 +(1 row) + +drop table if exists t1; +drop table if exists t2; +drop function explain_parallel_multi_batch(); +reset enable_mergejoin; +reset work_mem; +reset search_path; +drop schema parallel_adaptive_hj; diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index d2b17dd3ea..518dd6d021 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -78,7 +78,7 @@ test: brin gin gist spgist privileges init_privs security_label collate matview # ---------- # Another group of parallel tests # ---------- -test: create_table_like alter_generic alter_operator misc async dbsize misc_functions sysviews tsrf tidscan collate.icu.utf8 +test: create_table_like alter_generic alter_operator misc async dbsize misc_functions sysviews tsrf tidscan collate.icu.utf8 adaptive_hj parallel_adaptive_hj # rules cannot run concurrently with any test that creates # a view or rule in the public schema diff --git a/src/test/regress/post_schedule b/src/test/regress/post_schedule new file mode 100644 index 0000000000..7824ecf7bf --- /dev/null +++ b/src/test/regress/post_schedule @@ -0,0 +1,8 
@@
+test: object_address
+test: tablesample
+test: groupingsets
+test: drop_operator
+test: password
+test: identity
+test: generated
+test: join_hash
diff --git a/src/test/regress/pre_schedule b/src/test/regress/pre_schedule
new file mode 100644
index 0000000000..4105b0fa03
--- /dev/null
+++ b/src/test/regress/pre_schedule
@@ -0,0 +1,120 @@
+# src/test/regress/pre_schedule
+# This should probably be in an order similar to parallel_schedule.
+test: tablespace
+test: boolean
+test: char
+test: name
+test: varchar
+test: text
+test: int2
+test: int4
+test: int8
+test: oid
+test: float4
+test: float8
+test: bit
+test: numeric
+test: txid
+test: uuid
+test: enum
+test: money
+test: rangetypes
+test: pg_lsn
+test: regproc
+test: strings
+test: numerology
+test: point
+test: lseg
+test: line
+test: box
+test: path
+test: polygon
+test: circle
+test: date
+test: time
+test: timetz
+test: timestamp
+test: timestamptz
+test: interval
+test: inet
+test: macaddr
+test: macaddr8
+test: tstypes
+test: geometry
+test: horology
+test: regex
+test: oidjoins
+test: type_sanity
+test: opr_sanity
+test: misc_sanity
+test: comments
+test: expressions
+test: create_function_1
+test: create_type
+test: create_table
+test: create_function_2
+test: copy
+test: copyselect
+test: copydml
+test: insert
+test: insert_conflict
+test: create_misc
+test: create_operator
+test: create_procedure
+test: create_index
+test: create_index_spgist
+test: create_view
+test: index_including
+test: index_including_gist
+test: create_aggregate
+test: create_function_3
+test: create_cast
+test: constraints
+test: triggers
+test: select
+test: inherit
+test: typed_table
+test: vacuum
+test: drop_if_exists
+test: updatable_views
+test: roleattributes
+test: create_am
+test: hash_func
+test: errors
+test: sanity_check
+test: select_into
+test: select_distinct
+test: select_distinct_on
+test: select_implicit
+test: select_having
+test: subselect
+test: union
+test: case
+test: join
+test: adaptive_hj
+test: parallel_adaptive_hj
+test: aggregates
+test: transactions
+ignore: random
+test: random
+test: portals
+test: arrays
+test: btree_index
+test: hash_index
+test: update
+test: delete
+test: namespace
+test: prepared_xacts
+test: brin
+test: gin
+test: gist
+test: spgist
+test: privileges
+test: init_privs
+test: security_label
+test: collate
+test: matview
+test: lock
+test: replica_identity
+test: rowsecurity
+
diff --git a/src/test/regress/serial_schedule b/src/test/regress/serial_schedule
index acba391332..15867f3196 100644
--- a/src/test/regress/serial_schedule
+++ b/src/test/regress/serial_schedule
@@ -91,6 +91,8 @@ test: subselect
 test: union
 test: case
 test: join
+test: adaptive_hj
+test: parallel_adaptive_hj
 test: aggregates
 test: transactions
 ignore: random
diff --git a/src/test/regress/sql/adaptive_hj.sql b/src/test/regress/sql/adaptive_hj.sql
new file mode 100644
index 0000000000..a5af798ea8
--- /dev/null
+++ b/src/test/regress/sql/adaptive_hj.sql
@@ -0,0 +1,244 @@
+-- TODO: remove some of these tests and make the test file faster
+create schema adaptive_hj;
+set search_path=adaptive_hj;
+drop table if exists t1;
+drop table if exists t2;
+create table t1(a int);
+create table t2(b int);
+
+-- serial setup
+set work_mem=64;
+set enable_mergejoin to off;
+-- TODO: make this function general
+create or replace function explain_multi_batch() returns setof text language plpgsql as
+$$
+declare ln text;
+begin
+  for ln in
+    explain (analyze, summary off, timing off, costs off)
+    select count(*) from t1 left outer join t2 on a = b
+  loop
+    ln := regexp_replace(ln, 'Memory Usage: \S*', 'Memory Usage: xxx');
+    return next ln;
+  end loop;
+end;
+$$;
+
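+-- Each test below resets t1/t2's statistics by zeroing their pg_class counts
+-- and deleting their pg_statistic rows; the planner then falls back to its
+-- no-stats defaults, which should keep the spilling plan shapes reproducible.
+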
+-- Serial_Test_1 reset
+-- TODO: refactor into procedure or change to drop table
+update pg_class set reltuples = 0, relpages = 0 where relname = 't2';
+update pg_class set reltuples = 0, relpages = 0 where relname = 't1';
+delete from pg_statistic where starelid = 't2'::regclass;
+delete from pg_statistic where starelid = 't1'::regclass;
+
+-- Serial_Test_1 setup
+truncate table t1;
+insert into t1 values(1),(2);
+insert into t1 select i from generate_series(1,10)i;
+insert into t1 select 2 from generate_series(1,5)i;
+truncate table t2;
+insert into t2 values(2),(3),(11);
+insert into t2 select i from generate_series(2,10)i;
+insert into t2 select 2 from generate_series(2,7)i;
+
+-- Serial_Test_1.1
+-- TODO: automate the checking for expected number of chunks (explain option?)
+-- spills in 4 batches
+-- batch 1 falls back with 2 chunks with no unmatched tuples
+-- batch 2 falls back with 2 chunks with 2 unmatched tuples emitted at EOB
+-- batch 3 falls back with 5 chunks with no unmatched tuples
+-- batch 4 does not fall back with no unmatched tuples
+select * from explain_multi_batch();
+select * from t1 left outer join t2 on a = b order by b, a;
+select * from t1, t2 where a = b order by b;
+select * from t1 right outer join t2 on a = b order by a, b;
+select * from t1 full outer join t2 on a = b order by b, a;
+
+-- Serial_Test_1.2 setup
+analyze t1; analyze t2;
+
+-- Serial_Test_1.2
+-- doesn't spill (happens to do a hash right join)
+select * from explain_multi_batch();
+
+-- Serial_Test_2 reset
+update pg_class set reltuples = 0, relpages = 0 where relname = 't2';
+update pg_class set reltuples = 0, relpages = 0 where relname = 't1';
+delete from pg_statistic where starelid = 't2'::regclass;
+delete from pg_statistic where starelid = 't1'::regclass;
+
+-- Serial_Test_2 setup:
+truncate table t1;
+insert into t1 values (1),(2),(2),(3);
+truncate table t2;
+insert into t2 values(2),(2),(3),(3),(4);
+
+-- Serial_Test_2.1
+-- spills in 4 batches
+-- batch 1 falls back with 2 chunks with no unmatched tuples
+-- batch 2 does not fall back with 1 unmatched tuple
+-- batch 3 does not fall back with no unmatched tuples
+-- batch 4 does not fall back with no unmatched tuples
+select * from explain_multi_batch();
+select * from t1 left outer join t2 on a = b order by b, a;
+select * from t1 right outer join t2 on a = b order by a, b;
+
+-- TODO: check coverage for emitting unmatched inner tuples
+-- Serial_Test_2.1.a
+-- results checking for inner join
+select * from t1 left outer join t2 on a = b order by b, a;
+select * from t1, t2 where a = b order by b;
+select * from t1 right outer join t2 on a = b order by a, b;
+select * from t1 full outer join t2 on a = b order by b, a;
+select * from t1, t2 where a = b order by b;
+
+-- Serial_Test_2.2
+analyze t1; analyze t2;
+-- doesn't spill (happens to do a hash right join)
+select * from explain_multi_batch();
+
+-- Serial_Test_3 reset
+update pg_class set reltuples = 0, relpages = 0 where relname = 't2';
+update pg_class set reltuples = 0, relpages = 0 where relname = 't1';
+delete from pg_statistic where starelid = 't2'::regclass;
+delete from pg_statistic where starelid = 't1'::regclass;
+
+
+-- Serial_Test_3 setup:
+truncate table t1;
+insert into t1 values(1),(1);
+insert into t1 select 2 from generate_series(1,7)i;
+insert into t1 select i from generate_series(3,10)i;
+truncate table t2;
+insert into t2 select 2 from generate_series(1,7)i;
+insert into t2 values(3),(3);
+insert into t2 select i from generate_series(5,9)i;
+
+-- Serial_Test_3.1
+-- spills in 4 batches
+-- batch 1 falls back with 2 chunks with 1 unmatched tuple
+-- batch 2 does not fall back with 2 unmatched tuples
+-- batch 3 falls back with 4 chunks with 1 unmatched tuple
+-- batch 4 does not fall back with no unmatched tuples
+select * from explain_multi_batch();
+select * from t1 left outer join t2 on a = b order by b, a;
+select * from t1, t2 where a = b order by b;
+select * from t1 right outer join t2 on a = b order by a, b;
+select * from t1 full outer join t2 on a = b order by b, a;
+select * from t1, t2 where a = b order by b;
+
+-- Serial_Test_3.2
+-- swap join order
+select * from t2 left outer join t1 on a = b order by a, b;
+select * from t2, t1 where a = b order by a;
+select * from t2 right outer join t1 on a = b order by b, a;
+select * from t2 full outer join t1 on a = b order by a, b;
+
+-- Serial_Test_3.3 setup
+analyze t1; analyze t2;
+
+-- Serial_Test_3.3
+-- doesn't spill
+select * from explain_multi_batch();
+
+-- Serial_Test_4 setup
+drop table t1;
+create table t1(b int);
+insert into t1 select i from generate_series(1,111)i;
+insert into t1 select 2 from generate_series(1,180)i;
+analyze t1;
+
+drop table t2;
+create table t2(a int);
+insert into t2 select i from generate_series(20,25000)i;
+insert into t2 select 2 from generate_series(1,100)i;
+analyze t2;
+update pg_class
+  set reltuples = 10, relpages = pg_relation_size('t2') / 8192
+  where relname = 't2';
+
+-- Serial_Test_4.1
+-- spills in 32 batches
+--batch 0 does not fall back with 1 unmatched outer tuple (15)
+--batch 1 falls back with 396 chunks.
+--batch 2 falls back with 402 chunks with 1 unmatched outer tuple (1)
+--batch 3 falls back with 389 chunks with 1 unmatched outer tuple (8)
+--batch 4 falls back with 409 chunks with no unmatched outer tuples
+--batch 5 falls back with 366 chunks with 1 unmatched outer tuple (4)
+--batch 6 falls back with 407 chunks with 1 unmatched outer tuple (11)
+--batch 7 falls back with 382 chunks with 1 unmatched outer tuple (10)
+--batch 8 falls back with 413 chunks with no unmatched outer tuples
+--batch 9 falls back with 371 chunks with 1 unmatched outer tuple (3)
+--batch 10 falls back with 389 chunks with no unmatched outer tuples
+--batch 11 falls back with 408 chunks with no unmatched outer tuples
+--batch 12 falls back with 387 chunks with no unmatched outer tuples
+--batch 13 falls back with 402 chunks with 1 unmatched outer tuple (18)
+--batch 14 falls back with 369 chunks with 1 unmatched outer tuple (9)
+--batch 15 falls back with 387 chunks with no unmatched outer tuples
+--batch 16 falls back with 365 chunks with no unmatched outer tuples
+--batch 17 falls back with 403 chunks with 2 unmatched outer tuples (14,19)
+--batch 18 falls back with 375 chunks with no unmatched outer tuples
+--batch 19 falls back with 384 chunks with no unmatched outer tuples
+--batch 20 falls back with 377 chunks with 1 unmatched outer tuple (12)
+--batch 22 falls back with 401 chunks with no unmatched outer tuples
+--batch 23 falls back with 396 chunks with no unmatched outer tuples
+--batch 24 falls back with 387 chunks with 1 unmatched outer tuple (5)
+--batch 25 falls back with 399 chunks with 1 unmatched outer tuple (7)
+--batch 26 falls back with 387 chunks.
+--batch 27 falls back with 442 chunks.
+--batch 28 falls back with 385 chunks with 1 unmatched outer tuple (17)
+--batch 29 falls back with 375 chunks.
+--batch 30 falls back with 404 chunks with 1 unmatched outer tuple (6)
+--batch 31 falls back with 396 chunks with 2 unmatched outer tuples (13,16)
+select * from explain_multi_batch();
+select count(*) from t1 left outer join t2 on a = b;
+select count(a) from t1 left outer join t2 on a = b;
+select count(*) from t1, t2 where a = b;
+-- used to give wrong results because one whole batch of the outer side is
+-- empty, so the inner side didn't emit its unmatched tuples with ROJ
+select count(*) from t1 right outer join t2 on a = b;
+select count(*) from t1 full outer join t2 on a = b;
+
+-- Test_6: test case with a non-negligible amount of data
+-- TODO: this doesn't finish in a reasonable time when run serially, but it
+-- does finish when run in parallel. The serial version is either simply too
+-- slow or has a bug; with less data it did finish, so it may just be
+-- really slow.
+-- An inner join shouldn't even need to create the unmatched-tuple files.
+-- It finishes eventually if the amount of data is decreased.
+
+--drop table simple;
+--create table simple as
+  -- select generate_series(1, 20000) AS id, 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa';
+--alter table simple set (parallel_workers = 2);
+--analyze simple;
+--
+--drop table extremely_skewed;
+--create table extremely_skewed (id int, t text);
+--alter table extremely_skewed set (autovacuum_enabled = 'false');
+--alter table extremely_skewed set (parallel_workers = 2);
+--analyze extremely_skewed;
+--insert into extremely_skewed
+--  select 42 as id, 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'
+--  from generate_series(1, 20000);
+--update pg_class
+--  set reltuples = 2, relpages = pg_relation_size('extremely_skewed') / 8192
+--  where relname = 'extremely_skewed';
+
+--set work_mem=64;
+--set enable_mergejoin to off;
+--explain (analyze, costs off, timing off)
+  --select * from simple r join extremely_skewed s using (id);
+--select * from explain_multi_batch();
+
+drop table t1;
+drop table t2;
+drop function explain_multi_batch();
+reset enable_mergejoin;
+reset work_mem;
+reset search_path;
+drop schema adaptive_hj;
diff --git a/src/test/regress/sql/parallel_adaptive_hj.sql b/src/test/regress/sql/parallel_adaptive_hj.sql
new file mode 100644
index 0000000000..3071c5f82e
--- /dev/null
+++ b/src/test/regress/sql/parallel_adaptive_hj.sql
@@ -0,0 +1,187 @@
+create schema parallel_adaptive_hj;
+set search_path=parallel_adaptive_hj;
+
+-- TODO: anti-semi-join and semi-join tests
+
+-- TODO: check whether tests 2 and 3 differ at all
+
+-- TODO: add a test for a parallel-oblivious hash join in a parallel plan
+
+-- TODO: make this function general
+create or replace function explain_parallel_multi_batch() returns setof text language plpgsql as
+$$
+declare ln text;
+begin
+  for ln in
+    explain (analyze, summary off, timing off, costs off)
+    select count(*) from t1 left outer join t2 on a = b
+  loop
+    ln := regexp_replace(ln, 'Memory Usage: \S*', 'Memory Usage: xxx');
+    return next ln;
+  end loop;
+end;
+$$;
+
+-- parallel setup
+set enable_nestloop to off;
+set enable_mergejoin to off;
+set min_parallel_table_scan_size = 0;
+set parallel_setup_cost = 0;
+set enable_parallel_hash = on;
+set enable_hashjoin = on;
+set max_parallel_workers_per_gather = 1;
+set work_mem = 64;
+
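+-- Several of the setups below overwrite the inner table's pg_class entry
+-- (reltuples = 10) after loading it so that the planner believes the inner
+-- side is tiny; the hash join then starts with too few batches and must
+-- grow nbatches (and, in later tests, fall back) at execution time.
+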
+-- Parallel_Test_1 setup
+drop table if exists t1;
+create table t1(a int);
+insert into t1 select i from generate_series(1,11)i;
+insert into t1 select 2 from generate_series(1,18)i;
+analyze t1;
+
+drop table if exists t2;
+create table t2(b int);
+insert into t2 select i from generate_series(4,2500)i;
+insert into t2 select 2 from generate_series(1,10)i;
+analyze t2;
+alter table t2 set (autovacuum_enabled = 'false');
+update pg_class
+  set reltuples = 10, relpages = pg_relation_size('t2') / 8192
+  where relname = 't2';
+
+-- Parallel_Test_1.1
+-- spills in 4 batches
+-- 1 resize of nbatches
+-- no batch falls back
+select * from explain_parallel_multi_batch();
+-- need an aggregate to exercise the code but still want to know if we are
+-- emitting the right unmatched outer tuples
+select count(a) from t1 left outer join t2 on a = b;
+select count(*) from t1 left outer join t2 on a = b;
+
+-- Parallel_Test_1.1.a
+-- results checking for inner join
+-- doesn't fall back
+select count(*) from t1, t2 where a = b;
+-- Parallel_Test_1.1.b
+-- results checking for right outer join
+-- doesn't exercise the fallback code but just checking results
+select count(*) from t1 right outer join t2 on a = b;
+-- Parallel_Test_1.1.c
+-- results checking for full outer join
+select count(*) from t1 full outer join t2 on a = b;
+
+-- Parallel_Test_1.2
+-- spills and doesn't have to resize nbatches
+analyze t2;
+select * from explain_parallel_multi_batch();
+select count(a) from t1 left outer join t2 on a = b;
+
+-- Parallel_Test_1.3
+-- doesn't spill
+-- does resize nbuckets
+set work_mem = '4MB';
+select * from explain_parallel_multi_batch();
+select count(a) from t1 left outer join t2 on a = b;
+set work_mem = 64;
+
+
+-- Parallel_Test_3
+-- big example
+drop table if exists t2;
+create table t2(b int);
+insert into t2 select i from generate_series(20,25000)i;
+insert into t2 select 2 from generate_series(1,100)i;
+analyze t2;
+update pg_class
+  set reltuples = 10, relpages = pg_relation_size('t2') / 8192
+  where relname = 't2';
+
+drop table if exists t1;
+create table t1(a int);
+insert into t1 select i from generate_series(1,111)i;
+insert into t1 select 2 from generate_series(1,180)i;
+analyze t1;
+
+select * from explain_parallel_multi_batch();
+select count(*) from t1 left outer join t2 on a = b;
+
+-- TODO: work out what each of these queries exercises (chunk number, etc.)
+-- and write that down here
+-- also note that this example revealed a bug in right outer join handling,
+-- so it may be worth keeping, though the right outer join is not parallel
+-- make sure the plans make sense for the code we are writing
+select count(*) from t1 left outer join t2 on a = b;
+select count(*) from t1, t2 where a = b;
+select count(*) from t1 right outer join t2 on a = b;
+select count(*) from t1 full outer join t2 on a = b;
+
+-- Parallel_Test_4
+-- spills and resizes nbatches 2x
+
+drop table if exists t2;
+create table t2(b int);
+insert into t2 select i from generate_series(4,1000)i;
+insert into t2 select 2 from generate_series(1,4000)i;
+analyze t2;
+alter table t2 set (autovacuum_enabled = 'false');
+update pg_class
+set reltuples = 10, relpages = pg_relation_size('t2') / 8192
+where relname = 't2';
+
+drop table if exists t1;
+create table t1(a int);
+insert into t1 select i from generate_series(1,11)i;
+insert into t1 select 2 from generate_series(1,18)i;
+insert into t1 values(500);
+analyze t1;
+
+select * from explain_parallel_multi_batch();
+select count(*) from t1 left outer join t2 on a = b;
+select count(*) from t1, t2 where a = b;
+select count(*) from t1 right outer join t2 on a = b;
+select count(*) from t1 full outer join t2 on a = b;
+select count(a) from t1 left outer join t2 on a = b;
+
+-- Parallel_Test_5
+-- revealed race condition because two workers are working on a chunked batch
+-- only 2 unmatched tuples
+
+drop table if
exists t2; +create table t2(b int); +insert into t2 select i%1111 from generate_series(200,10000)i; +delete from t2 where b = 115; +delete from t2 where b = 200; +insert into t2 select 2 from generate_series(1,4000); +analyze t2; +alter table t2 set (autovacuum_enabled = 'false'); +update pg_class + set reltuples = 10, relpages = pg_relation_size('t2') / 8192 + where relname = 't2'; + +drop table if exists t1; +create table t1(a int); +insert into t1 select i from generate_series(1,111)i; +insert into t1 values(115); +insert into t1 values(200); +insert into t1 select 2 from generate_series(1,180)i; +analyze t1; + +select * from explain_parallel_multi_batch(); +select count(*) from t1 left outer join t2 on a = b; + +-- without count(*), can't reproduce desired plan so can't rely on results +select count(*) from t1 left outer join t2 on a = b; + +drop table if exists t1; +drop table if exists t2; +drop function explain_parallel_multi_batch(); +reset enable_mergejoin; +reset work_mem; +reset search_path; +drop schema parallel_adaptive_hj; -- 2.20.1 (Apple Git-117)