From 5860ccb743a3981ade55e1d394d43bde5cc9d342 Mon Sep 17 00:00:00 2001 From: Thomas Munro Date: Tue, 22 Aug 2023 16:48:30 +1200 Subject: [PATCH v1 11/14] WIP: Use streaming reads in bitmap heapscan. XXX Cherry-picked from https://github.com/melanieplageman/postgres/tree/bhs_aio and lightly modified by TM, for demonstration purposes. Author: Melanie Plageman --- src/backend/access/gin/ginget.c | 17 +- src/backend/access/gin/ginscan.c | 7 + src/backend/access/heap/heapam.c | 76 ++++ src/backend/access/heap/heapam_handler.c | 78 ++-- src/backend/commands/explain.c | 21 +- src/backend/executor/nodeBitmapHeapscan.c | 521 ++-------------------- src/backend/nodes/tidbitmap.c | 76 ++-- src/include/access/heapam.h | 7 + src/include/access/relscan.h | 8 + src/include/access/tableam.h | 71 ++- src/include/executor/nodeBitmapHeapscan.h | 1 + src/include/nodes/execnodes.h | 39 +- src/include/nodes/tidbitmap.h | 4 +- 13 files changed, 289 insertions(+), 637 deletions(-) diff --git a/src/backend/access/gin/ginget.c b/src/backend/access/gin/ginget.c index 1f0214498c..c6a45a4e03 100644 --- a/src/backend/access/gin/ginget.c +++ b/src/backend/access/gin/ginget.c @@ -375,7 +375,10 @@ restartScanEntry: if (entry->matchBitmap) { if (entry->matchIterator) + { tbm_end_iterate(entry->matchIterator); + pfree(entry->matchResult); + } entry->matchIterator = NULL; tbm_free(entry->matchBitmap); entry->matchBitmap = NULL; @@ -388,6 +391,9 @@ restartScanEntry: if (entry->matchBitmap && !tbm_is_empty(entry->matchBitmap)) { entry->matchIterator = tbm_begin_iterate(entry->matchBitmap); + entry->matchResult = (TBMIterateResult *) + palloc0(sizeof(TBMIterateResult) + MaxHeapTuplesPerPage * + sizeof(OffsetNumber)); entry->isFinished = false; } } @@ -825,21 +831,24 @@ entryGetItem(GinState *ginstate, GinScanEntry entry, { /* * If we've exhausted all items on this block, move to next block - * in the bitmap. + * in the bitmap. tbm_iterate() sets matchResult->blockno to + * InvalidBlockNumber when the bitmap is exhausted. 
*/ - while (entry->matchResult == NULL || + while ((!BlockNumberIsValid(entry->matchResult->blockno)) || (entry->matchResult->ntuples >= 0 && entry->offset >= entry->matchResult->ntuples) || entry->matchResult->blockno < advancePastBlk || (ItemPointerIsLossyPage(&advancePast) && entry->matchResult->blockno == advancePastBlk)) { - entry->matchResult = tbm_iterate(entry->matchIterator); - if (entry->matchResult == NULL) + tbm_iterate(entry->matchIterator, entry->matchResult); + if (!BlockNumberIsValid(entry->matchResult->blockno)) { ItemPointerSetInvalid(&entry->curItem); tbm_end_iterate(entry->matchIterator); + pfree(entry->matchResult); + entry->matchResult = NULL; entry->matchIterator = NULL; entry->isFinished = true; break; diff --git a/src/backend/access/gin/ginscan.c b/src/backend/access/gin/ginscan.c index ae7b0e9bb8..e82baf590e 100644 --- a/src/backend/access/gin/ginscan.c +++ b/src/backend/access/gin/ginscan.c @@ -246,7 +246,14 @@ ginFreeScanKeys(GinScanOpaque so) if (entry->list) pfree(entry->list); if (entry->matchIterator) + { tbm_end_iterate(entry->matchIterator); + if (entry->matchResult) + { + pfree(entry->matchResult); + entry->matchResult = NULL; + } + } if (entry->matchBitmap) tbm_free(entry->matchBitmap); } diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 01cca53586..ad6b7df483 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -226,6 +226,60 @@ static const int MultiXactStatusLock[MaxMultiXactStatus + 1] = * ---------------------------------------------------------------- */ +static bool +bitmapheap_pgsr_next_single(PgStreamingRead *pgsr, + uintptr_t pgsr_private, void *io_private, + BufferManagerRelation *bmr, ForkNumber *fork, BlockNumber *block, + ReadBufferMode *mode) +{ + TBMIterateResult *tbmres; + HeapScanDesc hdesc = (HeapScanDesc) pgsr_private; + + *bmr = BMR_REL(hdesc->rs_base.rs_rd); + *fork = MAIN_FORKNUM; + *mode = RBM_NORMAL; + + tbmres = (TBMIterateResult *) io_private; + + for (;;) + { + if (hdesc->rs_base.shared_tbmiterator) + tbm_shared_iterate(hdesc->rs_base.shared_tbmiterator, tbmres); + else + tbm_iterate(hdesc->rs_base.tbmiterator, tbmres); + + if (!BlockNumberIsValid(tbmres->blockno)) + return false; + + /* + * Ignore any claimed entries past what we think is the end of the + * relation. It may have been extended after the start of our scan (we + * only hold an AccessShareLock, and it could be inserts from this + * backend). We don't take this optimization in SERIALIZABLE + * isolation though, as we need to examine all invisible tuples + * reachable by the index. + */ + if (!IsolationIsSerializable() && tbmres->blockno >= hdesc->rs_nblocks) + continue; + + if (tbmres->ntuples >= 0) + hdesc->rs_base.exact_pages++; + else + hdesc->rs_base.lossy_pages++; + + if (hdesc->rs_base.rs_flags & SO_CAN_SKIP_FETCH && + !tbmres->recheck && + VM_ALL_VISIBLE(hdesc->rs_base.rs_rd, tbmres->blockno, &hdesc->vmbuffer)) + { + hdesc->empty_tuples += tbmres->ntuples; + continue; + } + + *block = tbmres->blockno; + return true; + } +} + static bool heap_pgsr_next_single(PgStreamingRead *pgsr, uintptr_t pgsr_private, void *per_io_data, @@ -443,6 +497,11 @@ initscan(HeapScanDesc scan, ScanKey key, bool keep_startblock) pg_streaming_read_free(scan->pgsr); scan->pgsr = NULL; } + if (BufferIsValid(scan->vmbuffer)) + { + ReleaseBuffer(scan->vmbuffer); + scan->vmbuffer = InvalidBuffer; + } /* * FIXME: This probably should be done in the !rs_inited blocks instead. 
@@ -456,6 +515,17 @@ initscan(HeapScanDesc scan, ScanKey key, bool keep_startblock) else scan->pgsr = heap_pgsr_single_alloc(scan); } + else if (scan->rs_base.rs_flags & SO_TYPE_BITMAPSCAN) + { + int iodepth = Max(Min(128, NBuffers / 128), 1); + + scan->pgsr = pg_streaming_read_buffer_alloc(iodepth, + offsetof(TBMIterateResult, offsets) + MaxHeapTuplesPerPage * sizeof(OffsetNumber), + (uintptr_t) scan, scan->rs_strategy, bitmapheap_pgsr_next_single); + + + scan->rs_inited = true; + } } /* @@ -1140,6 +1210,12 @@ heap_beginscan(Relation relation, Snapshot snapshot, scan->rs_strategy = NULL; /* set in initscan */ scan->pgsr = NULL; + scan->vmbuffer = InvalidBuffer; + scan->empty_tuples = 0; + scan->rs_base.exact_pages = 0; + scan->rs_base.lossy_pages = 0; + scan->rs_base.tbmiterator = NULL; + scan->rs_base.shared_tbmiterator = NULL; /* * Disable page-at-a-time mode if it's not a MVCC-safe snapshot. diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 26378cbb4b..c665910b27 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -44,6 +44,7 @@ #include "storage/smgr.h" #include "utils/builtins.h" #include "utils/rel.h" +#include "storage/streaming_read.h" static void reform_and_rewrite_tuple(HeapTuple tuple, Relation OldHeap, Relation NewHeap, @@ -2114,38 +2115,55 @@ heapam_estimate_rel_size(Relation rel, int32 *attr_widths, * Executor related callbacks for the heap AM * ------------------------------------------------------------------------ */ - static bool -heapam_scan_bitmap_next_block(TableScanDesc scan, - TBMIterateResult *tbmres) +heapam_scan_bitmap_next_block(TableScanDesc scan, bool *recheck) { HeapScanDesc hscan = (HeapScanDesc) scan; - BlockNumber block = tbmres->blockno; + TBMIterateResult *tbmres; + void *io_private; Buffer buffer; Snapshot snapshot; int ntup; - hscan->rs_cindex = 0; - hscan->rs_ntuples = 0; + Assert(hscan->pgsr); /* - * Ignore any claimed entries past what we think is the end of the - * relation. It may have been extended after the start of our scan (we - * only hold an AccessShareLock, and it could be inserts from this - * backend). We don't take this optimization in SERIALIZABLE isolation - * though, as we need to examine all invisible tuples reachable by the - * index. + * Release the buffer containing the previous block and reset the per-page + * counters. Reset hscan->rs_ntuples here just to be safe. */ - if (!IsolationIsSerializable() && block >= hscan->rs_nblocks) - return false; + if (BufferIsValid(hscan->rs_cbuf)) + { + ReleaseBuffer(hscan->rs_cbuf); + hscan->rs_cbuf = InvalidBuffer; + } + + hscan->rs_cindex = 0; + hscan->rs_ntuples = 0; + + hscan->rs_cbuf = pg_streaming_read_buffer_get_next(hscan->pgsr, &io_private); + + if (BufferIsInvalid(hscan->rs_cbuf)) + { + if (BufferIsValid(hscan->vmbuffer)) + { + ReleaseBuffer(hscan->vmbuffer); + hscan->vmbuffer = InvalidBuffer; + } + + return hscan->empty_tuples > 0; + } + + Assert(io_private); + + tbmres = (TBMIterateResult *) io_private; + + Assert(BufferGetBlockNumber(hscan->rs_cbuf) == tbmres->blockno); + + *recheck = tbmres->recheck; + + hscan->rs_cblock = tbmres->blockno; + hscan->rs_ntuples = tbmres->ntuples; - /* - * Acquire pin on the target heap page, trading in any pin we held before. 
- */ - hscan->rs_cbuf = ReleaseAndReadBuffer(hscan->rs_cbuf, - scan->rs_rd, - block); - hscan->rs_cblock = block; buffer = hscan->rs_cbuf; snapshot = scan->rs_snapshot; @@ -2181,7 +2199,7 @@ heapam_scan_bitmap_next_block(TableScanDesc scan, ItemPointerData tid; HeapTupleData heapTuple; - ItemPointerSet(&tid, block, offnum); + ItemPointerSet(&tid, tbmres->blockno, offnum); if (heap_hot_search_buffer(&tid, scan->rs_rd, buffer, snapshot, &heapTuple, NULL, true)) hscan->rs_vistuples[ntup++] = ItemPointerGetOffsetNumber(&tid); @@ -2209,7 +2227,7 @@ heapam_scan_bitmap_next_block(TableScanDesc scan, loctup.t_data = (HeapTupleHeader) PageGetItem(page, lp); loctup.t_len = ItemIdGetLength(lp); loctup.t_tableOid = scan->rs_rd->rd_id; - ItemPointerSet(&loctup.t_self, block, offnum); + ItemPointerSet(&loctup.t_self, tbmres->blockno, offnum); valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer); if (valid) { @@ -2227,25 +2245,31 @@ heapam_scan_bitmap_next_block(TableScanDesc scan, Assert(ntup <= MaxHeapTuplesPerPage); hscan->rs_ntuples = ntup; - return ntup > 0; + return true; } static bool -heapam_scan_bitmap_next_tuple(TableScanDesc scan, - TBMIterateResult *tbmres, - TupleTableSlot *slot) +heapam_scan_bitmap_next_tuple(TableScanDesc scan, TupleTableSlot *slot) { HeapScanDesc hscan = (HeapScanDesc) scan; OffsetNumber targoffset; Page page; ItemId lp; + if (hscan->empty_tuples) + { + ExecStoreAllNullTuple(slot); + hscan->empty_tuples--; + return true; + } + /* * Out of range? If so, nothing more to look at on this page */ if (hscan->rs_cindex < 0 || hscan->rs_cindex >= hscan->rs_ntuples) return false; + Assert(BufferIsValid(hscan->rs_cbuf)); targoffset = hscan->rs_vistuples[hscan->rs_cindex]; page = BufferGetPage(hscan->rs_cbuf); lp = PageGetItemId(page, targoffset); diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index 8570b14f62..e654ada418 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -13,6 +13,7 @@ */ #include "postgres.h" +#include "access/relscan.h" #include "access/xact.h" #include "catalog/pg_type.h" #include "commands/createas.h" @@ -112,7 +113,7 @@ static void show_hash_info(HashState *hashstate, ExplainState *es); static void show_memoize_info(MemoizeState *mstate, List *ancestors, ExplainState *es); static void show_hashagg_info(AggState *aggstate, ExplainState *es); -static void show_tidbitmap_info(BitmapHeapScanState *planstate, +static void show_tidbitmap_info(TableScanDesc tdesc, ExplainState *es); static void show_instrumentation_count(const char *qlabel, int which, PlanState *planstate, ExplainState *es); @@ -1819,7 +1820,7 @@ ExplainNode(PlanState *planstate, List *ancestors, show_instrumentation_count("Rows Removed by Filter", 1, planstate, es); if (es->analyze) - show_tidbitmap_info((BitmapHeapScanState *) planstate, es); + show_tidbitmap_info(((BitmapHeapScanState *) planstate)->ss.ss_currentScanDesc, es); break; case T_SampleScan: show_tablesample(((SampleScan *) plan)->tablesample, @@ -3408,25 +3409,25 @@ show_hashagg_info(AggState *aggstate, ExplainState *es) * If it's EXPLAIN ANALYZE, show exact/lossy pages for a BitmapHeapScan node */ static void -show_tidbitmap_info(BitmapHeapScanState *planstate, ExplainState *es) +show_tidbitmap_info(TableScanDesc tdesc, ExplainState *es) { if (es->format != EXPLAIN_FORMAT_TEXT) { ExplainPropertyInteger("Exact Heap Blocks", NULL, - planstate->exact_pages, es); + tdesc->exact_pages, es); ExplainPropertyInteger("Lossy Heap Blocks", NULL, - planstate->lossy_pages, 
es); + tdesc->lossy_pages, es); } else { - if (planstate->exact_pages > 0 || planstate->lossy_pages > 0) + if (tdesc->exact_pages > 0 || tdesc->lossy_pages > 0) { ExplainIndentText(es); appendStringInfoString(es->str, "Heap Blocks:"); - if (planstate->exact_pages > 0) - appendStringInfo(es->str, " exact=%ld", planstate->exact_pages); - if (planstate->lossy_pages > 0) - appendStringInfo(es->str, " lossy=%ld", planstate->lossy_pages); + if (tdesc->exact_pages > 0) + appendStringInfo(es->str, " exact=%ld", tdesc->exact_pages); + if (tdesc->lossy_pages > 0) + appendStringInfo(es->str, " lossy=%ld", tdesc->lossy_pages); appendStringInfoChar(es->str, '\n'); } } diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c index f35df0b8bf..32518e8d31 100644 --- a/src/backend/executor/nodeBitmapHeapscan.c +++ b/src/backend/executor/nodeBitmapHeapscan.c @@ -54,11 +54,6 @@ static TupleTableSlot *BitmapHeapNext(BitmapHeapScanState *node); static inline void BitmapDoneInitializingSharedState(ParallelBitmapHeapState *pstate); -static inline void BitmapAdjustPrefetchIterator(BitmapHeapScanState *node, - TBMIterateResult *tbmres); -static inline void BitmapAdjustPrefetchTarget(BitmapHeapScanState *node); -static inline void BitmapPrefetch(BitmapHeapScanState *node, - TableScanDesc scan); static bool BitmapShouldInitializeSharedState(ParallelBitmapHeapState *pstate); @@ -74,10 +69,7 @@ BitmapHeapNext(BitmapHeapScanState *node) ExprContext *econtext; TableScanDesc scan; TIDBitmap *tbm; - TBMIterator *tbmiterator = NULL; - TBMSharedIterator *shared_tbmiterator = NULL; - TBMIterateResult *tbmres; - TupleTableSlot *slot; + TupleTableSlot *slot = node->ss.ss_ScanTupleSlot; ParallelBitmapHeapState *pstate = node->pstate; dsa_area *dsa = node->ss.ps.state->es_query_dsa; @@ -88,23 +80,10 @@ BitmapHeapNext(BitmapHeapScanState *node) slot = node->ss.ss_ScanTupleSlot; scan = node->ss.ss_currentScanDesc; tbm = node->tbm; - if (pstate == NULL) - tbmiterator = node->tbmiterator; - else - shared_tbmiterator = node->shared_tbmiterator; - tbmres = node->tbmres; /* * If we haven't yet performed the underlying index scan, do it, and begin * the iteration over the bitmap. - * - * For prefetching, we use *two* iterators, one for the pages we are - * actually scanning and another that runs ahead of the first for - * prefetching. node->prefetch_pages tracks exactly how many pages ahead - * the prefetch iterator is. Also, node->prefetch_target tracks the - * desired prefetch distance, which starts small and increases up to the - * node->prefetch_maximum. This is to avoid doing a lot of prefetching in - * a scan that stops after a few tuples because of a LIMIT. */ if (!node->initialized) { @@ -116,17 +95,7 @@ BitmapHeapNext(BitmapHeapScanState *node) elog(ERROR, "unrecognized result from subplan"); node->tbm = tbm; - node->tbmiterator = tbmiterator = tbm_begin_iterate(tbm); - node->tbmres = tbmres = NULL; - -#ifdef USE_PREFETCH - if (node->prefetch_maximum > 0) - { - node->prefetch_iterator = tbm_begin_iterate(tbm); - node->prefetch_pages = 0; - node->prefetch_target = -1; - } -#endif /* USE_PREFETCH */ + scan->tbmiterator = tbm_begin_iterate(tbm); } else { @@ -148,194 +117,59 @@ BitmapHeapNext(BitmapHeapScanState *node) * dsa_pointer of the iterator state which will be used by * multiple processes to iterate jointly. 
*/ - pstate->tbmiterator = tbm_prepare_shared_iterate(tbm); -#ifdef USE_PREFETCH - if (node->prefetch_maximum > 0) - { - pstate->prefetch_iterator = - tbm_prepare_shared_iterate(tbm); - - /* - * We don't need the mutex here as we haven't yet woke up - * others. - */ - pstate->prefetch_pages = 0; - pstate->prefetch_target = -1; - } -#endif + pstate->tbmiterator = + tbm_prepare_shared_iterate(tbm); /* We have initialized the shared state so wake up others. */ BitmapDoneInitializingSharedState(pstate); } /* Allocate a private iterator and attach the shared state to it */ - node->shared_tbmiterator = shared_tbmiterator = + scan->shared_tbmiterator = tbm_attach_shared_iterate(dsa, pstate->tbmiterator); - node->tbmres = tbmres = NULL; - -#ifdef USE_PREFETCH - if (node->prefetch_maximum > 0) - { - node->shared_prefetch_iterator = - tbm_attach_shared_iterate(dsa, pstate->prefetch_iterator); - } -#endif /* USE_PREFETCH */ } node->initialized = true; + + /* Get the first block. if none, end of scan */ + if (!table_scan_bitmap_next_block(scan, &node->recheck)) + goto exit; } - for (;;) + do { - bool skip_fetch; - - CHECK_FOR_INTERRUPTS(); - - /* - * Get next page of results if needed - */ - if (tbmres == NULL) - { - if (!pstate) - node->tbmres = tbmres = tbm_iterate(tbmiterator); - else - node->tbmres = tbmres = tbm_shared_iterate(shared_tbmiterator); - if (tbmres == NULL) - { - /* no more entries in the bitmap */ - break; - } - - BitmapAdjustPrefetchIterator(node, tbmres); - - /* - * We can skip fetching the heap page if we don't need any fields - * from the heap, and the bitmap entries don't need rechecking, - * and all tuples on the page are visible to our transaction. - * - * XXX: It's a layering violation that we do these checks above - * tableam, they should probably moved below it at some point. - */ - skip_fetch = (node->can_skip_fetch && - !tbmres->recheck && - VM_ALL_VISIBLE(node->ss.ss_currentRelation, - tbmres->blockno, - &node->vmbuffer)); - - if (skip_fetch) - { - /* can't be lossy in the skip_fetch case */ - Assert(tbmres->ntuples >= 0); - - /* - * The number of tuples on this page is put into - * node->return_empty_tuples. - */ - node->return_empty_tuples = tbmres->ntuples; - } - else if (!table_scan_bitmap_next_block(scan, tbmres)) - { - /* AM doesn't think this block is valid, skip */ - continue; - } - - if (tbmres->ntuples >= 0) - node->exact_pages++; - else - node->lossy_pages++; - - /* Adjust the prefetch target */ - BitmapAdjustPrefetchTarget(node); - } - else + while (table_scan_bitmap_next_tuple(scan, slot)) { - /* - * Continuing in previously obtained page. - */ - -#ifdef USE_PREFETCH - - /* - * Try to prefetch at least a few pages even before we get to the - * second page if we don't stop reading after the first tuple. - */ - if (!pstate) - { - if (node->prefetch_target < node->prefetch_maximum) - node->prefetch_target++; - } - else if (pstate->prefetch_target < node->prefetch_maximum) - { - /* take spinlock while updating shared state */ - SpinLockAcquire(&pstate->mutex); - if (pstate->prefetch_target < node->prefetch_maximum) - pstate->prefetch_target++; - SpinLockRelease(&pstate->mutex); - } -#endif /* USE_PREFETCH */ - } + CHECK_FOR_INTERRUPTS(); - /* - * We issue prefetch requests *after* fetching the current page to try - * to avoid having prefetching interfere with the main I/O. 
Also, this - * should happen only when we have determined there is still something - * to do on the current page, else we may uselessly prefetch the same - * page we are just about to request for real. - * - * XXX: It's a layering violation that we do these checks above - * tableam, they should probably moved below it at some point. - */ - BitmapPrefetch(node, scan); - - if (node->return_empty_tuples > 0) - { - /* - * If we don't have to fetch the tuple, just return nulls. - */ - ExecStoreAllNullTuple(slot); + if (!node->recheck) + return slot; - if (--node->return_empty_tuples == 0) - { - /* no more tuples to return in the next round */ - node->tbmres = tbmres = NULL; - } - } - else - { - /* - * Attempt to fetch tuple from AM. - */ - if (!table_scan_bitmap_next_tuple(scan, tbmres, slot)) - { - /* nothing more to look at on this page */ - node->tbmres = tbmres = NULL; - continue; - } + econtext->ecxt_scantuple = slot; + if (ExecQualAndReset(node->bitmapqualorig, econtext)) + return slot; - /* - * If we are using lossy info, we have to recheck the qual - * conditions at every tuple. - */ - if (tbmres->recheck) - { - econtext->ecxt_scantuple = slot; - if (!ExecQualAndReset(node->bitmapqualorig, econtext)) - { - /* Fails recheck, so drop it and loop back for another */ - InstrCountFiltered2(node, 1); - ExecClearTuple(slot); - continue; - } - } + /* Fails recheck, so drop it and loop back for another */ + InstrCountFiltered2(node, 1); + ExecClearTuple(slot); } + } while (table_scan_bitmap_next_block(scan, &node->recheck)); - /* OK to return this tuple */ - return slot; - } +exit: /* - * if we get here it means we are at the end of the scan.. + * Release iterator */ - return ExecClearTuple(slot); + if (scan->tbmiterator) + { + tbm_end_iterate(scan->tbmiterator); + scan->tbmiterator = NULL; + } + else if (scan->shared_tbmiterator) + { + tbm_end_shared_iterate(scan->shared_tbmiterator); + scan->shared_tbmiterator = NULL; + } + return NULL; } /* @@ -353,215 +187,6 @@ BitmapDoneInitializingSharedState(ParallelBitmapHeapState *pstate) ConditionVariableBroadcast(&pstate->cv); } -/* - * BitmapAdjustPrefetchIterator - Adjust the prefetch iterator - */ -static inline void -BitmapAdjustPrefetchIterator(BitmapHeapScanState *node, - TBMIterateResult *tbmres) -{ -#ifdef USE_PREFETCH - ParallelBitmapHeapState *pstate = node->pstate; - - if (pstate == NULL) - { - TBMIterator *prefetch_iterator = node->prefetch_iterator; - - if (node->prefetch_pages > 0) - { - /* The main iterator has closed the distance by one page */ - node->prefetch_pages--; - } - else if (prefetch_iterator) - { - /* Do not let the prefetch iterator get behind the main one */ - TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator); - - if (tbmpre == NULL || tbmpre->blockno != tbmres->blockno) - elog(ERROR, "prefetch and main iterators are out of sync"); - } - return; - } - - if (node->prefetch_maximum > 0) - { - TBMSharedIterator *prefetch_iterator = node->shared_prefetch_iterator; - - SpinLockAcquire(&pstate->mutex); - if (pstate->prefetch_pages > 0) - { - pstate->prefetch_pages--; - SpinLockRelease(&pstate->mutex); - } - else - { - /* Release the mutex before iterating */ - SpinLockRelease(&pstate->mutex); - - /* - * In case of shared mode, we can not ensure that the current - * blockno of the main iterator and that of the prefetch iterator - * are same. It's possible that whatever blockno we are - * prefetching will be processed by another process. Therefore, - * we don't validate the blockno here as we do in non-parallel - * case. 
- */ - if (prefetch_iterator) - tbm_shared_iterate(prefetch_iterator); - } - } -#endif /* USE_PREFETCH */ -} - -/* - * BitmapAdjustPrefetchTarget - Adjust the prefetch target - * - * Increase prefetch target if it's not yet at the max. Note that - * we will increase it to zero after fetching the very first - * page/tuple, then to one after the second tuple is fetched, then - * it doubles as later pages are fetched. - */ -static inline void -BitmapAdjustPrefetchTarget(BitmapHeapScanState *node) -{ -#ifdef USE_PREFETCH - ParallelBitmapHeapState *pstate = node->pstate; - - if (pstate == NULL) - { - if (node->prefetch_target >= node->prefetch_maximum) - /* don't increase any further */ ; - else if (node->prefetch_target >= node->prefetch_maximum / 2) - node->prefetch_target = node->prefetch_maximum; - else if (node->prefetch_target > 0) - node->prefetch_target *= 2; - else - node->prefetch_target++; - return; - } - - /* Do an unlocked check first to save spinlock acquisitions. */ - if (pstate->prefetch_target < node->prefetch_maximum) - { - SpinLockAcquire(&pstate->mutex); - if (pstate->prefetch_target >= node->prefetch_maximum) - /* don't increase any further */ ; - else if (pstate->prefetch_target >= node->prefetch_maximum / 2) - pstate->prefetch_target = node->prefetch_maximum; - else if (pstate->prefetch_target > 0) - pstate->prefetch_target *= 2; - else - pstate->prefetch_target++; - SpinLockRelease(&pstate->mutex); - } -#endif /* USE_PREFETCH */ -} - -/* - * BitmapPrefetch - Prefetch, if prefetch_pages are behind prefetch_target - */ -static inline void -BitmapPrefetch(BitmapHeapScanState *node, TableScanDesc scan) -{ -#ifdef USE_PREFETCH - ParallelBitmapHeapState *pstate = node->pstate; - - if (pstate == NULL) - { - TBMIterator *prefetch_iterator = node->prefetch_iterator; - - if (prefetch_iterator) - { - while (node->prefetch_pages < node->prefetch_target) - { - TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator); - bool skip_fetch; - - if (tbmpre == NULL) - { - /* No more pages to prefetch */ - tbm_end_iterate(prefetch_iterator); - node->prefetch_iterator = NULL; - break; - } - node->prefetch_pages++; - - /* - * If we expect not to have to actually read this heap page, - * skip this prefetch call, but continue to run the prefetch - * logic normally. (Would it be better not to increment - * prefetch_pages?) - * - * This depends on the assumption that the index AM will - * report the same recheck flag for this future heap page as - * it did for the current heap page; which is not a certainty - * but is true in many cases. - */ - skip_fetch = (node->can_skip_fetch && - (node->tbmres ? !node->tbmres->recheck : false) && - VM_ALL_VISIBLE(node->ss.ss_currentRelation, - tbmpre->blockno, - &node->pvmbuffer)); - - if (!skip_fetch) - PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno); - } - } - - return; - } - - if (pstate->prefetch_pages < pstate->prefetch_target) - { - TBMSharedIterator *prefetch_iterator = node->shared_prefetch_iterator; - - if (prefetch_iterator) - { - while (1) - { - TBMIterateResult *tbmpre; - bool do_prefetch = false; - bool skip_fetch; - - /* - * Recheck under the mutex. If some other process has already - * done enough prefetching then we need not to do anything. 
- */ - SpinLockAcquire(&pstate->mutex); - if (pstate->prefetch_pages < pstate->prefetch_target) - { - pstate->prefetch_pages++; - do_prefetch = true; - } - SpinLockRelease(&pstate->mutex); - - if (!do_prefetch) - return; - - tbmpre = tbm_shared_iterate(prefetch_iterator); - if (tbmpre == NULL) - { - /* No more pages to prefetch */ - tbm_end_shared_iterate(prefetch_iterator); - node->shared_prefetch_iterator = NULL; - break; - } - - /* As above, skip prefetch if we expect not to need page */ - skip_fetch = (node->can_skip_fetch && - (node->tbmres ? !node->tbmres->recheck : false) && - VM_ALL_VISIBLE(node->ss.ss_currentRelation, - tbmpre->blockno, - &node->pvmbuffer)); - - if (!skip_fetch) - PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno); - } - } - } -#endif /* USE_PREFETCH */ -} - /* * BitmapHeapRecheck -- access method routine to recheck a tuple in EvalPlanQual */ @@ -607,29 +232,10 @@ ExecReScanBitmapHeapScan(BitmapHeapScanState *node) table_rescan(node->ss.ss_currentScanDesc, NULL); /* release bitmaps and buffers if any */ - if (node->tbmiterator) - tbm_end_iterate(node->tbmiterator); - if (node->prefetch_iterator) - tbm_end_iterate(node->prefetch_iterator); - if (node->shared_tbmiterator) - tbm_end_shared_iterate(node->shared_tbmiterator); - if (node->shared_prefetch_iterator) - tbm_end_shared_iterate(node->shared_prefetch_iterator); if (node->tbm) tbm_free(node->tbm); - if (node->vmbuffer != InvalidBuffer) - ReleaseBuffer(node->vmbuffer); - if (node->pvmbuffer != InvalidBuffer) - ReleaseBuffer(node->pvmbuffer); node->tbm = NULL; - node->tbmiterator = NULL; - node->tbmres = NULL; - node->prefetch_iterator = NULL; node->initialized = false; - node->shared_tbmiterator = NULL; - node->shared_prefetch_iterator = NULL; - node->vmbuffer = InvalidBuffer; - node->pvmbuffer = InvalidBuffer; ExecScanReScan(&node->ss); @@ -675,20 +281,8 @@ ExecEndBitmapHeapScan(BitmapHeapScanState *node) /* * release bitmaps and buffers if any */ - if (node->tbmiterator) - tbm_end_iterate(node->tbmiterator); - if (node->prefetch_iterator) - tbm_end_iterate(node->prefetch_iterator); if (node->tbm) tbm_free(node->tbm); - if (node->shared_tbmiterator) - tbm_end_shared_iterate(node->shared_tbmiterator); - if (node->shared_prefetch_iterator) - tbm_end_shared_iterate(node->shared_prefetch_iterator); - if (node->vmbuffer != InvalidBuffer) - ReleaseBuffer(node->vmbuffer); - if (node->pvmbuffer != InvalidBuffer) - ReleaseBuffer(node->pvmbuffer); /* * close heap scan @@ -707,6 +301,7 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags) { BitmapHeapScanState *scanstate; Relation currentRelation; + bool can_skip_fetch; /* check for unsupported flags */ Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); @@ -726,32 +321,10 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags) scanstate->ss.ps.ExecProcNode = ExecBitmapHeapScan; scanstate->tbm = NULL; - scanstate->tbmiterator = NULL; - scanstate->tbmres = NULL; - scanstate->return_empty_tuples = 0; - scanstate->vmbuffer = InvalidBuffer; - scanstate->pvmbuffer = InvalidBuffer; - scanstate->exact_pages = 0; - scanstate->lossy_pages = 0; - scanstate->prefetch_iterator = NULL; - scanstate->prefetch_pages = 0; - scanstate->prefetch_target = 0; scanstate->pscan_len = 0; scanstate->initialized = false; - scanstate->shared_tbmiterator = NULL; - scanstate->shared_prefetch_iterator = NULL; scanstate->pstate = NULL; - /* - * We can potentially skip fetching heap pages if we do not need any - * columns of the table, either for 
checking non-indexable quals or for - * returning data. This test is a bit simplistic, as it checks the - * stronger condition that there's no qual or return tlist at all. But in - * most cases it's probably not worth working harder than that. - */ - scanstate->can_skip_fetch = (node->scan.plan.qual == NIL && - node->scan.plan.targetlist == NIL); - /* * Miscellaneous initialization * @@ -790,19 +363,22 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags) scanstate->bitmapqualorig = ExecInitQual(node->bitmapqualorig, (PlanState *) scanstate); + scanstate->ss.ss_currentRelation = currentRelation; + /* - * Maximum number of prefetches for the tablespace if configured, - * otherwise the current value of the effective_io_concurrency GUC. + * We can potentially skip fetching heap pages if we do not need any + * columns of the table, either for checking non-indexable quals or for + * returning data. This test is a bit simplistic, as it checks the + * stronger condition that there's no qual or return tlist at all. But in + * most cases it's probably not worth working harder than that. */ - scanstate->prefetch_maximum = - get_tablespace_io_concurrency(currentRelation->rd_rel->reltablespace); - - scanstate->ss.ss_currentRelation = currentRelation; + can_skip_fetch = (node->scan.plan.qual == NIL && + node->scan.plan.targetlist == NIL); scanstate->ss.ss_currentScanDesc = table_beginscan_bm(currentRelation, estate->es_snapshot, 0, - NULL); + NULL, can_skip_fetch); /* * all done. @@ -887,12 +463,9 @@ ExecBitmapHeapInitializeDSM(BitmapHeapScanState *node, pstate = shm_toc_allocate(pcxt->toc, node->pscan_len); pstate->tbmiterator = 0; - pstate->prefetch_iterator = 0; /* Initialize the mutex */ SpinLockInit(&pstate->mutex); - pstate->prefetch_pages = 0; - pstate->prefetch_target = 0; pstate->state = BM_INITIAL; ConditionVariableInit(&pstate->cv); @@ -924,11 +497,7 @@ ExecBitmapHeapReInitializeDSM(BitmapHeapScanState *node, if (DsaPointerIsValid(pstate->tbmiterator)) tbm_free_shared_area(dsa, pstate->tbmiterator); - if (DsaPointerIsValid(pstate->prefetch_iterator)) - tbm_free_shared_area(dsa, pstate->prefetch_iterator); - pstate->tbmiterator = InvalidDsaPointer; - pstate->prefetch_iterator = InvalidDsaPointer; } /* ---------------------------------------------------------------- diff --git a/src/backend/nodes/tidbitmap.c b/src/backend/nodes/tidbitmap.c index 29a1858441..052c8ff91a 100644 --- a/src/backend/nodes/tidbitmap.c +++ b/src/backend/nodes/tidbitmap.c @@ -180,7 +180,6 @@ struct TBMIterator int spageptr; /* next spages index */ int schunkptr; /* next schunks index */ int schunkbit; /* next bit to check in current schunk */ - TBMIterateResult output; /* MUST BE LAST (because variable-size) */ }; /* @@ -221,7 +220,6 @@ struct TBMSharedIterator PTEntryArray *ptbase; /* pagetable element array */ PTIterationArray *ptpages; /* sorted exact page index list */ PTIterationArray *ptchunks; /* sorted lossy page index list */ - TBMIterateResult output; /* MUST BE LAST (because variable-size) */ }; /* Local function prototypes */ @@ -695,8 +693,7 @@ tbm_begin_iterate(TIDBitmap *tbm) * Create the TBMIterator struct, with enough trailing space to serve the * needs of the TBMIterateResult sub-struct. 
*/ - iterator = (TBMIterator *) palloc(sizeof(TBMIterator) + - MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber)); + iterator = (TBMIterator *) palloc(sizeof(TBMIterator)); iterator->tbm = tbm; /* @@ -957,20 +954,21 @@ tbm_advance_schunkbit(PagetableEntry *chunk, int *schunkbitp) /* * tbm_iterate - scan through next page of a TIDBitmap * - * Returns a TBMIterateResult representing one page, or NULL if there are - * no more pages to scan. Pages are guaranteed to be delivered in numerical - * order. If result->ntuples < 0, then the bitmap is "lossy" and failed to - * remember the exact tuples to look at on this page --- the caller must - * examine all tuples on the page and check if they meet the intended - * condition. If result->recheck is true, only the indicated tuples need - * be examined, but the condition must be rechecked anyway. (For ease of - * testing, recheck is always set true when ntuples < 0.) + * Caller must pass in a TBMIterateResult to be filled. + * + * Pages are guaranteed to be delivered in numerical order. tbmres->blockno is + * set to InvalidBlockNumber when there are no more pages to scan. If + * tbmres->ntuples < 0, then the bitmap is "lossy" and failed to remember the + * exact tuples to look at on this page --- the caller must examine all tuples + * on the page and check if they meet the intended condition. If + * tbmres->recheck is true, only the indicated tuples need be examined, but the + * condition must be rechecked anyway. (For ease of testing, recheck is always + * set true when ntuples < 0.) */ -TBMIterateResult * -tbm_iterate(TBMIterator *iterator) +void +tbm_iterate(TBMIterator *iterator, TBMIterateResult *tbmres) { TIDBitmap *tbm = iterator->tbm; - TBMIterateResult *output = &(iterator->output); Assert(tbm->iterating == TBM_ITERATING_PRIVATE); @@ -998,6 +996,7 @@ tbm_iterate(TBMIterator *iterator) * If both chunk and per-page data remain, must output the numerically * earlier page. */ + Assert(tbmres); if (iterator->schunkptr < tbm->nchunks) { PagetableEntry *chunk = tbm->schunks[iterator->schunkptr]; @@ -1008,11 +1007,11 @@ tbm_iterate(TBMIterator *iterator) chunk_blockno < tbm->spages[iterator->spageptr]->blockno) { /* Return a lossy page indicator from the chunk */ - output->blockno = chunk_blockno; - output->ntuples = -1; - output->recheck = true; + tbmres->blockno = chunk_blockno; + tbmres->ntuples = -1; + tbmres->recheck = true; iterator->schunkbit++; - return output; + return; } } @@ -1028,16 +1027,17 @@ tbm_iterate(TBMIterator *iterator) page = tbm->spages[iterator->spageptr]; /* scan bitmap to extract individual offset numbers */ - ntuples = tbm_extract_page_tuple(page, output); - output->blockno = page->blockno; - output->ntuples = ntuples; - output->recheck = page->recheck; + ntuples = tbm_extract_page_tuple(page, tbmres); + tbmres->blockno = page->blockno; + tbmres->ntuples = ntuples; + tbmres->recheck = page->recheck; iterator->spageptr++; - return output; + return; } /* Nothing more in the bitmap */ - return NULL; + tbmres->blockno = InvalidBlockNumber; + return; } /* @@ -1047,10 +1047,9 @@ tbm_iterate(TBMIterator *iterator) * across multiple processes. We need to acquire the iterator LWLock, * before accessing the shared members. 
*/ -TBMIterateResult * -tbm_shared_iterate(TBMSharedIterator *iterator) +void +tbm_shared_iterate(TBMSharedIterator *iterator, TBMIterateResult *tbmres) { - TBMIterateResult *output = &iterator->output; TBMSharedIteratorState *istate = iterator->state; PagetableEntry *ptbase = NULL; int *idxpages = NULL; @@ -1101,13 +1100,13 @@ tbm_shared_iterate(TBMSharedIterator *iterator) chunk_blockno < ptbase[idxpages[istate->spageptr]].blockno) { /* Return a lossy page indicator from the chunk */ - output->blockno = chunk_blockno; - output->ntuples = -1; - output->recheck = true; + tbmres->blockno = chunk_blockno; + tbmres->ntuples = -1; + tbmres->recheck = true; istate->schunkbit++; LWLockRelease(&istate->lock); - return output; + return; } } @@ -1117,21 +1116,22 @@ tbm_shared_iterate(TBMSharedIterator *iterator) int ntuples; /* scan bitmap to extract individual offset numbers */ - ntuples = tbm_extract_page_tuple(page, output); - output->blockno = page->blockno; - output->ntuples = ntuples; - output->recheck = page->recheck; + ntuples = tbm_extract_page_tuple(page, tbmres); + tbmres->blockno = page->blockno; + tbmres->ntuples = ntuples; + tbmres->recheck = page->recheck; istate->spageptr++; LWLockRelease(&istate->lock); - return output; + return; } LWLockRelease(&istate->lock); /* Nothing more in the bitmap */ - return NULL; + tbmres->blockno = InvalidBlockNumber; + return; } /* diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index 852d47b1f8..8cb4be8570 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -20,6 +20,7 @@ #include "access/skey.h" #include "access/table.h" /* for backward compatibility */ #include "access/tableam.h" +#include "nodes/execnodes.h" #include "nodes/lockoptions.h" #include "nodes/primnodes.h" #include "storage/bufpage.h" @@ -76,9 +77,15 @@ typedef struct HeapScanDescData struct PgStreamingRead *pgsr; /* these fields only used in page-at-a-time mode and for bitmap scans */ + + /* + * MFIXME: not sure if vmbuffer is being released in the right place. + */ + Buffer vmbuffer; /* current VM buffer for BHS */ int rs_cindex; /* current tuple's index in vistuples */ int rs_ntuples; /* number of visible tuples on page */ OffsetNumber rs_vistuples[MaxHeapTuplesPerPage]; /* their offsets */ + int empty_tuples; } HeapScanDescData; typedef struct HeapScanDescData *HeapScanDesc; diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index d03360eac0..6642e1a754 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -16,6 +16,7 @@ #include "access/htup_details.h" #include "access/itup.h" +#include "nodes/tidbitmap.h" #include "port/atomics.h" #include "storage/buf.h" #include "storage/spin.h" @@ -40,6 +41,13 @@ typedef struct TableScanDescData ItemPointerData rs_mintid; ItemPointerData rs_maxtid; + /* Only used for Bitmap table scans */ + TBMIterator *tbmiterator; + TBMSharedIterator *shared_tbmiterator; + long exact_pages; + long lossy_pages; + int empty_tuples; + /* * Information about type and behaviour of the scan, a bitmask of members * of the ScanOptions enum (see tableam.h). 
diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 230bc39cc0..864ca4ed10 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -32,6 +32,7 @@ extern PGDLLIMPORT char *default_table_access_method; extern PGDLLIMPORT bool synchronize_seqscans; +struct BitmapHeapScanState; struct BulkInsertStateData; struct IndexInfo; struct SampleScanState; @@ -61,7 +62,8 @@ typedef enum ScanOptions SO_ALLOW_PAGEMODE = 1 << 8, /* unregister snapshot at scan end? */ - SO_TEMP_SNAPSHOT = 1 << 9 + SO_TEMP_SNAPSHOT = 1 << 9, + SO_CAN_SKIP_FETCH = 1 << 10 } ScanOptions; /* @@ -773,53 +775,36 @@ typedef struct TableAmRoutine */ /* - * Prepare to fetch / check / return tuples from `tbmres->blockno` as part - * of a bitmap table scan. `scan` was started via table_beginscan_bm(). - * Return false if there are no tuples to be found on the page, true - * otherwise. + * Prepare to fetch / check / return tuples obtained from the streaming + * read helper as part of a bitmap table scan. `scan` was started via + * table_beginscan_bm(). Return false if the relation is exhausted and + * true otherwise. * * This will typically read and pin the target block, and do the necessary * work to allow scan_bitmap_next_tuple() to return tuples (e.g. it might * make sense to perform tuple visibility checks at this time). For some - * AMs it will make more sense to do all the work referencing `tbmres` - * contents here, for others it might be better to defer more work to - * scan_bitmap_next_tuple. + * AMs, it might be better to defer more work to scan_bitmap_next_tuple(). * - * If `tbmres->blockno` is -1, this is a lossy scan and all visible tuples - * on the page have to be returned, otherwise the tuples at offsets in - * `tbmres->offsets` need to be returned. + * After examining the page from the TBMIterateResult returned by the + * streaming read helper, most AMs will set relevant fields in the `scan` + * for the benefit of scan_bitmap_next_tuple(). All AMs must set + * `recheck`, passed in by the caller, based on the value in the + * TBMIterateResult so that BitmapHeapNext() can determine whether or not + * to yield tuples. * - * XXX: Currently this may only be implemented if the AM uses md.c as its - * storage manager, and uses ItemPointer->ip_blkid in a manner that maps - * blockids directly to the underlying storage. nodeBitmapHeapscan.c - * performs prefetching directly using that interface. This probably - * needs to be rectified at a later point. - * - * XXX: Currently this may only be implemented if the AM uses the - * visibilitymap, as nodeBitmapHeapscan.c unconditionally accesses it to - * perform prefetching. This probably needs to be rectified at a later - * point. - * - * Optional callback, but either both scan_bitmap_next_block and - * scan_bitmap_next_tuple need to exist, or neither. + * Optional callback, but scan_bitmap_setup, scan_bitmap_next_block, and + * scan_bitmap_next_tuple need to exist, or none of them. */ - bool (*scan_bitmap_next_block) (TableScanDesc scan, - struct TBMIterateResult *tbmres); + bool (*scan_bitmap_next_block) (TableScanDesc scan, bool *recheck); /* * Fetch the next tuple of a bitmap table scan into `slot` and return true * if a visible tuple was found, false otherwise. * - * For some AMs it will make more sense to do all the work referencing - * `tbmres` contents in scan_bitmap_next_block, for others it might be - * better to defer more work to this callback. 
- * - * Optional callback, but either both scan_bitmap_next_block and - * scan_bitmap_next_tuple need to exist, or neither. + * Optional callback, but scan_bitmap_setup, scan_bitmap_next_block, and + * scan_bitmap_next_tuple need to exist, or none of them. */ - bool (*scan_bitmap_next_tuple) (TableScanDesc scan, - struct TBMIterateResult *tbmres, - TupleTableSlot *slot); + bool (*scan_bitmap_next_tuple) (TableScanDesc scan, TupleTableSlot *slot); /* * Prepare to fetch tuples from the next block in a sample scan. Return @@ -944,10 +929,13 @@ table_beginscan_strat(Relation rel, Snapshot snapshot, */ static inline TableScanDesc table_beginscan_bm(Relation rel, Snapshot snapshot, - int nkeys, struct ScanKeyData *key) + int nkeys, struct ScanKeyData *key, bool can_skip_fetch) { uint32 flags = SO_TYPE_BITMAPSCAN | SO_ALLOW_PAGEMODE; + if (can_skip_fetch) + flags |= SO_CAN_SKIP_FETCH; + return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key, NULL, flags); } @@ -1955,8 +1943,7 @@ table_relation_estimate_size(Relation rel, int32 *attr_widths, * used after verifying the presence (at plan time or such). */ static inline bool -table_scan_bitmap_next_block(TableScanDesc scan, - struct TBMIterateResult *tbmres) +table_scan_bitmap_next_block(TableScanDesc scan, bool *recheck) { /* * We don't expect direct calls to table_scan_bitmap_next_block with valid @@ -1966,8 +1953,7 @@ table_scan_bitmap_next_block(TableScanDesc scan, if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan)) elog(ERROR, "unexpected table_scan_bitmap_next_block call during logical decoding"); - return scan->rs_rd->rd_tableam->scan_bitmap_next_block(scan, - tbmres); + return scan->rs_rd->rd_tableam->scan_bitmap_next_block(scan, recheck); } /* @@ -1979,9 +1965,7 @@ table_scan_bitmap_next_block(TableScanDesc scan, * returned false. 
*/ static inline bool -table_scan_bitmap_next_tuple(TableScanDesc scan, - struct TBMIterateResult *tbmres, - TupleTableSlot *slot) +table_scan_bitmap_next_tuple(TableScanDesc scan, TupleTableSlot *slot) { /* * We don't expect direct calls to table_scan_bitmap_next_tuple with valid @@ -1992,7 +1976,6 @@ table_scan_bitmap_next_tuple(TableScanDesc scan, elog(ERROR, "unexpected table_scan_bitmap_next_tuple call during logical decoding"); return scan->rs_rd->rd_tableam->scan_bitmap_next_tuple(scan, - tbmres, slot); } diff --git a/src/include/executor/nodeBitmapHeapscan.h b/src/include/executor/nodeBitmapHeapscan.h index 3a267a7fbd..578989ae7f 100644 --- a/src/include/executor/nodeBitmapHeapscan.h +++ b/src/include/executor/nodeBitmapHeapscan.h @@ -28,5 +28,6 @@ extern void ExecBitmapHeapReInitializeDSM(BitmapHeapScanState *node, ParallelContext *pcxt); extern void ExecBitmapHeapInitializeWorker(BitmapHeapScanState *node, ParallelWorkerContext *pwcxt); +extern void table_bitmap_scan_setup(BitmapHeapScanState *scanstate); #endif /* NODEBITMAPHEAPSCAN_H */ diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index cb714f4a19..6aad5f1d70 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -1676,11 +1676,7 @@ typedef enum /* ---------------- * ParallelBitmapHeapState information * tbmiterator iterator for scanning current pages - * prefetch_iterator iterator for prefetching ahead of current page - * mutex mutual exclusion for the prefetching variable - * and state - * prefetch_pages # pages prefetch iterator is ahead of current - * prefetch_target current target prefetch distance + * mutex mutual exclusion for the state * state current state of the TIDBitmap * cv conditional wait variable * phs_snapshot_data snapshot data shared to workers @@ -1689,10 +1685,7 @@ typedef enum typedef struct ParallelBitmapHeapState { dsa_pointer tbmiterator; - dsa_pointer prefetch_iterator; slock_t mutex; - int prefetch_pages; - int prefetch_target; SharedBitmapState state; ConditionVariable cv; char phs_snapshot_data[FLEXIBLE_ARRAY_MEMBER]; @@ -1703,22 +1696,9 @@ typedef struct ParallelBitmapHeapState * * bitmapqualorig execution state for bitmapqualorig expressions * tbm bitmap obtained from child index scan(s) - * tbmiterator iterator for scanning current pages - * tbmres current-page data - * can_skip_fetch can we potentially skip tuple fetches in this scan? 
- * return_empty_tuples number of empty tuples to return - * vmbuffer buffer for visibility-map lookups - * pvmbuffer ditto, for prefetched pages - * exact_pages total number of exact pages retrieved - * lossy_pages total number of lossy pages retrieved - * prefetch_iterator iterator for prefetching ahead of current page - * prefetch_pages # pages prefetch iterator is ahead of current - * prefetch_target current target prefetch distance - * prefetch_maximum maximum value for prefetch_target + * recheck whether or not the tuple needs to be rechecked before returning * pscan_len size of the shared memory for parallel bitmap * initialized is node is ready to iterate - * shared_tbmiterator shared iterator - * shared_prefetch_iterator shared iterator for prefetching * pstate shared state for parallel bitmap scan * ---------------- */ @@ -1727,22 +1707,9 @@ typedef struct BitmapHeapScanState ScanState ss; /* its first field is NodeTag */ ExprState *bitmapqualorig; TIDBitmap *tbm; - TBMIterator *tbmiterator; - TBMIterateResult *tbmres; - bool can_skip_fetch; - int return_empty_tuples; - Buffer vmbuffer; - Buffer pvmbuffer; - long exact_pages; - long lossy_pages; - TBMIterator *prefetch_iterator; - int prefetch_pages; - int prefetch_target; - int prefetch_maximum; + bool recheck; Size pscan_len; bool initialized; - TBMSharedIterator *shared_tbmiterator; - TBMSharedIterator *shared_prefetch_iterator; ParallelBitmapHeapState *pstate; } BitmapHeapScanState; diff --git a/src/include/nodes/tidbitmap.h b/src/include/nodes/tidbitmap.h index b64e36437a..e15d77dc81 100644 --- a/src/include/nodes/tidbitmap.h +++ b/src/include/nodes/tidbitmap.h @@ -64,8 +64,8 @@ extern bool tbm_is_empty(const TIDBitmap *tbm); extern TBMIterator *tbm_begin_iterate(TIDBitmap *tbm); extern dsa_pointer tbm_prepare_shared_iterate(TIDBitmap *tbm); -extern TBMIterateResult *tbm_iterate(TBMIterator *iterator); -extern TBMIterateResult *tbm_shared_iterate(TBMSharedIterator *iterator); +extern void tbm_iterate(TBMIterator *iterator, TBMIterateResult *tbmres); +extern void tbm_shared_iterate(TBMSharedIterator *iterator, TBMIterateResult *tbmres); extern void tbm_end_iterate(TBMIterator *iterator); extern void tbm_end_shared_iterate(TBMSharedIterator *iterator); extern TBMSharedIterator *tbm_attach_shared_iterate(dsa_area *dsa, -- 2.39.2