From a0b1fe9ccfdc95ee2e2a36b14fa7661a63fb0afb Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Sun, 22 Mar 2026 02:36:57 -0400 Subject: [PATCH v18 02/19] Add slot-based table AM interface for index scans. Add table_index_getnext_slot, a new table AM callback that wraps both plain and index-only index scans that use amgettuple. Two new TableAmRoutine callbacks are introduced -- one for plain scans and one for index-only scans -- which an upcoming commit that adds the amgetbatch interface will expand to four. The appropriate callback is resolved once in index_beginscan, and called through a function pointer (xs_getnext_slot) on the IndexScanDesc when the table_index_getnext_slot shim function is called from executor nodes. This moves VM checks for index-only scans and heap tuple fetching for plain index scans out of the executor and into heapam, where they can be maintained alongside the rest of the table AM logic. This higher-level interface greatly simplifies nodeIndexonlyscan.c, which no longer needs to know anything about the visibility map (nor about predicate locking). A small minority of callers (2 callers in total) continue to use the old table_index_fetch_tuple interface. This is still necessary for callers that fundamentally need to pass a TID to the table AM. In practice both callers perform some kind of constraint enforcement. XXX Do we still need IndexFetchTableData.rel? We are now mostly using IndexScanDescData.heapRelation, but we could use it for absolutely everything if we were willing to revise the signature of the old table_index_fetch_tuple interface by giving it its own heapRelation arg. Index-only scan callers pass table_index_getnext_slot a TupleTableSlot (which the table AM needs internally for heap fetches), but continue to read their results from IndexScanDescData fields such as xs_itup (rather than from the slot itself). This convention lets both plain and index-only scans use the same table_index_getnext_slot interface. 
All callers can continue to rely on the scan descriptor's xs_heaptid field being set on each call. (execCurrentOf is the only caller that reads this field directly, to determine the current row of an index-only scan, but it seems like a good idea to do the same thing for all callers). XXX Should execCurrentOf continue to do this? Can't it get the same TID from the table slot instead? The VISITED_PAGES_LIMIT mechanism used by get_actual_variable_range to cap scan overhead during planning is reworked to go through a new scan descriptor interface (xs_visited_pages_limit), rather than tracking the costs directly and terminating the scan itself, in an ad-hoc way. This is necessary because callers that use the new slot-based interface no longer have direct access to which heap blocks were fetched by the table AM. Similarly, nodeIndexonlyscan.c can no longer use InstrCountTuples2 to count heap fetches during an EXPLAIN ANALYZE. Instead it relies on heapam maintaining a new IndexScanInstrumentation.ntablefetches field. Though independently useful, this commit is also preparatory work for an upcoming commit that will add an amgetbatch index AM interface, where the table AM takes full responsibility for managing the progress of index scans. The heapam implementations make aggressive use of forced inlining to ensure that plain and index-only code paths are fully specialized at compile time despite sharing a common implementation. Testing has shown this is necessary to keep icache misses to a minimum, at least with the two upcoming amgetbatch variants. 
Author: Peter Geoghegan Reviewed-By: Andres Freund Reviewed-By: Tomas Vondra Discussion: https://postgr.es/m/CAH2-WzmYqhacBH161peAWb5eF=Ja7CFAQ+0jSEMq=qnfLVTOOg@mail.gmail.com --- src/include/access/genam.h | 5 +- src/include/access/heapam.h | 6 + src/include/access/relscan.h | 18 +- src/include/access/tableam.h | 46 ++++- src/include/executor/instrument_node.h | 5 + src/include/nodes/execnodes.h | 2 - src/backend/access/heap/heapam_handler.c | 8 +- src/backend/access/heap/heapam_indexscan.c | 223 +++++++++++++++++++++ src/backend/access/heap/visibilitymap.c | 27 ++- src/backend/access/index/genam.c | 11 +- src/backend/access/index/indexam.c | 203 ++++++------------- src/backend/commands/explain.c | 23 ++- src/backend/executor/execIndexing.c | 6 +- src/backend/executor/execReplication.c | 8 +- src/backend/executor/nodeBitmapIndexscan.c | 1 + src/backend/executor/nodeIndexonlyscan.c | 109 +--------- src/backend/executor/nodeIndexscan.c | 13 +- src/backend/utils/adt/selfuncs.c | 61 +----- 18 files changed, 440 insertions(+), 335 deletions(-) diff --git a/src/include/access/genam.h b/src/include/access/genam.h index 1a27bf060..a7d78935c 100644 --- a/src/include/access/genam.h +++ b/src/include/access/genam.h @@ -156,6 +156,7 @@ extern void index_insert_cleanup(Relation indexRelation, extern IndexScanDesc index_beginscan(Relation heapRelation, Relation indexRelation, + bool index_only_scan, Snapshot snapshot, IndexScanInstrumentation *instrument, int nkeys, int norderbys); @@ -182,14 +183,12 @@ extern void index_parallelscan_initialize(Relation heapRelation, extern void index_parallelrescan(IndexScanDesc scan); extern IndexScanDesc index_beginscan_parallel(Relation heaprel, Relation indexrel, + bool index_only_scan, IndexScanInstrumentation *instrument, int nkeys, int norderbys, ParallelIndexScanDesc pscan); extern ItemPointer index_getnext_tid(IndexScanDesc scan, ScanDirection direction); -extern bool index_fetch_heap(IndexScanDesc scan, TupleTableSlot *slot); -extern 
bool index_getnext_slot(IndexScanDesc scan, ScanDirection direction, - TupleTableSlot *slot); extern int64 index_getbitmap(IndexScanDesc scan, TIDBitmap *bitmap); extern IndexBulkDeleteResult *index_bulk_delete(IndexVacuumInfo *info, diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index 62f15e0b1..8f9cc2f88 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -436,6 +436,12 @@ extern TransactionId heap_index_delete_tuples(Relation rel, extern IndexFetchTableData *heapam_index_fetch_begin(Relation rel); extern void heapam_index_fetch_reset(IndexFetchTableData *scan); extern void heapam_index_fetch_end(IndexFetchTableData *scan); +extern bool heapam_index_plain_amgettuple_getnext_slot(IndexScanDesc scan, + ScanDirection direction, + TupleTableSlot *slot); +extern bool heapam_index_only_amgettuple_getnext_slot(IndexScanDesc scan, + ScanDirection direction, + TupleTableSlot *slot); extern bool heapam_index_fetch_tuple(struct IndexFetchTableData *scan, ItemPointer tid, Snapshot snapshot, TupleTableSlot *slot, bool *heap_continue, diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index ce340c076..9e32f0209 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -16,6 +16,7 @@ #include "access/htup_details.h" #include "access/itup.h" +#include "access/sdir.h" #include "nodes/tidbitmap.h" #include "port/atomics.h" #include "storage/relfilelocator.h" @@ -24,6 +25,7 @@ struct ParallelTableScanDescData; +struct TupleTableSlot; /* * Generic descriptor for table scans. This is the base-class for table scans, @@ -149,6 +151,8 @@ typedef struct IndexScanDescData bool ignore_killed_tuples; /* do not return killed entries */ bool xactStartedInRecovery; /* prevents killing/seeing killed * tuples */ + /* xs_snapshot uses an MVCC snapshot? 
*/ + bool MVCCScan; /* index access method's private state */ void *opaque; /* access-method-specific info */ @@ -171,10 +175,13 @@ typedef struct IndexScanDescData struct TupleDescData *xs_hitupdesc; /* rowtype descriptor of xs_hitup */ ItemPointerData xs_heaptid; /* result */ - bool xs_heap_continue; /* T if must keep walking, potential - * further results */ IndexFetchTableData *xs_heapfetch; + /* Resolved getnext_slot implementation, set by index_beginscan */ + bool (*xs_getnext_slot) (struct IndexScanDescData *scan, + ScanDirection direction, + struct TupleTableSlot *slot); + bool xs_recheck; /* T means scan keys must be rechecked */ /* @@ -188,6 +195,13 @@ typedef struct IndexScanDescData bool *xs_orderbynulls; bool xs_recheckorderby; + /* + * An approximate limit on the amount of work, measured in pages touched, + * imposed on the index scan. The default, 0, means no limit. Used by + * selfuncs.c to bound the cost of get_actual_variable_endpoint(). + */ + uint8 xs_visited_pages_limit; + /* parallel index scan information, in shared memory */ struct ParallelIndexScanDescData *parallel_scan; } IndexScanDescData; diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 060847522..090d0c556 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -418,7 +418,9 @@ typedef struct TableAmRoutine * IndexFetchTableData, which the AM will typically embed in a larger * structure with additional information. * - * Tuples for an index scan can then be fetched via index_fetch_tuple. + * Tuples for an index scan can then be fetched via one of the + * slot-based callbacks called through table_index_getnext_slot, or via + * the lower-level TID-based index_fetch_tuple interface. 
*/ struct IndexFetchTableData *(*index_fetch_begin) (Relation rel); @@ -433,11 +435,30 @@ typedef struct TableAmRoutine */ void (*index_fetch_end) (struct IndexFetchTableData *data); + /* + * Fetch the next tuple from an index scan, scanning in the specified + * direction, and return true if a tuple was found, false otherwise. + * + * Two variants cover {plain, index-only} index scans that use amgettuple. + * index_beginscan resolves which variant to use. Callers use + * table_index_getnext_slot(), which calls through that pointer directly. + */ + bool (*index_plain_amgettuple_getnext_slot) (IndexScanDesc scan, + ScanDirection direction, + TupleTableSlot *slot); + bool (*index_only_amgettuple_getnext_slot) (IndexScanDesc scan, + ScanDirection direction, + TupleTableSlot *slot); + /* * Fetch tuple at `tid` into `slot`, after doing a visibility test * according to `snapshot`. If a tuple was found and passed the visibility * test, return true, false otherwise. * + * This is a lower-level callback that takes a TID from the caller. + * Callers should favor the table_index_getnext_slot callbacks whenever + * possible. + * * Note that AMs that do not necessarily update indexes when indexed * columns do not change, need to return the current/correct version of * the tuple that is visible to the snapshot, even if the tid points to an @@ -1207,6 +1228,29 @@ table_index_fetch_end(struct IndexFetchTableData *scan) scan->rel->rd_tableam->index_fetch_end(scan); } +/* + * Fetch the next tuple from an index scan into `slot`, scanning in the + * specified direction. Returns true if a tuple was found, false otherwise. + * + * Dispatches through scan->xs_getnext_slot, which is resolved once by + * index_beginscan. 
+ * + * On success, the following IndexScanDesc fields are set by the callback: + * + * xs_recheck - true if scan keys must be rechecked against the tuple + * xs_heaptid - table TID of the returned tuple + * + * For index-only scans, the callback also fills xs_itup/xs_itupdesc or + * xs_hitup/xs_hitupdesc (or both) so that index data can be returned without + * a heap fetch. + */ +static inline bool +table_index_getnext_slot(IndexScanDesc iscan, ScanDirection direction, + TupleTableSlot *slot) +{ + return iscan->xs_getnext_slot(iscan, direction, slot); +} + /* * Fetches, as part of an index scan, tuple at `tid` into `slot`, after doing * a visibility test according to `snapshot`. If a tuple was found and passed diff --git a/src/include/executor/instrument_node.h b/src/include/executor/instrument_node.h index 8847d7f94..c33f64675 100644 --- a/src/include/executor/instrument_node.h +++ b/src/include/executor/instrument_node.h @@ -48,6 +48,11 @@ typedef struct IndexScanInstrumentation { /* Index search count (incremented with pgstat_count_index_scan call) */ uint64 nsearches; + + /* + * table blocks fetched count (incremented during index-only scans) + */ + uint64 ntablefetches; } IndexScanInstrumentation; /* diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 684e398f8..4c6a06523 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -1763,7 +1763,6 @@ typedef struct IndexScanState * Instrument local index scan instrumentation * SharedInfo parallel worker instrumentation (no leader entry) * TableSlot slot for holding tuples fetched from the table - * VMBuffer buffer in use for visibility map testing, if any * PscanLen size of parallel index-only scan descriptor * NameCStringAttNums attnums of name typed columns to pad to NAMEDATALEN * NameCStringCount number of elements in the NameCStringAttNums array @@ -1786,7 +1785,6 @@ typedef struct IndexOnlyScanState IndexScanInstrumentation *ioss_Instrument; 
SharedIndexScanInstrumentation *ioss_SharedInfo; TupleTableSlot *ioss_TableSlot; - Buffer ioss_VMBuffer; Size ioss_PscanLen; AttrNumber *ioss_NameCStringAttNums; int ioss_NameCStringCount; diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index eff276f0d..32226dce2 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -654,7 +654,8 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap, tableScan = NULL; heapScan = NULL; - indexScan = index_beginscan(OldHeap, OldIndex, SnapshotAny, NULL, 0, 0); + indexScan = index_beginscan(OldHeap, OldIndex, false, SnapshotAny, + NULL, 0, 0); index_rescan(indexScan, NULL, 0, NULL, 0); } else @@ -691,7 +692,8 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap, if (indexScan != NULL) { - if (!index_getnext_slot(indexScan, ForwardScanDirection, slot)) + if (!table_index_getnext_slot(indexScan, ForwardScanDirection, + slot)) break; /* Since we used no scan keys, should never need to recheck */ @@ -2550,6 +2552,8 @@ static const TableAmRoutine heapam_methods = { .index_fetch_begin = heapam_index_fetch_begin, .index_fetch_reset = heapam_index_fetch_reset, .index_fetch_end = heapam_index_fetch_end, + .index_plain_amgettuple_getnext_slot = heapam_index_plain_amgettuple_getnext_slot, + .index_only_amgettuple_getnext_slot = heapam_index_only_amgettuple_getnext_slot, .index_fetch_tuple = heapam_index_fetch_tuple, .tuple_insert = heapam_tuple_insert, diff --git a/src/backend/access/heap/heapam_indexscan.c b/src/backend/access/heap/heapam_indexscan.c index 4e8230834..de812a462 100644 --- a/src/backend/access/heap/heapam_indexscan.c +++ b/src/backend/access/heap/heapam_indexscan.c @@ -14,9 +14,27 @@ */ #include "postgres.h" +#include "access/amapi.h" #include "access/heapam.h" +#include "access/visibilitymap.h" +#include "storage/predicate.h" +#include "utils/pgstat_internal.h" +static pg_attribute_always_inline 
bool heapam_index_fetch_tuple_impl(struct IndexFetchTableData *scan, + ItemPointer tid, + Snapshot snapshot, + TupleTableSlot *slot, + bool *heap_continue, + bool *all_dead); +static pg_attribute_always_inline bool heapam_index_getnext_slot(IndexScanDesc scan, + ScanDirection direction, + TupleTableSlot *slot, + bool index_only); +static pg_attribute_always_inline bool heapam_index_fetch_heap(IndexScanDesc scan, + TupleTableSlot *slot, + bool *heap_continue); + /* ------------------------------------------------------------------------ * Index Scan Callbacks for heap AM * ------------------------------------------------------------------------ @@ -62,6 +80,30 @@ heapam_index_fetch_end(IndexFetchTableData *scan) pfree(hscan); } +/* amgettuple, plain index scan */ +pg_attribute_hot bool +heapam_index_plain_amgettuple_getnext_slot(IndexScanDesc scan, + ScanDirection direction, + TupleTableSlot *slot) +{ + Assert(!scan->xs_want_itup); + Assert(scan->indexRelation->rd_indam->amgettuple != NULL); + + return heapam_index_getnext_slot(scan, direction, slot, false); +} + +/* amgettuple, index-only scan */ +pg_attribute_hot bool +heapam_index_only_amgettuple_getnext_slot(IndexScanDesc scan, + ScanDirection direction, + TupleTableSlot *slot) +{ + Assert(scan->xs_want_itup); + Assert(scan->indexRelation->rd_indam->amgettuple != NULL); + + return heapam_index_getnext_slot(scan, direction, slot, true); +} + bool heapam_index_fetch_tuple(struct IndexFetchTableData *scan, ItemPointer tid, @@ -69,6 +111,18 @@ heapam_index_fetch_tuple(struct IndexFetchTableData *scan, TupleTableSlot *slot, bool *heap_continue, bool *all_dead) { + + return heapam_index_fetch_tuple_impl(scan, tid, snapshot, slot, + heap_continue, all_dead); +} + +static pg_attribute_always_inline bool +heapam_index_fetch_tuple_impl(struct IndexFetchTableData *scan, + ItemPointer tid, + Snapshot snapshot, + TupleTableSlot *slot, + bool *heap_continue, bool *all_dead) +{ IndexFetchHeapData *hscan = (IndexFetchHeapData 
*) scan; BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot; bool got_heap_tuple; @@ -124,3 +178,172 @@ heapam_index_fetch_tuple(struct IndexFetchTableData *scan, return got_heap_tuple; } + +/* + * Common implementation for both heapam_index_*_getnext_slot variants. + * + * The result is true if a tuple satisfying the scan keys and the snapshot was + * found, false otherwise. The tuple is stored in the specified slot. + * + * On success, resources (like buffer pins) are likely to be held, and will be + * dropped by a future call here (or by a later call to heapam_index_fetch_end + * through index_endscan). + * + * The index_only parameter is a compile-time constant at each call site, + * allowing the compiler to specialize the code for each variant. + */ +static pg_attribute_always_inline bool +heapam_index_getnext_slot(IndexScanDesc scan, ScanDirection direction, + TupleTableSlot *slot, bool index_only) +{ + IndexFetchHeapData *hscan = (IndexFetchHeapData *) scan->xs_heapfetch; + ItemPointer tid; + bool heap_continue = false; + bool all_visible = false; + BlockNumber last_visited_block = InvalidBlockNumber; + uint8 n_visited_pages = 0, + xs_visited_pages_limit = 0; + + if (index_only) + xs_visited_pages_limit = scan->xs_visited_pages_limit; + + for (;;) + { + if (!heap_continue) + { + /* Get the next TID from the index */ + tid = index_getnext_tid(scan, direction); + + /* If we're out of index entries, we're done */ + if (tid == NULL) + break; + + /* For index-only scans, check the visibility map */ + if (index_only) + all_visible = VM_ALL_VISIBLE(scan->heapRelation, + ItemPointerGetBlockNumber(tid), + &hscan->xs_vmbuffer); + } + + Assert(ItemPointerIsValid(&scan->xs_heaptid)); + + if (index_only) + { + /* + * We can skip the heap fetch if the TID references a heap page on + * which all tuples are known visible to everybody. In any case, + * we'll use the index tuple not the heap tuple as the data + * source. 
+ */ + if (!all_visible) + { + /* + * Rats, we have to visit the heap to check visibility. + */ + if (scan->instrument) + scan->instrument->ntablefetches++; + + if (!heapam_index_fetch_heap(scan, slot, &heap_continue)) + { + /* + * No visible tuple. If caller set a visited-pages limit + * (only selfuncs.c does this), count distinct heap pages + * and give up once we've visited too many. + */ + if (unlikely(xs_visited_pages_limit > 0)) + { + BlockNumber blk = ItemPointerGetBlockNumber(tid); + + if (blk != last_visited_block) + { + last_visited_block = blk; + if (++n_visited_pages > xs_visited_pages_limit) + return false; /* give up */ + } + } + continue; /* no visible tuple, try next index entry */ + } + + ExecClearTuple(slot); + + /* + * Only MVCC snapshots are supported with standard index-only + * scans, so there should be no need to keep following the HOT + * chain once a visible entry has been found. Other callers + * (currently only selfuncs.c) use SnapshotNonVacuumable, and + * want us to assume that just having one visible tuple in the + * hot chain is always good enough. + */ + Assert(!(heap_continue && IsMVCCSnapshot(scan->xs_snapshot))); + } + else + { + /* + * We didn't access the heap, so we'll need to take a + * predicate lock explicitly, as if we had. For now we do + * that at page level. + */ + PredicateLockPage(scan->heapRelation, + ItemPointerGetBlockNumber(tid), + scan->xs_snapshot); + } + + /* + * Return matching index tuple now set in scan->xs_itup (or return + * matching heap tuple now set in scan->xs_hitup) + */ + return true; + } + else + { + /* + * Fetch the next (or only) visible heap tuple for this index + * entry. If we don't find anything, loop around and grab the + * next TID from the index. + */ + if (heapam_index_fetch_heap(scan, slot, &heap_continue)) + return true; + } + } + + return false; +} + +/* + * Get the scan's next heap tuple. 
+ * + * The result is a visible heap tuple associated with the index TID most + * recently fetched by our caller in scan->xs_heaptid, or NULL if no more + * matching tuples exist. (There can be more than one matching tuple because + * of HOT chains, although when using an MVCC snapshot it should be impossible + * for more than one such tuple to exist.) + * + * On success, the buffer containing the heap tup is pinned. The pin must be + * dropped elsewhere. + */ +static pg_attribute_always_inline bool +heapam_index_fetch_heap(IndexScanDesc scan, TupleTableSlot *slot, + bool *heap_continue) +{ + bool all_dead = false; + bool found; + + found = heapam_index_fetch_tuple_impl(scan->xs_heapfetch, &scan->xs_heaptid, + scan->xs_snapshot, slot, + heap_continue, &all_dead); + + if (found) + pgstat_count_heap_fetch(scan->indexRelation); + + /* + * If we scanned a whole HOT chain and found only dead tuples, tell index + * AM to kill its entry for that TID (this will take effect in the next + * amgettuple call, in index_getnext_tid). We do not do this when in + * recovery because it may violate MVCC to do so. See comments in + * RelationGetIndexScan(). + */ + if (!scan->xactStartedInRecovery) + scan->kill_prior_tuple = all_dead; + + return found; +} diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c index 4fd470702..4ba9f48e9 100644 --- a/src/backend/access/heap/visibilitymap.c +++ b/src/backend/access/heap/visibilitymap.c @@ -313,7 +313,32 @@ visibilitymap_set(BlockNumber heapBlk, * since we don't lock the visibility map page either, it's even possible that * someone else could have changed the bit just before we look at it, but yet * we might see the old value. It is the caller's responsibility to deal with - * all concurrency issues! + * all concurrency issues! In practice it can't be stale enough to matter for + * the primary use case: index-only scans that check whether a heap fetch can + * be skipped. 
+ * + * The argument for why it can't be stale enough to matter for the primary use + * case is as follows: + * + * Inserts: we need to detect that a VM bit was cleared by an insert right + * away, because the new tuple is present in the index but not yet visible. + * Reading the TID from the index page (under a shared lock on the index + * buffer) is serialized with the insertion of the TID into the index (under + * an exclusive lock on the same index buffer). Because the VM bit is cleared + * before the index is updated, and locking/unlocking of the index page acts + * as a full memory barrier, we are sure to see the cleared bit whenever we + * see a recently-inserted TID. + * + * Deletes: the clearing of the VM bit by a delete is NOT serialized with the + * index page access, because deletes do not update the index page (only + * VACUUM removes the index TID). So we may see a significantly stale value. + * However, we don't need to detect the delete right away, because the tuple + * remains visible until the deleting transaction commits or the statement + * ends (if it's our own transaction). In either case, the lock on the VM + * buffer will have been released (acting as a write barrier) after clearing + * the bit. And for us to have a snapshot that includes the deleting + * transaction (making the tuple invisible), we must have acquired + * ProcArrayLock after that time, acting as a read barrier. 
*/ uint8 visibilitymap_get_status(Relation rel, BlockNumber heapBlk, Buffer *vmbuf) diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c index 5e89b86a6..0ff77135d 100644 --- a/src/backend/access/index/genam.c +++ b/src/backend/access/index/genam.c @@ -126,6 +126,8 @@ RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys) scan->xs_hitup = NULL; scan->xs_hitupdesc = NULL; + scan->xs_visited_pages_limit = 0; + return scan; } @@ -454,7 +456,7 @@ systable_beginscan(Relation heapRelation, elog(ERROR, "column is not in index"); } - sysscan->iscan = index_beginscan(heapRelation, irel, + sysscan->iscan = index_beginscan(heapRelation, irel, false, snapshot, NULL, nkeys, 0); index_rescan(sysscan->iscan, idxkey, nkeys, NULL, 0); sysscan->scan = NULL; @@ -517,7 +519,8 @@ systable_getnext(SysScanDesc sysscan) if (sysscan->irel) { - if (index_getnext_slot(sysscan->iscan, ForwardScanDirection, sysscan->slot)) + if (table_index_getnext_slot(sysscan->iscan, ForwardScanDirection, + sysscan->slot)) { bool shouldFree; @@ -715,7 +718,7 @@ systable_beginscan_ordered(Relation heapRelation, if (TransactionIdIsValid(CheckXidAlive)) bsysscan = true; - sysscan->iscan = index_beginscan(heapRelation, indexRelation, + sysscan->iscan = index_beginscan(heapRelation, indexRelation, false, snapshot, NULL, nkeys, 0); index_rescan(sysscan->iscan, idxkey, nkeys, NULL, 0); sysscan->scan = NULL; @@ -734,7 +737,7 @@ systable_getnext_ordered(SysScanDesc sysscan, ScanDirection direction) HeapTuple htup = NULL; Assert(sysscan->irel); - if (index_getnext_slot(sysscan->iscan, direction, sysscan->slot)) + if (table_index_getnext_slot(sysscan->iscan, direction, sysscan->slot)) htup = ExecFetchSlotHeapTuple(sysscan->slot, false, NULL); /* See notes in systable_getnext */ diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index fbfc33159..527e06e09 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c 
@@ -24,9 +24,7 @@ * index_parallelscan_initialize - initialize parallel scan * index_parallelrescan - (re)start a parallel scan of an index * index_beginscan_parallel - join parallel index scan - * index_getnext_tid - get the next TID from a scan - * index_fetch_heap - get the scan's next heap tuple - * index_getnext_slot - get the next tuple from a scan + * index_getnext_tid - amgettuple table AM helper routine * index_getbitmap - get all tuples from a scan * index_bulk_delete - bulk deletion of index tuples * index_vacuum_cleanup - post-deletion cleanup of an index @@ -105,9 +103,15 @@ do { \ CppAsString(pname), RelationGetRelationName(scan->indexRelation)); \ } while(0) -static IndexScanDesc index_beginscan_internal(Relation indexRelation, - int nkeys, int norderbys, Snapshot snapshot, - ParallelIndexScanDesc pscan, bool temp_snap); +static pg_attribute_always_inline IndexScanDesc index_beginscan_internal(Relation indexRelation, + int nkeys, + int norderbys, + Snapshot snapshot, + ParallelIndexScanDesc pscan, + bool temp_snap, + Relation heapRelation, + bool index_only_scan, + IndexScanInstrumentation *instrument); static inline void validate_relation_as_index(Relation r); @@ -256,12 +260,11 @@ index_insert_cleanup(Relation indexRelation, IndexScanDesc index_beginscan(Relation heapRelation, Relation indexRelation, + bool index_only_scan, Snapshot snapshot, IndexScanInstrumentation *instrument, int nkeys, int norderbys) { - IndexScanDesc scan; - Assert(snapshot != InvalidSnapshot); /* Check that a historic snapshot is not used for non-catalog tables */ @@ -274,20 +277,9 @@ index_beginscan(Relation heapRelation, RelationGetRelationName(heapRelation)))); } - scan = index_beginscan_internal(indexRelation, nkeys, norderbys, snapshot, NULL, false); - - /* - * Save additional parameters into the scandesc. Everything else was set - * up by RelationGetIndexScan. 
- */ - scan->heapRelation = heapRelation; - scan->xs_snapshot = snapshot; - scan->instrument = instrument; - - /* prepare to fetch index matches from table */ - scan->xs_heapfetch = table_index_fetch_begin(heapRelation); - - return scan; + return index_beginscan_internal(indexRelation, nkeys, norderbys, snapshot, + NULL, false, heapRelation, + index_only_scan, instrument); } /* @@ -302,29 +294,24 @@ index_beginscan_bitmap(Relation indexRelation, IndexScanInstrumentation *instrument, int nkeys) { - IndexScanDesc scan; - Assert(snapshot != InvalidSnapshot); - scan = index_beginscan_internal(indexRelation, nkeys, 0, snapshot, NULL, false); - - /* - * Save additional parameters into the scandesc. Everything else was set - * up by RelationGetIndexScan. - */ - scan->xs_snapshot = snapshot; - scan->instrument = instrument; - - return scan; + return index_beginscan_internal(indexRelation, nkeys, 0, snapshot, NULL, + false, NULL, false, instrument); } /* * index_beginscan_internal --- common code for index_beginscan variants + * + * When heapRelation is not NULL, also initializes heap-side scan state: + * getnext_slot resolution and table fetch initialization. */ -static IndexScanDesc +static pg_attribute_always_inline IndexScanDesc index_beginscan_internal(Relation indexRelation, int nkeys, int norderbys, Snapshot snapshot, - ParallelIndexScanDesc pscan, bool temp_snap) + ParallelIndexScanDesc pscan, bool temp_snap, + Relation heapRelation, bool index_only_scan, + IndexScanInstrumentation *instrument) { IndexScanDesc scan; @@ -348,6 +335,31 @@ index_beginscan_internal(Relation indexRelation, scan->parallel_scan = pscan; scan->xs_temp_snap = temp_snap; + scan->xs_snapshot = snapshot; + scan->MVCCScan = IsMVCCLikeSnapshot(snapshot); + scan->instrument = instrument; + + /* + * Initialize heap-side scan state when a heap relation is provided. + * Bitmap scans don't provide one, since they access the heap separately. 
+ */ + if (heapRelation != NULL) + { + scan->heapRelation = heapRelation; + scan->xs_want_itup = index_only_scan; + + /* Resolve which getnext_slot implementation to use for this scan */ + if (index_only_scan) + scan->xs_getnext_slot = + heapRelation->rd_tableam->index_only_amgettuple_getnext_slot; + else + scan->xs_getnext_slot = + heapRelation->rd_tableam->index_plain_amgettuple_getnext_slot; + + /* prepare to fetch index matches from table */ + scan->xs_heapfetch = table_index_fetch_begin(heapRelation); + } + return scan; } @@ -379,7 +391,6 @@ index_rescan(IndexScanDesc scan, table_index_fetch_reset(scan->xs_heapfetch); scan->kill_prior_tuple = false; /* for safety */ - scan->xs_heap_continue = false; scan->indexRelation->rd_indam->amrescan(scan, keys, nkeys, orderbys, norderbys); @@ -456,7 +467,6 @@ index_restrpos(IndexScanDesc scan) table_index_fetch_reset(scan->xs_heapfetch); scan->kill_prior_tuple = false; /* for safety */ - scan->xs_heap_continue = false; scan->indexRelation->rd_indam->amrestrpos(scan); } @@ -592,40 +602,33 @@ index_parallelrescan(IndexScanDesc scan) */ IndexScanDesc index_beginscan_parallel(Relation heaprel, Relation indexrel, + bool index_only_scan, IndexScanInstrumentation *instrument, int nkeys, int norderbys, ParallelIndexScanDesc pscan) { Snapshot snapshot; - IndexScanDesc scan; Assert(RelFileLocatorEquals(heaprel->rd_locator, pscan->ps_locator)); Assert(RelFileLocatorEquals(indexrel->rd_locator, pscan->ps_indexlocator)); snapshot = RestoreSnapshot(pscan->ps_snapshot_data); RegisterSnapshot(snapshot); - scan = index_beginscan_internal(indexrel, nkeys, norderbys, snapshot, - pscan, true); - /* - * Save additional parameters into the scandesc. Everything else was set - * up by index_beginscan_internal. 
- */ - scan->heapRelation = heaprel; - scan->xs_snapshot = snapshot; - scan->instrument = instrument; - - /* prepare to fetch index matches from table */ - scan->xs_heapfetch = table_index_fetch_begin(heaprel); - - return scan; + return index_beginscan_internal(indexrel, nkeys, norderbys, snapshot, + pscan, true, heaprel, index_only_scan, + instrument); } /* ---------------- - * index_getnext_tid - get the next TID from a scan + * index_getnext_tid - amgettuple interface * * The result is the next TID satisfying the scan keys, * or NULL if no more matching tuples exist. + * + * This should only be called by table AM's index_getnext_slot implementation, + * and only given an index AM that supports the single-tuple amgettuple + * interface. * ---------------- */ ItemPointer @@ -649,7 +652,6 @@ index_getnext_tid(IndexScanDesc scan, ScanDirection direction) /* Reset kill flag immediately for safety */ scan->kill_prior_tuple = false; - scan->xs_heap_continue = false; /* If we're out of index entries, we're done */ if (!found) @@ -668,97 +670,6 @@ index_getnext_tid(IndexScanDesc scan, ScanDirection direction) return &scan->xs_heaptid; } -/* ---------------- - * index_fetch_heap - get the scan's next heap tuple - * - * The result is a visible heap tuple associated with the index TID most - * recently fetched by index_getnext_tid, or NULL if no more matching tuples - * exist. (There can be more than one matching tuple because of HOT chains, - * although when using an MVCC snapshot it should be impossible for more than - * one such tuple to exist.) - * - * On success, the buffer containing the heap tup is pinned (the pin will be - * dropped in a future index_getnext_tid, index_fetch_heap or index_endscan - * call). - * - * Note: caller must check scan->xs_recheck, and perform rechecking of the - * scan keys if required. We do not do that here because we don't have - * enough information to do it efficiently in the general case. 
- * ---------------- - */ -bool -index_fetch_heap(IndexScanDesc scan, TupleTableSlot *slot) -{ - bool all_dead = false; - bool found; - - found = table_index_fetch_tuple(scan->xs_heapfetch, &scan->xs_heaptid, - scan->xs_snapshot, slot, - &scan->xs_heap_continue, &all_dead); - - if (found) - pgstat_count_heap_fetch(scan->indexRelation); - - /* - * If we scanned a whole HOT chain and found only dead tuples, tell index - * AM to kill its entry for that TID (this will take effect in the next - * amgettuple call, in index_getnext_tid). We do not do this when in - * recovery because it may violate MVCC to do so. See comments in - * RelationGetIndexScan(). - */ - if (!scan->xactStartedInRecovery) - scan->kill_prior_tuple = all_dead; - - return found; -} - -/* ---------------- - * index_getnext_slot - get the next tuple from a scan - * - * The result is true if a tuple satisfying the scan keys and the snapshot was - * found, false otherwise. The tuple is stored in the specified slot. - * - * On success, resources (like buffer pins) are likely to be held, and will be - * dropped by a future index_getnext_tid, index_fetch_heap or index_endscan - * call). - * - * Note: caller must check scan->xs_recheck, and perform rechecking of the - * scan keys if required. We do not do that here because we don't have - * enough information to do it efficiently in the general case. - * ---------------- - */ -bool -index_getnext_slot(IndexScanDesc scan, ScanDirection direction, TupleTableSlot *slot) -{ - for (;;) - { - if (!scan->xs_heap_continue) - { - ItemPointer tid; - - /* Time to fetch the next TID from the index */ - tid = index_getnext_tid(scan, direction); - - /* If we're out of index entries, we're done */ - if (tid == NULL) - break; - - Assert(ItemPointerEquals(tid, &scan->xs_heaptid)); - } - - /* - * Fetch the next (or only) visible heap tuple for this index entry. - * If we don't find anything, loop around and grab the next TID from - * the index. 
- */ - Assert(ItemPointerIsValid(&scan->xs_heaptid)); - if (index_fetch_heap(scan, slot)) - return true; - } - - return false; -} - /* ---------------- * index_getbitmap - get all tuples at once from an index scan * diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index e4b70166b..ca759b0ac 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -136,7 +136,7 @@ static void show_recursive_union_info(RecursiveUnionState *rstate, static void show_memoize_info(MemoizeState *mstate, List *ancestors, ExplainState *es); static void show_hashagg_info(AggState *aggstate, ExplainState *es); -static void show_indexsearches_info(PlanState *planstate, ExplainState *es); +static void show_indexscan_info(PlanState *planstate, ExplainState *es); static void show_tidbitmap_info(BitmapHeapScanState *planstate, ExplainState *es); static void show_instrumentation_count(const char *qlabel, int which, @@ -1974,7 +1974,7 @@ ExplainNode(PlanState *planstate, List *ancestors, if (plan->qual) show_instrumentation_count("Rows Removed by Filter", 1, planstate, es); - show_indexsearches_info(planstate, es); + show_indexscan_info(planstate, es); break; case T_IndexOnlyScan: show_scan_qual(((IndexOnlyScan *) plan)->indexqual, @@ -1988,15 +1988,12 @@ ExplainNode(PlanState *planstate, List *ancestors, if (plan->qual) show_instrumentation_count("Rows Removed by Filter", 1, planstate, es); - if (es->analyze) - ExplainPropertyFloat("Heap Fetches", NULL, - planstate->instrument->ntuples2, 0, es); - show_indexsearches_info(planstate, es); + show_indexscan_info(planstate, es); break; case T_BitmapIndexScan: show_scan_qual(((BitmapIndexScan *) plan)->indexqualorig, "Index Cond", planstate, ancestors, es); - show_indexsearches_info(planstate, es); + show_indexscan_info(planstate, es); break; case T_BitmapHeapScan: show_scan_qual(((BitmapHeapScan *) plan)->bitmapqualorig, @@ -3860,15 +3857,16 @@ show_hashagg_info(AggState *aggstate, ExplainState *es) 
} /* - * Show the total number of index searches for a + * Show index scan related executor instrumentation for an * IndexScan/IndexOnlyScan/BitmapIndexScan node */ static void -show_indexsearches_info(PlanState *planstate, ExplainState *es) +show_indexscan_info(PlanState *planstate, ExplainState *es) { Plan *plan = planstate->plan; SharedIndexScanInstrumentation *SharedInfo = NULL; - uint64 nsearches = 0; + uint64 nsearches = 0, + ntablefetches = 0; if (!es->analyze) return; @@ -3889,6 +3887,7 @@ show_indexsearches_info(PlanState *planstate, ExplainState *es) IndexOnlyScanState *indexstate = ((IndexOnlyScanState *) planstate); nsearches = indexstate->ioss_Instrument->nsearches; + ntablefetches = indexstate->ioss_Instrument->ntablefetches; SharedInfo = indexstate->ioss_SharedInfo; break; } @@ -3912,9 +3911,13 @@ show_indexsearches_info(PlanState *planstate, ExplainState *es) IndexScanInstrumentation *winstrument = &SharedInfo->winstrument[i]; nsearches += winstrument->nsearches; + ntablefetches += winstrument->ntablefetches; } } + if (nodeTag(plan) == T_IndexOnlyScan) + ExplainPropertyUInteger("Heap Fetches", NULL, ntablefetches, es); + ExplainPropertyUInteger("Index Searches", NULL, nsearches, es); } diff --git a/src/backend/executor/execIndexing.c b/src/backend/executor/execIndexing.c index 9d071e495..3f0c8453d 100644 --- a/src/backend/executor/execIndexing.c +++ b/src/backend/executor/execIndexing.c @@ -815,10 +815,12 @@ check_exclusion_or_unique_constraint(Relation heap, Relation index, retry: conflict = false; found_self = false; - index_scan = index_beginscan(heap, index, &DirtySnapshot, NULL, indnkeyatts, 0); + index_scan = index_beginscan(heap, index, false, &DirtySnapshot, NULL, + indnkeyatts, 0); index_rescan(index_scan, scankeys, indnkeyatts, NULL, 0); - while (index_getnext_slot(index_scan, ForwardScanDirection, existing_slot)) + while (table_index_getnext_slot(index_scan, ForwardScanDirection, + existing_slot)) { TransactionId xwait; XLTW_Oper 
reason_wait; diff --git a/src/backend/executor/execReplication.c b/src/backend/executor/execReplication.c index 2497ee7ed..2f636ba3e 100644 --- a/src/backend/executor/execReplication.c +++ b/src/backend/executor/execReplication.c @@ -205,7 +205,7 @@ RelationFindReplTupleByIndex(Relation rel, Oid idxoid, skey_attoff = build_replindex_scan_key(skey, rel, idxrel, searchslot); /* Start an index scan. */ - scan = index_beginscan(rel, idxrel, &snap, NULL, skey_attoff, 0); + scan = index_beginscan(rel, idxrel, false, &snap, NULL, skey_attoff, 0); retry: found = false; @@ -213,7 +213,7 @@ retry: index_rescan(scan, skey, skey_attoff, NULL, 0); /* Try to find the tuple */ - while (index_getnext_slot(scan, ForwardScanDirection, outslot)) + while (table_index_getnext_slot(scan, ForwardScanDirection, outslot)) { /* * Avoid expensive equality check if the index is primary key or @@ -666,12 +666,12 @@ RelationFindDeletedTupleInfoByIndex(Relation rel, Oid idxoid, * not yet committed or those just committed prior to the scan are * excluded in update_most_recent_deletion_info(). */ - scan = index_beginscan(rel, idxrel, SnapshotAny, NULL, skey_attoff, 0); + scan = index_beginscan(rel, idxrel, false, SnapshotAny, NULL, skey_attoff, 0); index_rescan(scan, skey, skey_attoff, NULL, 0); /* Try to find the tuple */ - while (index_getnext_slot(scan, ForwardScanDirection, scanslot)) + while (table_index_getnext_slot(scan, ForwardScanDirection, scanslot)) { /* * Avoid expensive equality check if the index is primary key or diff --git a/src/backend/executor/nodeBitmapIndexscan.c b/src/backend/executor/nodeBitmapIndexscan.c index 70c55ee6d..7045e58ef 100644 --- a/src/backend/executor/nodeBitmapIndexscan.c +++ b/src/backend/executor/nodeBitmapIndexscan.c @@ -204,6 +204,7 @@ ExecEndBitmapIndexScan(BitmapIndexScanState *node) * which will have a new BitmapIndexScanState and zeroed stats. 
*/ winstrument->nsearches += node->biss_Instrument->nsearches; + Assert(node->biss_Instrument->ntablefetches == 0); } /* diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c index 9eab81fd1..bf4665b20 100644 --- a/src/backend/executor/nodeIndexonlyscan.c +++ b/src/backend/executor/nodeIndexonlyscan.c @@ -34,7 +34,6 @@ #include "access/relscan.h" #include "access/tableam.h" #include "access/tupdesc.h" -#include "access/visibilitymap.h" #include "catalog/pg_type.h" #include "executor/executor.h" #include "executor/instrument.h" @@ -42,7 +41,6 @@ #include "executor/nodeIndexscan.h" #include "miscadmin.h" #include "storage/bufmgr.h" -#include "storage/predicate.h" #include "utils/builtins.h" #include "utils/rel.h" @@ -66,7 +64,6 @@ IndexOnlyNext(IndexOnlyScanState *node) ScanDirection direction; IndexScanDesc scandesc; TupleTableSlot *slot; - ItemPointer tid; /* * extract necessary information from index scan node @@ -91,18 +88,14 @@ IndexOnlyNext(IndexOnlyScanState *node) * parallel. */ scandesc = index_beginscan(node->ss.ss_currentRelation, - node->ioss_RelationDesc, + node->ioss_RelationDesc, true, estate->es_snapshot, node->ioss_Instrument, node->ioss_NumScanKeys, node->ioss_NumOrderByKeys); node->ioss_ScanDesc = scandesc; - - - /* Set it up for index-only scan */ - node->ioss_ScanDesc->xs_want_itup = true; - node->ioss_VMBuffer = InvalidBuffer; + Assert(node->ioss_ScanDesc->xs_want_itup); /* * If no run-time keys to calculate or they are ready, go ahead and @@ -119,78 +112,11 @@ IndexOnlyNext(IndexOnlyScanState *node) /* * OK, now that we have what we need, fetch the next tuple. */ - while ((tid = index_getnext_tid(scandesc, direction)) != NULL) + while (table_index_getnext_slot(scandesc, direction, + node->ioss_TableSlot)) { - bool tuple_from_heap = false; - CHECK_FOR_INTERRUPTS(); - /* - * We can skip the heap fetch if the TID references a heap page on - * which all tuples are known visible to everybody. 
In any case, - * we'll use the index tuple not the heap tuple as the data source. - * - * Note on Memory Ordering Effects: visibilitymap_get_status does not - * lock the visibility map buffer, and therefore the result we read - * here could be slightly stale. However, it can't be stale enough to - * matter. - * - * We need to detect clearing a VM bit due to an insert right away, - * because the tuple is present in the index page but not visible. The - * reading of the TID by this scan (using a shared lock on the index - * buffer) is serialized with the insert of the TID into the index - * (using an exclusive lock on the index buffer). Because the VM bit - * is cleared before updating the index, and locking/unlocking of the - * index page acts as a full memory barrier, we are sure to see the - * cleared bit if we see a recently-inserted TID. - * - * Deletes do not update the index page (only VACUUM will clear out - * the TID), so the clearing of the VM bit by a delete is not - * serialized with this test below, and we may see a value that is - * significantly stale. However, we don't care about the delete right - * away, because the tuple is still visible until the deleting - * transaction commits or the statement ends (if it's our - * transaction). In either case, the lock on the VM buffer will have - * been released (acting as a write barrier) after clearing the bit. - * And for us to have a snapshot that includes the deleting - * transaction (making the tuple invisible), we must have acquired - * ProcArrayLock after that time, acting as a read barrier. - * - * It's worth going through this complexity to avoid needing to lock - * the VM buffer, which could cause significant contention. - */ - if (!VM_ALL_VISIBLE(scandesc->heapRelation, - ItemPointerGetBlockNumber(tid), - &node->ioss_VMBuffer)) - { - /* - * Rats, we have to visit the heap to check visibility. 
- */ - InstrCountTuples2(node, 1); - if (!index_fetch_heap(scandesc, node->ioss_TableSlot)) - continue; /* no visible tuple, try next index entry */ - - ExecClearTuple(node->ioss_TableSlot); - - /* - * Only MVCC snapshots are supported here, so there should be no - * need to keep following the HOT chain once a visible entry has - * been found. If we did want to allow that, we'd need to keep - * more state to remember not to call index_getnext_tid next time. - */ - if (scandesc->xs_heap_continue) - elog(ERROR, "non-MVCC snapshots are not supported in index-only scans"); - - /* - * Note: at this point we are holding a pin on the heap page, as - * recorded in scandesc->xs_cbuf. We could release that pin now, - * but it's not clear whether it's a win to do so. The next index - * entry might require a visit to the same heap page. - */ - - tuple_from_heap = true; - } - /* * Fill the scan tuple slot with data from the index. This might be * provided in either HeapTuple or IndexTuple format. Conceivably an @@ -239,16 +165,6 @@ IndexOnlyNext(IndexOnlyScanState *node) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("lossy distance functions are not supported in index-only scans"))); - - /* - * If we didn't access the heap, then we'll need to take a predicate - * lock explicitly, as if we had. For now we do that at page level. - */ - if (!tuple_from_heap) - PredicateLockPage(scandesc->heapRelation, - ItemPointerGetBlockNumber(tid), - estate->es_snapshot); - return slot; } @@ -408,13 +324,6 @@ ExecEndIndexOnlyScan(IndexOnlyScanState *node) indexRelationDesc = node->ioss_RelationDesc; indexScanDesc = node->ioss_ScanDesc; - /* Release VM buffer pin, if any. 
*/ - if (node->ioss_VMBuffer != InvalidBuffer) - { - ReleaseBuffer(node->ioss_VMBuffer); - node->ioss_VMBuffer = InvalidBuffer; - } - /* * When ending a parallel worker, copy the statistics gathered by the * worker back into shared memory so that it can be picked up by the main @@ -434,6 +343,7 @@ ExecEndIndexOnlyScan(IndexOnlyScanState *node) * which will have a new IndexOnlyScanState and zeroed stats. */ winstrument->nsearches += node->ioss_Instrument->nsearches; + winstrument->ntablefetches += node->ioss_Instrument->ntablefetches; } /* @@ -790,13 +700,12 @@ ExecIndexOnlyScanInitializeDSM(IndexOnlyScanState *node, node->ioss_ScanDesc = index_beginscan_parallel(node->ss.ss_currentRelation, - node->ioss_RelationDesc, + node->ioss_RelationDesc, true, node->ioss_Instrument, node->ioss_NumScanKeys, node->ioss_NumOrderByKeys, piscan); - node->ioss_ScanDesc->xs_want_itup = true; - node->ioss_VMBuffer = InvalidBuffer; + Assert(node->ioss_ScanDesc->xs_want_itup); /* * If no run-time keys to calculate or they are ready, go ahead and pass @@ -856,12 +765,12 @@ ExecIndexOnlyScanInitializeWorker(IndexOnlyScanState *node, node->ioss_ScanDesc = index_beginscan_parallel(node->ss.ss_currentRelation, - node->ioss_RelationDesc, + node->ioss_RelationDesc, true, node->ioss_Instrument, node->ioss_NumScanKeys, node->ioss_NumOrderByKeys, piscan); - node->ioss_ScanDesc->xs_want_itup = true; + Assert(node->ioss_ScanDesc->xs_want_itup); /* * If no run-time keys to calculate or they are ready, go ahead and pass diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c index 06143e94c..77a8b88b2 100644 --- a/src/backend/executor/nodeIndexscan.c +++ b/src/backend/executor/nodeIndexscan.c @@ -109,7 +109,7 @@ IndexNext(IndexScanState *node) * serially executing an index scan that was planned to be parallel. 
*/ scandesc = index_beginscan(node->ss.ss_currentRelation, - node->iss_RelationDesc, + node->iss_RelationDesc, false, estate->es_snapshot, node->iss_Instrument, node->iss_NumScanKeys, @@ -130,7 +130,7 @@ IndexNext(IndexScanState *node) /* * ok, now that we have what we need, fetch the next tuple. */ - while (index_getnext_slot(scandesc, direction, slot)) + while (table_index_getnext_slot(scandesc, direction, slot)) { CHECK_FOR_INTERRUPTS(); @@ -205,7 +205,7 @@ IndexNextWithReorder(IndexScanState *node) * serially executing an index scan that was planned to be parallel. */ scandesc = index_beginscan(node->ss.ss_currentRelation, - node->iss_RelationDesc, + node->iss_RelationDesc, false, estate->es_snapshot, node->iss_Instrument, node->iss_NumScanKeys, @@ -262,7 +262,7 @@ IndexNextWithReorder(IndexScanState *node) * Fetch next tuple from the index. */ next_indextuple: - if (!index_getnext_slot(scandesc, ForwardScanDirection, slot)) + if (!table_index_getnext_slot(scandesc, ForwardScanDirection, slot)) { /* * No more tuples from the index. But we still need to drain any @@ -814,6 +814,7 @@ ExecEndIndexScan(IndexScanState *node) * which will have a new IndexOnlyScanState and zeroed stats. 
*/ winstrument->nsearches += node->iss_Instrument->nsearches; + Assert(node->iss_Instrument->ntablefetches == 0); } /* @@ -1726,7 +1727,7 @@ ExecIndexScanInitializeDSM(IndexScanState *node, node->iss_ScanDesc = index_beginscan_parallel(node->ss.ss_currentRelation, - node->iss_RelationDesc, + node->iss_RelationDesc, false, node->iss_Instrument, node->iss_NumScanKeys, node->iss_NumOrderByKeys, @@ -1790,7 +1791,7 @@ ExecIndexScanInitializeWorker(IndexScanState *node, node->iss_ScanDesc = index_beginscan_parallel(node->ss.ss_currentRelation, - node->iss_RelationDesc, + node->iss_RelationDesc, false, node->iss_Instrument, node->iss_NumScanKeys, node->iss_NumOrderByKeys, diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index 53f85ccde..9f623d97b 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -102,7 +102,6 @@ #include "access/gin.h" #include "access/table.h" #include "access/tableam.h" -#include "access/visibilitymap.h" #include "catalog/pg_collation.h" #include "catalog/pg_operator.h" #include "catalog/pg_statistic.h" @@ -7121,10 +7120,6 @@ get_actual_variable_endpoint(Relation heapRel, bool have_data = false; SnapshotData SnapshotNonVacuumable; IndexScanDesc index_scan; - Buffer vmbuffer = InvalidBuffer; - BlockNumber last_heap_block = InvalidBlockNumber; - int n_visited_heap_pages = 0; - ItemPointer tid; Datum values[INDEX_MAX_KEYS]; bool isnull[INDEX_MAX_KEYS]; MemoryContext oldcontext; @@ -7172,61 +7167,25 @@ get_actual_variable_endpoint(Relation heapRel, * a huge amount of time here, so we give up once we've read too many heap * pages. When we fail for that reason, the caller will end up using * whatever extremal value is recorded in pg_statistic. + * + * We set xs_visited_pages_limit to tell the table AM to count distinct + * heap pages visited for non-visible tuples and give up after the limit + * is exceeded. 
*/ +#define VISITED_PAGES_LIMIT 100 InitNonVacuumableSnapshot(SnapshotNonVacuumable, GlobalVisTestFor(heapRel)); - index_scan = index_beginscan(heapRel, indexRel, + index_scan = index_beginscan(heapRel, indexRel, true, &SnapshotNonVacuumable, NULL, 1, 0); - /* Set it up for index-only scan */ - index_scan->xs_want_itup = true; + Assert(index_scan->xs_want_itup); + index_scan->xs_visited_pages_limit = VISITED_PAGES_LIMIT; index_rescan(index_scan, scankeys, 1, NULL, 0); /* Fetch first/next tuple in specified direction */ - while ((tid = index_getnext_tid(index_scan, indexscandir)) != NULL) + while (table_index_getnext_slot(index_scan, indexscandir, tableslot)) { - BlockNumber block = ItemPointerGetBlockNumber(tid); - - if (!VM_ALL_VISIBLE(heapRel, - block, - &vmbuffer)) - { - /* Rats, we have to visit the heap to check visibility */ - if (!index_fetch_heap(index_scan, tableslot)) - { - /* - * No visible tuple for this index entry, so we need to - * advance to the next entry. Before doing so, count heap - * page fetches and give up if we've done too many. - * - * We don't charge a page fetch if this is the same heap page - * as the previous tuple. This is on the conservative side, - * since other recently-accessed pages are probably still in - * buffers too; but it's good enough for this heuristic. - */ -#define VISITED_PAGES_LIMIT 100 - - if (block != last_heap_block) - { - last_heap_block = block; - n_visited_heap_pages++; - if (n_visited_heap_pages > VISITED_PAGES_LIMIT) - break; - } - - continue; /* no visible tuple, try next index entry */ - } - - /* We don't actually need the heap tuple for anything */ - ExecClearTuple(tableslot); - - /* - * We don't care whether there's more than one visible tuple in - * the HOT chain; if any are visible, that's good enough. - */ - } - /* * We expect that the index will return data in IndexTuple not * HeapTuple format. 
@@ -7258,8 +7217,6 @@ get_actual_variable_endpoint(Relation heapRel, break; } - if (vmbuffer != InvalidBuffer) - ReleaseBuffer(vmbuffer); index_endscan(index_scan); return have_data; -- 2.53.0