From 66fe86eeadd9980dfff5ae076f48307f39ff3aa5 Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Sun, 22 Mar 2026 02:36:57 -0400 Subject: [PATCH v23 1/8] Add slot-based table AM index scan interface. Add table_index_getnext_slot, a new table AM interface that wraps both plain and index-only index scans that use amgettuple. Two specialized implementations are introduced in heapam -- one for plain index scans and one for index-only scans -- which an upcoming commit that adds the amgetbatch interface will expand to four. The appropriate implementation is resolved once, during index_beginscan (by the table AM's index_fetch_begin callback), and called through a function pointer (xs_getnext_slot) on the IndexScanDesc whenever the table_index_getnext_slot shim function is called from executor nodes. This moves VM checks for index-only scans out of the executor and into heapam, enabling batching of visibility map lookups (though for now we continue to perform retail lookups). Using the new higher-level slot-based interface greatly simplifies nodeIndexonlyscan.c, which no longer has to deal with the visibility map directly. More importantly, this is a significant architectural improvement: table AMs can now implement index-only scans that are not tied to heapam's visibility map. A small minority of callers (just 2 in total) fundamentally need to pass a TID to the table AM (both perform constraint enforcement). These callers don't actually perform index scans (even if their TIDs are taken from an index), and have no need for most of the index scan machinery. Switch these callers over to the new fetch_tid interface (which replaces the previous TID-based index_fetch_tuple interface). All index scan callers now use the new slot-based interface (table_index_getnext_slot). Index-only scan callers pass table_index_getnext_slot a TupleTableSlot (which the table AM uses internally for heap fetches), but continue to read their results from IndexScanDescData fields such as xs_itup (rather than from the slot itself). All callers can continue to rely on the scan descriptor's xs_heaptid field being set on each call. The VISITED_PAGES_LIMIT mechanism used by get_actual_variable_range to cap scan overhead during planning is reworked to go through a new scan descriptor interface (xs_visited_pages_limit), rather than having selfuncs.c track the cost and terminate the scan itself in an ad hoc way. This is necessary because callers that use the new slot-based interface no longer have direct access to which heap blocks were fetched. Similarly, nodeIndexonlyscan.c can no longer use InstrCountTuples2 to count heap fetches during EXPLAIN ANALYZE. EXPLAIN ANALYZE now obtains this information from a new IndexScanInstrumentation field, which table AMs are required to maintain. Though independently useful, this commit is preparatory work for an upcoming commit that will add an amgetbatch index AM interface, under which the table AM takes full responsibility for managing the progress of index scans. That will move most of the implementation of scrollable cursors out of index AMs and into table AMs, making it essential that executor nodes pass the current scan direction down to the table AM. The heapam implementations make aggressive use of forced inlining to ensure that the plain and index-only code paths are fully specialized at compile time despite sharing a common implementation. Testing has shown that this is necessary to keep icache misses to a minimum, at least with the two upcoming amgetbatch variants.
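To illustrate the shape of the new interface, a sketch of the consumer side follows (illustrative only, not code from this patch; recheck_quals stands in for whatever expression rechecking the calling executor node performs). The old two-step index_getnext_tid/index_fetch_heap loop collapses into a single call:

    /* scan opened earlier via index_beginscan(heap, index, index_only, ...) */
    while (table_index_getnext_slot(scandesc, direction, slot))
    {
        /*
         * The caller remains responsible for rechecking lossy quals
         * whenever the index AM sets xs_recheck.
         */
        if (scandesc->xs_recheck && !recheck_quals(scandesc, slot))
            continue;

        /*
         * Process one match.  Index-only scan callers read from
         * scandesc->xs_itup (or xs_hitup) rather than from the slot.
         */
    }

Constraint enforcement callers use the new fetch_tid interface instead, along these lines (again just a sketch, modeled on unique_key_recheck):

    TupleTableSlot *slot = table_slot_create(rel, NULL);
    ItemPointerData tmptid = *tid;    /* mutable copy: may change on success */

    if (table_fetch_tid(rel, &tmptid, SnapshotSelf, slot, NULL))
    {
        /* some version of the row reachable from *tid is visible */
    }
    ExecDropSingleTupleTableSlot(slot);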
Author: Peter Geoghegan Reviewed-By: Andres Freund Reviewed-By: Tomas Vondra Discussion: https://postgr.es/m/CAH2-WzmYqhacBH161peAWb5eF=Ja7CFAQ+0jSEMq=qnfLVTOOg@mail.gmail.com --- src/include/access/genam.h | 5 +- src/include/access/heapam.h | 17 +- src/include/access/relscan.h | 34 ++- src/include/access/tableam.h | 208 ++++++++------- src/include/executor/instrument_node.h | 3 + src/include/nodes/execnodes.h | 2 - src/backend/access/heap/heapam_handler.c | 9 +- src/backend/access/heap/heapam_indexscan.c | 275 +++++++++++++++++++- src/backend/access/heap/visibilitymap.c | 27 +- src/backend/access/index/genam.c | 11 +- src/backend/access/index/indexam.c | 284 +++++++-------------- src/backend/access/nbtree/nbtinsert.c | 10 +- src/backend/access/table/tableam.c | 26 +- src/backend/access/table/tableamapi.c | 2 +- src/backend/commands/constraint.c | 28 +- src/backend/commands/explain.c | 23 +- src/backend/executor/execIndexing.c | 5 +- src/backend/executor/execReplication.c | 8 +- src/backend/executor/nodeBitmapIndexscan.c | 1 + src/backend/executor/nodeIndexonlyscan.c | 106 +------- src/backend/executor/nodeIndexscan.c | 9 +- src/backend/utils/adt/ri_triggers.c | 8 +- src/backend/utils/adt/selfuncs.c | 61 +---- 23 files changed, 624 insertions(+), 538 deletions(-) diff --git a/src/include/access/genam.h b/src/include/access/genam.h index b69320a7f..db62e0ca1 100644 --- a/src/include/access/genam.h +++ b/src/include/access/genam.h @@ -156,6 +156,7 @@ extern void index_insert_cleanup(Relation indexRelation, extern IndexScanDesc index_beginscan(Relation heapRelation, Relation indexRelation, + bool index_only_scan, Snapshot snapshot, IndexScanInstrumentation *instrument, int nkeys, int norderbys, @@ -183,15 +184,13 @@ extern void index_parallelscan_initialize(Relation heapRelation, extern void index_parallelrescan(IndexScanDesc scan); extern IndexScanDesc index_beginscan_parallel(Relation heaprel, Relation indexrel, + bool index_only_scan, IndexScanInstrumentation *instrument, int nkeys, int norderbys, ParallelIndexScanDesc pscan, uint32 flags); extern ItemPointer index_getnext_tid(IndexScanDesc scan, ScanDirection direction); -extern bool index_fetch_heap(IndexScanDesc scan, TupleTableSlot *slot); -extern bool index_getnext_slot(IndexScanDesc scan, ScanDirection direction, - TupleTableSlot *slot); extern int64 index_getbitmap(IndexScanDesc scan, TIDBitmap *bitmap); extern IndexBulkDeleteResult *index_bulk_delete(IndexVacuumInfo *info, diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index 5176478c2..3ca42eb93 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -128,8 +128,8 @@ typedef struct IndexFetchHeapData Buffer xs_cbuf; BlockNumber xs_blk; - /* Current heap block's corresponding page in the visibility map */ - Buffer xs_vmbuffer; + /* For visibility map checks (index-only scans and on-access pruning) */ + Buffer xs_vmbuffer; /* visibility map buffer */ } IndexFetchHeapData; /* Result codes for HeapTupleSatisfiesVacuum */ @@ -430,16 +430,15 @@ extern TransactionId heap_index_delete_tuples(Relation rel, TM_IndexDeleteOp *delstate); /* in heap/heapam_indexscan.c */ -extern IndexFetchTableData *heapam_index_fetch_begin(Relation rel, uint32 flags); -extern void heapam_index_fetch_reset(IndexFetchTableData *scan); -extern void heapam_index_fetch_end(IndexFetchTableData *scan); +extern bool heapam_fetch_tid(Relation rel, ItemPointer tid, Snapshot snapshot, + TupleTableSlot *slot, bool *all_dead); +extern IndexFetchTableData 
*heapam_index_fetch_begin(IndexScanDesc scan, + uint32 flags); +extern void heapam_index_fetch_reset(IndexScanDesc scan); +extern void heapam_index_fetch_end(IndexScanDesc scan); extern bool heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, Snapshot snapshot, HeapTuple heapTuple, bool *all_dead, bool first_call); -extern bool heapam_index_fetch_tuple(struct IndexFetchTableData *scan, - ItemPointer tid, Snapshot snapshot, - TupleTableSlot *slot, bool *heap_continue, - bool *all_dead); /* in heap/pruneheap.c */ extern void heap_page_prune_opt(Relation relation, Buffer buffer, diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index 960abf6c2..986b4f5f3 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -16,6 +16,7 @@ #include "access/htup_details.h" #include "access/itup.h" +#include "access/sdir.h" #include "nodes/tidbitmap.h" #include "port/atomics.h" #include "storage/relfilelocator.h" @@ -24,6 +25,7 @@ struct ParallelTableScanDescData; +struct TupleTableSlot; /* * Generic descriptor for table scans. This is the base-class for table scans, @@ -117,12 +119,13 @@ typedef struct ParallelBlockTableScanWorkerData *ParallelBlockTableScanWorker; /* * Base class for fetches from a table via an index. This is the base-class * for such scans, which needs to be embedded in the respective struct for - * individual AMs. + * individual table AMs. + * + * This is essentially the table AM specific portion of IndexScanDescData, + * accessed through its xs_heapfetch field. */ typedef struct IndexFetchTableData { - Relation rel; - /* * Bitmask of ScanOptions affecting the relation. No SO_INTERNAL_FLAGS are * permitted. @@ -166,10 +169,10 @@ typedef struct IndexScanDescData struct IndexScanInstrumentation *instrument; /* - * In an index-only scan, a successful amgettuple call must fill either - * xs_itup (and xs_itupdesc) or xs_hitup (and xs_hitupdesc) to provide the - * data returned by the scan. It can fill both, in which case the heap - * format will be used. + * In an index-only scan, a successful table_index_getnext_slot call must + * fill either xs_itup (and xs_itupdesc) or xs_hitup (and xs_hitupdesc) to + * provide the data returned by the scan. It can fill both, in which case + * the heap format will be used. */ IndexTuple xs_itup; /* index tuple returned by AM */ struct TupleDescData *xs_itupdesc; /* rowtype descriptor of xs_itup */ @@ -181,6 +184,14 @@ typedef struct IndexScanDescData * further results */ IndexFetchTableData *xs_heapfetch; + /* + * Resolved table_index_getnext_slot callback, which is set by + * table_index_fetch_begin at the start of amgettuple scans + */ + bool (*xs_getnext_slot) (struct IndexScanDescData *scan, + ScanDirection direction, + struct TupleTableSlot *slot); + bool xs_recheck; /* T means scan keys must be rechecked */ /* @@ -194,6 +205,13 @@ typedef struct IndexScanDescData bool *xs_orderbynulls; bool xs_recheckorderby; + /* + * An approximate limit on the amount of work, measured in pages touched, + * imposed on the index scan. The default, 0, means no limit. Used by + * selfuncs.c to bound the cost of get_actual_variable_endpoint(). 
+ */ + uint8 xs_visited_pages_limit; + /* parallel index scan information, in shared memory */ struct ParallelIndexScanDescData *parallel_scan; } IndexScanDescData; @@ -208,8 +226,6 @@ typedef struct ParallelIndexScanDescData char ps_snapshot_data[FLEXIBLE_ARRAY_MEMBER]; } ParallelIndexScanDescData; -struct TupleTableSlot; /* Struct for storage-or-index scans of system tables */ typedef struct SysScanDescData { diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 4647785fd..62016fd0b 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -38,6 +38,7 @@ typedef struct BulkInsertStateData BulkInsertStateData; typedef struct IndexInfo IndexInfo; typedef struct SampleScanState SampleScanState; typedef struct ScanKeyData ScanKeyData; +typedef struct IndexScanDescData *IndexScanDesc; typedef struct ValidateIndexState ValidateIndexState; typedef struct VacuumParams VacuumParams; @@ -442,60 +443,66 @@ typedef struct TableAmRoutine */ /* - * Prepare to fetch tuples from the relation, as needed when fetching - * tuples for an index scan. The callback has to return an - * IndexFetchTableData, which the AM will typically embed in a larger - * structure with additional information. + * Prepare to fetch tuples, as needed when fetching tuples for an index + * scan. The callback has to return an IndexFetchTableData, which the AM + * will typically embed in a larger structure with additional information. + * A pointer to this structure will be stored in the passed index scan + * descriptor's xs_heapfetch field by the caller (core executor code). * * flags is a bitmask of ScanOptions affecting underlying table scan * behavior. See scan_begin() for more information on passing these. * - * Tuples for an index scan can then be fetched via index_fetch_tuple. + * The callback is responsible for setting IndexScanDesc.xs_getnext_slot + * to the appropriate slot-based callback. Tuples can then be fetched via + * table_index_getnext_slot(). No separate slot-based callback exists in + * this struct! + * + * In principle a single general-purpose callback (stored here) would + * suffice, but using specialized variants allows the table AM to provide + * minimal code based on conditions that are fixed for the whole scan as + * an optimization (e.g., variants for plain index scans and index-only + * scans, each with fewer branches). + * + * Note that AMs that do not necessarily update indexes when indexed + * columns do not change need to return the current/correct version of + * the tuple that is visible to the snapshot, even if the tid points to an + * older version of the tuple. */ - struct IndexFetchTableData *(*index_fetch_begin) (Relation rel, uint32 flags); + struct IndexFetchTableData *(*index_fetch_begin) (IndexScanDesc scan, + uint32 flags); /* - * Reset index fetch. Typically this will release cross index fetch - * resources held in IndexFetchTableData. + * Reset index scan for a rescan. Resets table-owned resources. */ - void (*index_fetch_reset) (struct IndexFetchTableData *data); + void (*index_fetch_reset) (IndexScanDesc scan); /* - * Release resources and deallocate index fetch. + * Release resources and deallocate index scan state.
+ */ + void (*index_fetch_end) (IndexScanDesc scan); + + /* ------------------------------------------------------------------------ + * Callbacks for non-modifying operations on individual tuples + * ------------------------------------------------------------------------ */ - void (*index_fetch_end) (struct IndexFetchTableData *data); /* * Fetch tuple at `tid` into `slot`, after doing a visibility test * according to `snapshot`. If a tuple was found and passed the visibility * test, return true, false otherwise. * - * Note that AMs that do not necessarily update indexes when indexed - * columns do not change, need to return the current/correct version of - * the tuple that is visible to the snapshot, even if the tid points to an - * older version of the tuple. + * This is a lower-level callback for single-shot TID lookups used by + * constraint enforcement code (unique checks and similar). * - * *call_again is false on the first call to index_fetch_tuple for a tid. - * If there potentially is another tuple matching the tid, *call_again - * needs to be set to true by index_fetch_tuple, signaling to the caller - * that index_fetch_tuple should be called again for the same tid. - * - * *all_dead, if all_dead is not NULL, should be set to true by - * index_fetch_tuple iff it is guaranteed that no backend needs to see - * that tuple. Index AMs can use that to avoid returning that tid in - * future searches. - */ - bool (*index_fetch_tuple) (struct IndexFetchTableData *scan, - ItemPointer tid, - Snapshot snapshot, - TupleTableSlot *slot, - bool *call_again, bool *all_dead); - - - /* ------------------------------------------------------------------------ - * Callbacks for non-modifying operations on individual tuples - * ------------------------------------------------------------------------ + * *all_dead, if all_dead is not NULL, should be set to true by fetch_tid + * iff it is guaranteed that no backend needs to see that tuple. Index AMs + * can use that to avoid returning that tid in future searches. */ + bool (*fetch_tid) (Relation rel, + ItemPointer tid, + Snapshot snapshot, + TupleTableSlot *slot, + bool *all_dead); /* * Fetch tuple at `tid` into `slot`, after doing a visibility test @@ -1230,16 +1237,18 @@ table_parallelscan_reinitialize(Relation rel, ParallelTableScanDesc pscan) */ /* - * Prepare to fetch tuples from the relation, as needed when fetching tuples - * for an index scan. + * Prepare to fetch tuples from the relation as part of an index scan. + * Caller puts this in the passed index scan descriptor's xs_heapfetch field. + * + * Tuples for an index scan can then be fetched via table_index_getnext_slot(). * * flags is a bitmask of ScanOptions. No SO_INTERNAL_FLAGS are permitted. - * - * Tuples for an index scan can then be fetched via table_index_fetch_tuple(). */ static inline IndexFetchTableData * -table_index_fetch_begin(Relation rel, uint32 flags) +table_index_fetch_begin(IndexScanDesc scan, uint32 flags) { + Relation rel = scan->heapRelation; + Assert((flags & SO_INTERNAL_FLAGS) == 0); /* @@ -1250,74 +1259,102 @@ table_index_fetch_begin(Relation rel, uint32 flags) if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan)) elog(ERROR, "scan started during logical decoding"); - return rel->rd_tableam->index_fetch_begin(rel, flags); + return rel->rd_tableam->index_fetch_begin(scan, flags); } /* - * Reset index fetch. Typically this will release cross index fetch resources - * held in IndexFetchTableData. + * Reset index scan, for a rescan. 
Resets table-owned resources. */ static inline void -table_index_fetch_reset(struct IndexFetchTableData *scan) +table_index_fetch_reset(IndexScanDesc scan) { - scan->rel->rd_tableam->index_fetch_reset(scan); + Assert(scan->xs_heapfetch); + + scan->heapRelation->rd_tableam->index_fetch_reset(scan); } /* - * Release resources and deallocate index fetch. + * Release resources and deallocate the IndexFetchTableData in the scan. */ static inline void -table_index_fetch_end(struct IndexFetchTableData *scan) +table_index_fetch_end(IndexScanDesc scan) { - scan->rel->rd_tableam->index_fetch_end(scan); + Assert(scan->xs_heapfetch); + + scan->heapRelation->rd_tableam->index_fetch_end(scan); } /* - * Fetches, as part of an index scan, tuple at `tid` into `slot`, after doing - * a visibility test according to `snapshot`. If a tuple was found and passed - * the visibility test, returns true, false otherwise. Note that *tid may be - * modified when we return true (see later remarks on multiple row versions - * reachable via a single index entry). + * Fetch the next tuple from an index scan into `slot`, scanning in the + * specified direction. Returns true if a tuple satisfying the scan keys and + * the snapshot was found, false otherwise. The tuple is stored in the + * specified slot. * - * *call_again needs to be false on the first call to table_index_fetch_tuple() for - * a tid. If there potentially is another tuple matching the tid, *call_again - * will be set to true, signaling that table_index_fetch_tuple() should be called - * again for the same tid. + * Dispatches through scan->xs_getnext_slot, which is resolved once by + * the table AM's index_fetch_begin callback. * - * *all_dead, if all_dead is not NULL, will be set to true by - * table_index_fetch_tuple() iff it is guaranteed that no backend needs to see - * that tuple. Index AMs can use that to avoid returning that tid in future - * searches. + * On success, resources (like buffer pins) are likely to be held, and will be + * released by a future table_index_getnext_slot or index_endscan call. * - * The difference between this function and table_tuple_fetch_row_version() - * is that this function returns the currently visible version of a row if - * the AM supports storing multiple row versions reachable via a single index - * entry (like heap's HOT). Whereas table_tuple_fetch_row_version() only - * evaluates the tuple exactly at `tid`. Outside of index entry ->table tuple - * lookups, table_tuple_fetch_row_version() is what's usually needed. + * Note: caller must check scan->xs_recheck, and perform rechecking of the + * scan keys if required. We do not do that here because we don't have + * enough information to do it efficiently in the general case. + * + * For index-only scans, the callback also fills xs_itup/xs_itupdesc or + * xs_hitup/xs_hitupdesc (or both) so that index data can be returned without + * a heap fetch. */ static inline bool -table_index_fetch_tuple(struct IndexFetchTableData *scan, - ItemPointer tid, - Snapshot snapshot, - TupleTableSlot *slot, - bool *call_again, bool *all_dead) +table_index_getnext_slot(IndexScanDesc scan, ScanDirection direction, + TupleTableSlot *slot) { - return scan->rel->rd_tableam->index_fetch_tuple(scan, tid, snapshot, - slot, call_again, - all_dead); + Assert(scan->xs_heapfetch); + + return scan->xs_getnext_slot(scan, direction, slot); } /* - * This is a convenience wrapper around table_index_fetch_tuple() which - * returns whether there are table tuple items corresponding to an index - * entry. 
This likely is only useful to verify if there's a conflict in a - * unique index. + * Fetch tuple at `tid` into `slot`, after doing a visibility test according + * to `snapshot`. If a tuple was found and passed the visibility test, returns + * true, false otherwise. This is a low-level interface designed for use by + * constraint enforcement code, where passing a TID can't be avoided. + * + * Note that *tid may be modified when we return true (e.g. due to following a + * HOT chain in a heapam table). Caller should consider passing a pointer to + * a mutable copy of their original TID to avoid unwanted side-effects. + * + * If all_dead is not NULL, *all_dead will be set to true here iff it is + * guaranteed that no backend needs to see any tuple reachable through + * caller's TID. This means that it is safe to mark an index tuple containing + * this TID as LP_DEAD. + * + * The main difference between table_tuple_fetch_row_version() and this + * function is that we return the currently visible version of a row, which + * matters with AMs that support storing multiple row versions reachable via a + * single TID (e.g., due to heapam's HOT chains). To reliably evaluate + * exactly the tuple at `tid`, call table_tuple_fetch_row_version() instead. */ -extern bool table_index_fetch_tuple_check(Relation rel, - ItemPointer tid, - Snapshot snapshot, - bool *all_dead); +static inline bool +table_fetch_tid(Relation rel, + ItemPointer tid, + Snapshot snapshot, + TupleTableSlot *slot, + bool *all_dead) +{ + return rel->rd_tableam->fetch_tid(rel, tid, snapshot, slot, all_dead); +} + +/* + * Convenience wrapper around table_fetch_tid() for callers that just need to + * check if a tuple is visible. + * + * Caller should note the table_fetch_tid warning about *tid being modified + * when we return true in some cases. + */ +extern bool table_fetch_tid_check(Relation rel, + ItemPointer tid, + Snapshot snapshot, + bool *all_dead); /* ------------------------------------------------------------------------ @@ -1331,9 +1368,8 @@ extern bool table_index_fetch_tuple_check(Relation rel, * `snapshot`. If a tuple was found and passed the visibility test, returns * true, false otherwise. * - * See table_index_fetch_tuple's comment about what the difference between - * these functions is. It is correct to use this function outside of index - * entry->table tuple lookups. + * See table_fetch_tid's comment about what the difference between these + * functions is. 
*/ static inline bool table_tuple_fetch_row_version(Relation rel, diff --git a/src/include/executor/instrument_node.h b/src/include/executor/instrument_node.h index 2a0ff377a..78f810aab 100644 --- a/src/include/executor/instrument_node.h +++ b/src/include/executor/instrument_node.h @@ -48,6 +48,9 @@ typedef struct IndexScanInstrumentation { /* Index search count (incremented with pgstat_count_index_scan call) */ uint64 nsearches; + + /* Table tuples fetched count (incremented during index-only scans) */ + uint64 ntabletuplefetches; } IndexScanInstrumentation; /* diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 090cfccf6..0b18e74ca 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -1785,7 +1785,6 @@ typedef struct IndexScanState * Instrument local index scan instrumentation * SharedInfo parallel worker instrumentation (no leader entry) * TableSlot slot for holding tuples fetched from the table - * VMBuffer buffer in use for visibility map testing, if any * PscanLen size of parallel index-only scan descriptor * NameCStringAttNums attnums of name typed columns to pad to NAMEDATALEN * NameCStringCount number of elements in the NameCStringAttNums array @@ -1808,7 +1807,6 @@ typedef struct IndexOnlyScanState IndexScanInstrumentation *ioss_Instrument; SharedIndexScanInstrumentation *ioss_SharedInfo; TupleTableSlot *ioss_TableSlot; - Buffer ioss_VMBuffer; Size ioss_PscanLen; AttrNumber *ioss_NameCStringAttNums; int ioss_NameCStringCount; diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 07f07188d..657ae4414 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -657,8 +657,8 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap, tableScan = NULL; heapScan = NULL; - indexScan = index_beginscan(OldHeap, OldIndex, SnapshotAny, NULL, 0, 0, - SO_NONE); + indexScan = index_beginscan(OldHeap, OldIndex, false, + SnapshotAny, NULL, 0, 0, SO_NONE); index_rescan(indexScan, NULL, 0, NULL, 0); } else @@ -696,7 +696,8 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap, if (indexScan != NULL) { - if (!index_getnext_slot(indexScan, ForwardScanDirection, slot)) + if (!table_index_getnext_slot(indexScan, ForwardScanDirection, + slot)) break; /* Since we used no scan keys, should never need to recheck */ @@ -2556,7 +2557,6 @@ static const TableAmRoutine heapam_methods = { .index_fetch_begin = heapam_index_fetch_begin, .index_fetch_reset = heapam_index_fetch_reset, .index_fetch_end = heapam_index_fetch_end, - .index_fetch_tuple = heapam_index_fetch_tuple, .tuple_insert = heapam_tuple_insert, .tuple_insert_speculative = heapam_tuple_insert_speculative, @@ -2566,6 +2566,7 @@ static const TableAmRoutine heapam_methods = { .tuple_update = heapam_tuple_update, .tuple_lock = heapam_tuple_lock, + .fetch_tid = heapam_fetch_tid, .tuple_fetch_row_version = heapam_fetch_row_version, .tuple_get_latest_tid = heap_get_latest_tid, .tuple_tid_valid = heapam_tuple_tid_valid, diff --git a/src/backend/access/heap/heapam_indexscan.c b/src/backend/access/heap/heapam_indexscan.c index 33d14f1de..e1ba19d8a 100644 --- a/src/backend/access/heap/heapam_indexscan.c +++ b/src/backend/access/heap/heapam_indexscan.c @@ -14,32 +14,85 @@ */ #include "postgres.h" +#include "access/amapi.h" #include "access/heapam.h" #include "access/relscan.h" +#include "access/visibilitymap.h" #include "storage/predicate.h" +#include "utils/pgstat_internal.h" 
+static bool heapam_index_plain_tuple_getnext_slot(IndexScanDesc scan, + ScanDirection direction, + TupleTableSlot *slot); +static bool heapam_index_only_tuple_getnext_slot(IndexScanDesc scan, + ScanDirection direction, + TupleTableSlot *slot); + +/* + * Simple, single-shot TID lookup for constraint enforcement code (unique + * checks and similar). This is essentially just a heap_hot_search_buffer + * wrapper. + * + * This isn't actually related to index scans, but keeping it near + * heap_hot_search_buffer can help the compiler generate better code. + */ +bool +heapam_fetch_tid(Relation rel, ItemPointer tid, Snapshot snapshot, + TupleTableSlot *slot, bool *all_dead) +{ + BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot; + Buffer buf; + bool found; + + Assert(TTS_IS_BUFFERTUPLE(slot)); + + buf = ReadBuffer(rel, ItemPointerGetBlockNumber(tid)); + + LockBuffer(buf, BUFFER_LOCK_SHARE); + found = heap_hot_search_buffer(tid, rel, buf, snapshot, + &bslot->base.tupdata, all_dead, true); + bslot->base.tupdata.t_self = *tid; + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + + if (found) + { + slot->tts_tableOid = RelationGetRelid(rel); + ExecStorePinnedBufferHeapTuple(&bslot->base.tupdata, slot, + buf); + } + else + ReleaseBuffer(buf); + + return found; +} + /* ------------------------------------------------------------------------ * Index Scan Callbacks for heap AM * ------------------------------------------------------------------------ */ IndexFetchTableData * -heapam_index_fetch_begin(Relation rel, uint32 flags) +heapam_index_fetch_begin(IndexScanDesc scan, uint32 flags) { IndexFetchHeapData *hscan = palloc0_object(IndexFetchHeapData); - hscan->xs_base.rel = rel; hscan->xs_base.flags = flags; hscan->xs_cbuf = InvalidBuffer; hscan->xs_blk = InvalidBlockNumber; hscan->xs_vmbuffer = InvalidBuffer; + /* Resolve which getnext_slot implementation to use for this scan */ + if (scan->xs_want_itup) + scan->xs_getnext_slot = heapam_index_only_tuple_getnext_slot; + else + scan->xs_getnext_slot = heapam_index_plain_tuple_getnext_slot; + return &hscan->xs_base; } void -heapam_index_fetch_reset(IndexFetchTableData *scan) +heapam_index_fetch_reset(IndexScanDesc scan) { /* * Resets are a no-op. 
@@ -51,9 +104,9 @@ heapam_index_fetch_reset(IndexFetchTableData *scan) } void -heapam_index_fetch_end(IndexFetchTableData *scan) +heapam_index_fetch_end(IndexScanDesc scan) { - IndexFetchHeapData *hscan = (IndexFetchHeapData *) scan; + IndexFetchHeapData *hscan = (IndexFetchHeapData *) scan->xs_heapfetch; /* drop pin if there's a pinned heap page */ if (BufferIsValid(hscan->xs_cbuf)) @@ -228,14 +281,18 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, return false; } -bool -heapam_index_fetch_tuple(struct IndexFetchTableData *scan, +/* + * This is the guts of heapam_index_fetch_heap_item, where the heap tuple + * itself is actually fetched. + */ +static pg_attribute_always_inline bool +heapam_index_fetch_tuple(Relation rel, + IndexFetchHeapData *hscan, ItemPointer tid, Snapshot snapshot, TupleTableSlot *slot, bool *heap_continue, bool *all_dead) { - IndexFetchHeapData *hscan = (IndexFetchHeapData *) scan; BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot; bool got_heap_tuple; @@ -252,13 +309,12 @@ heapam_index_fetch_tuple(struct IndexFetchTableData *scan, if (BufferIsValid(hscan->xs_cbuf)) ReleaseBuffer(hscan->xs_cbuf); - hscan->xs_cbuf = ReadBuffer(hscan->xs_base.rel, hscan->xs_blk); + hscan->xs_cbuf = ReadBuffer(rel, hscan->xs_blk); /* * Prune page when it is pinned for the first time */ - heap_page_prune_opt(hscan->xs_base.rel, hscan->xs_cbuf, - &hscan->xs_vmbuffer, + heap_page_prune_opt(rel, hscan->xs_cbuf, &hscan->xs_vmbuffer, hscan->xs_base.flags & SO_HINT_REL_READ_ONLY); } @@ -268,7 +324,7 @@ heapam_index_fetch_tuple(struct IndexFetchTableData *scan, /* Obtain share-lock on the buffer so we can examine visibility */ LockBuffer(hscan->xs_cbuf, BUFFER_LOCK_SHARE); got_heap_tuple = heap_hot_search_buffer(tid, - hscan->xs_base.rel, + rel, hscan->xs_cbuf, snapshot, &bslot->base.tupdata, @@ -285,7 +341,7 @@ heapam_index_fetch_tuple(struct IndexFetchTableData *scan, */ *heap_continue = !IsMVCCLikeSnapshot(snapshot); - slot->tts_tableOid = RelationGetRelid(scan->rel); + slot->tts_tableOid = RelationGetRelid(rel); ExecStoreBufferHeapTuple(&bslot->base.tupdata, slot, hscan->xs_cbuf); } else @@ -296,3 +352,196 @@ return got_heap_tuple; } + +/* + * Get the scan's next heap item. + * + * On success, the slot contains a visible heap tuple associated with the + * index TID most recently fetched by our caller in scan->xs_heaptid; returns + * false if no more matching tuples exist. (There can be more than one + * matching tuple because of HOT chains, although when using an MVCC snapshot + * it should be impossible for more than one such tuple to exist.) + * + * On success, the buffer containing the heap tuple is pinned. The pin must + * be dropped elsewhere. + */ +static pg_attribute_always_inline bool +heapam_index_fetch_heap_item(IndexScanDesc scan, IndexFetchHeapData *hscan, + TupleTableSlot *slot, bool *heap_continue) +{ + bool all_dead = false; + bool found; + + found = heapam_index_fetch_tuple(scan->heapRelation, hscan, + &scan->xs_heaptid, + scan->xs_snapshot, slot, + heap_continue, &all_dead); + + if (found) + pgstat_count_heap_fetch(scan->indexRelation); + + /* + * If we scanned a whole HOT chain and found only dead tuples, tell index + * AM to kill its entry for that TID (this will take effect in the next + * amgettuple call, in index_getnext_tid). We do not do this when in + * recovery because it may violate MVCC to do so. See comments in + * RelationGetIndexScan().
+ */ + if (!scan->xactStartedInRecovery) + scan->kill_prior_tuple = all_dead; + + return found; +} + +/* + * Common implementation for both heapam_index_*_getnext_slot variants. + * + * The result is true if a tuple satisfying the scan keys and the snapshot was + * found, false otherwise. The tuple is stored in the specified slot. + * + * On success, resources (like buffer pins) are likely to be held, and will be + * dropped by a future call here (or by a later call to heapam_index_fetch_end + * through index_endscan). + * + * The index_only parameter is a compile-time constant at each call site, + * allowing the compiler to specialize the code for each variant. + */ +static pg_attribute_always_inline bool +heapam_index_getnext_slot(IndexScanDesc scan, ScanDirection direction, + TupleTableSlot *slot, bool index_only) +{ + IndexFetchHeapData *hscan = (IndexFetchHeapData *) scan->xs_heapfetch; + bool *heap_continue = &scan->xs_heap_continue; + bool all_visible = false; + BlockNumber last_visited_block = InvalidBlockNumber; + uint8 n_visited_pages = 0; + ItemPointer tid = NULL; + + for (;;) + { + if (!*heap_continue) + { + /* Get the next TID from the index */ + tid = index_getnext_tid(scan, direction); + + /* If we're out of index entries, we're done */ + if (tid == NULL) + break; + + /* For index-only scans, check the visibility map */ + if (index_only) + all_visible = VM_ALL_VISIBLE(scan->heapRelation, + ItemPointerGetBlockNumber(tid), + &hscan->xs_vmbuffer); + } + + Assert(ItemPointerIsValid(&scan->xs_heaptid)); + + if (index_only) + { + /* + * We can skip the heap fetch if the TID references a heap page on + * which all tuples are known visible to everybody. In any case, + * we'll use the index tuple not the heap tuple as the data + * source. + */ + if (!all_visible) + { + /* + * Rats, we have to visit the heap to check visibility. + */ + if (scan->instrument) + scan->instrument->ntabletuplefetches++; + + if (!heapam_index_fetch_heap_item(scan, hscan, slot, + heap_continue)) + { + /* + * No visible tuple. If caller set a visited-pages limit + * (only selfuncs.c does this), count distinct heap pages + * and give up once we've visited too many. + */ + if (unlikely(scan->xs_visited_pages_limit > 0)) + { + Assert(hscan->xs_blk == ItemPointerGetBlockNumber(tid)); + + if (hscan->xs_blk != last_visited_block) + { + last_visited_block = hscan->xs_blk; + if (++n_visited_pages > scan->xs_visited_pages_limit) + return false; /* give up */ + } + } + continue; /* no visible tuple, try next index entry */ + } + + /* We don't actually need the heap tuple for anything */ + ExecClearTuple(slot); + + /* + * Only MVCC snapshots are supported with standard index-only + * scans, so there should be no need to keep following the HOT + * chain once a visible entry has been found. Other callers + * (currently only selfuncs.c) use SnapshotNonVacuumable, and + * want us to assume that just having one visible tuple in the + * hot chain is always good enough. + */ + Assert(!(*heap_continue && IsMVCCSnapshot(scan->xs_snapshot))); + } + else + { + /* + * We didn't access the heap, so we'll need to take a + * predicate lock explicitly, as if we had. For now we do + * that at page level. 
+ */ + PredicateLockPage(scan->heapRelation, + ItemPointerGetBlockNumber(tid), + scan->xs_snapshot); + } + + /* + * Return matching index tuple now set in scan->xs_itup (or return + * matching heap tuple now set in scan->xs_hitup) + */ + return true; + } + else + { + /* + * Fetch the next (or only) visible heap tuple for this index + * entry. If we don't find anything, loop around and grab the + * next TID from the index. + */ + if (heapam_index_fetch_heap_item(scan, hscan, slot, + heap_continue)) + return true; + } + } + + return false; +} + +/* xs_getnext_slot callback: amgettuple, plain index scan */ +static pg_attribute_hot bool +heapam_index_plain_tuple_getnext_slot(IndexScanDesc scan, + ScanDirection direction, + TupleTableSlot *slot) +{ + Assert(!scan->xs_want_itup); + Assert(scan->indexRelation->rd_indam->amgettuple != NULL); + + return heapam_index_getnext_slot(scan, direction, slot, false); +} + +/* xs_getnext_slot callback: amgettuple, index-only scan */ +static pg_attribute_hot bool +heapam_index_only_tuple_getnext_slot(IndexScanDesc scan, + ScanDirection direction, + TupleTableSlot *slot) +{ + Assert(scan->xs_want_itup); + Assert(scan->indexRelation->rd_indam->amgettuple != NULL); + + return heapam_index_getnext_slot(scan, direction, slot, true); +} diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c index 4fd470702..4ba9f48e9 100644 --- a/src/backend/access/heap/visibilitymap.c +++ b/src/backend/access/heap/visibilitymap.c @@ -313,7 +313,32 @@ visibilitymap_set(BlockNumber heapBlk, * since we don't lock the visibility map page either, it's even possible that * someone else could have changed the bit just before we look at it, but yet * we might see the old value. It is the caller's responsibility to deal with - * all concurrency issues! + * all concurrency issues! In practice it can't be stale enough to matter for + * the primary use case: index-only scans that check whether a heap fetch can + * be skipped. + * + * The argument for why it can't be stale enough to matter for the primary use + * case is as follows: + * + * Inserts: we need to detect that a VM bit was cleared by an insert right + * away, because the new tuple is present in the index but not yet visible. + * Reading the TID from the index page (under a shared lock on the index + * buffer) is serialized with the insertion of the TID into the index (under + * an exclusive lock on the same index buffer). Because the VM bit is cleared + * before the index is updated, and locking/unlocking of the index page acts + * as a full memory barrier, we are sure to see the cleared bit whenever we + * see a recently-inserted TID. + * + * Deletes: the clearing of the VM bit by a delete is NOT serialized with the + * index page access, because deletes do not update the index page (only + * VACUUM removes the index TID). So we may see a significantly stale value. + * However, we don't need to detect the delete right away, because the tuple + * remains visible until the deleting transaction commits or the statement + * ends (if it's our own transaction). In either case, the lock on the VM + * buffer will have been released (acting as a write barrier) after clearing + * the bit. And for us to have a snapshot that includes the deleting + * transaction (making the tuple invisible), we must have acquired + * ProcArrayLock after that time, acting as a read barrier. 
*/ uint8 visibilitymap_get_status(Relation rel, BlockNumber heapBlk, Buffer *vmbuf) diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c index 1408989c5..acc9f3e6a 100644 --- a/src/backend/access/index/genam.c +++ b/src/backend/access/index/genam.c @@ -126,6 +126,8 @@ RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys) scan->xs_hitup = NULL; scan->xs_hitupdesc = NULL; + scan->xs_visited_pages_limit = 0; + return scan; } @@ -454,7 +456,7 @@ systable_beginscan(Relation heapRelation, elog(ERROR, "column is not in index"); } - sysscan->iscan = index_beginscan(heapRelation, irel, + sysscan->iscan = index_beginscan(heapRelation, irel, false, snapshot, NULL, nkeys, 0, SO_NONE); index_rescan(sysscan->iscan, idxkey, nkeys, NULL, 0); @@ -518,7 +520,8 @@ systable_getnext(SysScanDesc sysscan) if (sysscan->irel) { - if (index_getnext_slot(sysscan->iscan, ForwardScanDirection, sysscan->slot)) + if (table_index_getnext_slot(sysscan->iscan, ForwardScanDirection, + sysscan->slot)) { bool shouldFree; @@ -716,7 +719,7 @@ systable_beginscan_ordered(Relation heapRelation, if (TransactionIdIsValid(CheckXidAlive)) bsysscan = true; - sysscan->iscan = index_beginscan(heapRelation, indexRelation, + sysscan->iscan = index_beginscan(heapRelation, indexRelation, false, snapshot, NULL, nkeys, 0, SO_NONE); index_rescan(sysscan->iscan, idxkey, nkeys, NULL, 0); @@ -736,7 +739,7 @@ systable_getnext_ordered(SysScanDesc sysscan, ScanDirection direction) HeapTuple htup = NULL; Assert(sysscan->irel); - if (index_getnext_slot(sysscan->iscan, direction, sysscan->slot)) + if (table_index_getnext_slot(sysscan->iscan, direction, sysscan->slot)) htup = ExecFetchSlotHeapTuple(sysscan->slot, false, NULL); /* See notes in systable_getnext */ diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index 23288a4f9..3fac4c30d 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -24,9 +24,7 @@ * index_parallelscan_initialize - initialize parallel scan * index_parallelrescan - (re)start a parallel scan of an index * index_beginscan_parallel - join parallel index scan - * index_getnext_tid - get the next TID from a scan - * index_fetch_heap - get the scan's next heap tuple - * index_getnext_slot - get the next tuple from a scan + * index_getnext_tid - amgettuple table AM helper routine * index_getbitmap - get all tuples from a scan * index_bulk_delete - bulk deletion of index tuples * index_vacuum_cleanup - post-deletion cleanup of an index @@ -105,9 +103,6 @@ do { \ CppAsString(pname), RelationGetRelationName(scan->indexRelation)); \ } while(0) -static IndexScanDesc index_beginscan_internal(Relation indexRelation, - int nkeys, int norderbys, Snapshot snapshot, - ParallelIndexScanDesc pscan, bool temp_snap); static inline void validate_relation_as_index(Relation r); @@ -248,84 +243,17 @@ index_insert_cleanup(Relation indexRelation, indexRelation->rd_indam->aminsertcleanup(indexRelation, indexInfo); } -/* - * index_beginscan - start a scan of an index with amgettuple - * - * Caller must be holding suitable locks on the heap and the index. 
- */ -IndexScanDesc -index_beginscan(Relation heapRelation, - Relation indexRelation, - Snapshot snapshot, - IndexScanInstrumentation *instrument, - int nkeys, int norderbys, - uint32 flags) -{ - IndexScanDesc scan; - - Assert(snapshot != InvalidSnapshot); - - /* Check that a historic snapshot is not used for non-catalog tables */ - if (IsHistoricMVCCSnapshot(snapshot) && - !RelationIsAccessibleInLogicalDecoding(heapRelation)) - { - ereport(ERROR, - (errcode(ERRCODE_INVALID_TRANSACTION_STATE), - errmsg("cannot query non-catalog table \"%s\" during logical decoding", - RelationGetRelationName(heapRelation)))); - } - - scan = index_beginscan_internal(indexRelation, nkeys, norderbys, snapshot, NULL, false); - - /* - * Save additional parameters into the scandesc. Everything else was set - * up by RelationGetIndexScan. - */ - scan->heapRelation = heapRelation; - scan->xs_snapshot = snapshot; - scan->instrument = instrument; - - /* prepare to fetch index matches from table */ - scan->xs_heapfetch = table_index_fetch_begin(heapRelation, flags); - - return scan; -} - -/* - * index_beginscan_bitmap - start a scan of an index with amgetbitmap - * - * As above, caller had better be holding some lock on the parent heap - * relation, even though it's not explicitly mentioned here. - */ -IndexScanDesc -index_beginscan_bitmap(Relation indexRelation, - Snapshot snapshot, - IndexScanInstrumentation *instrument, - int nkeys) -{ - IndexScanDesc scan; - - Assert(snapshot != InvalidSnapshot); - - scan = index_beginscan_internal(indexRelation, nkeys, 0, snapshot, NULL, false); - - /* - * Save additional parameters into the scandesc. Everything else was set - * up by RelationGetIndexScan. - */ - scan->xs_snapshot = snapshot; - scan->instrument = instrument; - - return scan; -} - /* * index_beginscan_internal --- common code for index_beginscan variants + * + * When heapRelation is not NULL, also initializes table AM index scan state. */ -static IndexScanDesc -index_beginscan_internal(Relation indexRelation, +static pg_attribute_always_inline IndexScanDesc +index_beginscan_internal(Relation indexRelation, Relation heapRelation, int nkeys, int norderbys, Snapshot snapshot, - ParallelIndexScanDesc pscan, bool temp_snap) + ParallelIndexScanDesc pscan, + IndexScanInstrumentation *instrument, + bool index_only_scan, bool temp_snap, uint32 flags) { IndexScanDesc scan; @@ -349,9 +277,80 @@ index_beginscan_internal(Relation indexRelation, scan->parallel_scan = pscan; scan->xs_temp_snap = temp_snap; + scan->xs_snapshot = snapshot; + scan->instrument = instrument; + + /* + * Initialize heap-side scan state when a heap relation is provided. + * Bitmap index scans don't provide one. + */ + if (heapRelation != NULL) + { + scan->heapRelation = heapRelation; + scan->xs_want_itup = index_only_scan; + scan->xs_heap_continue = false; + + /* prepare to fetch index matches from table */ + scan->xs_heapfetch = table_index_fetch_begin(scan, flags); + + /* table AM must set xs_getnext_slot callback for us */ + Assert(scan->xs_getnext_slot != NULL); + } + return scan; } +/* + * index_beginscan - start a scan of an index with amgettuple + * + * Caller must be holding suitable locks on the heap and the index. 
+ */ +IndexScanDesc +index_beginscan(Relation heapRelation, + Relation indexRelation, + bool index_only_scan, + Snapshot snapshot, + IndexScanInstrumentation *instrument, + int nkeys, int norderbys, + uint32 flags) +{ + Assert(snapshot != InvalidSnapshot); + + /* Check that a historic snapshot is not used for non-catalog tables */ + if (IsHistoricMVCCSnapshot(snapshot) && + !RelationIsAccessibleInLogicalDecoding(heapRelation)) + { + ereport(ERROR, + (errcode(ERRCODE_INVALID_TRANSACTION_STATE), + errmsg("cannot query non-catalog table \"%s\" during logical decoding", + RelationGetRelationName(heapRelation)))); + } + + return index_beginscan_internal(indexRelation, heapRelation, + nkeys, norderbys, + snapshot, NULL, instrument, + index_only_scan, false, flags); +} + +/* + * index_beginscan_bitmap - start a scan of an index with amgetbitmap + * + * As above, caller had better be holding some lock on the parent heap + * relation, even though it's not explicitly mentioned here. + */ +IndexScanDesc +index_beginscan_bitmap(Relation indexRelation, + Snapshot snapshot, + IndexScanInstrumentation *instrument, + int nkeys) +{ + Assert(snapshot != InvalidSnapshot); + Assert(IsMVCCLikeSnapshot(snapshot)); + + return index_beginscan_internal(indexRelation, NULL, nkeys, 0, snapshot, + NULL, instrument, false, false, SO_NONE); +} + /* ---------------- * index_rescan - (re)start a scan of an index * @@ -377,7 +376,7 @@ index_rescan(IndexScanDesc scan, /* reset table AM state for rescan */ if (scan->xs_heapfetch) - table_index_fetch_reset(scan->xs_heapfetch); + table_index_fetch_reset(scan); scan->kill_prior_tuple = false; /* for safety */ scan->xs_heap_continue = false; @@ -399,7 +398,7 @@ index_endscan(IndexScanDesc scan) /* Release resources (like buffer pins) from table accesses */ if (scan->xs_heapfetch) { - table_index_fetch_end(scan->xs_heapfetch); + table_index_fetch_end(scan); scan->xs_heapfetch = NULL; } @@ -454,7 +453,7 @@ index_restrpos(IndexScanDesc scan) /* reset table AM state for restoring the marked position */ if (scan->xs_heapfetch) - table_index_fetch_reset(scan->xs_heapfetch); + table_index_fetch_reset(scan); scan->kill_prior_tuple = false; /* for safety */ scan->xs_heap_continue = false; @@ -580,7 +579,7 @@ index_parallelrescan(IndexScanDesc scan) /* reset table AM state for rescan */ if (scan->xs_heapfetch) - table_index_fetch_reset(scan->xs_heapfetch); + table_index_fetch_reset(scan); /* amparallelrescan is optional; assume no-op if not provided by AM */ if (scan->indexRelation->rd_indam->amparallelrescan != NULL) @@ -597,41 +596,33 @@ index_parallelrescan(IndexScanDesc scan) */ IndexScanDesc index_beginscan_parallel(Relation heaprel, Relation indexrel, + bool index_only_scan, IndexScanInstrumentation *instrument, int nkeys, int norderbys, ParallelIndexScanDesc pscan, uint32 flags) { Snapshot snapshot; - IndexScanDesc scan; Assert(RelFileLocatorEquals(heaprel->rd_locator, pscan->ps_locator)); Assert(RelFileLocatorEquals(indexrel->rd_locator, pscan->ps_indexlocator)); snapshot = RestoreSnapshot(pscan->ps_snapshot_data); RegisterSnapshot(snapshot); - scan = index_beginscan_internal(indexrel, nkeys, norderbys, snapshot, - pscan, true); - /* - * Save additional parameters into the scandesc. Everything else was set - * up by index_beginscan_internal. 
- */ - scan->heapRelation = heaprel; - scan->xs_snapshot = snapshot; - scan->instrument = instrument; - - /* prepare to fetch index matches from table */ - scan->xs_heapfetch = table_index_fetch_begin(heaprel, flags); - - return scan; + return index_beginscan_internal(indexrel, heaprel, nkeys, norderbys, + snapshot, pscan, instrument, + index_only_scan, true, flags); } /* ---------------- - * index_getnext_tid - get the next TID from a scan + * index_getnext_tid - amgettuple interface * * The result is the next TID satisfying the scan keys, * or NULL if no more matching tuples exist. + * + * This should only be called by table AM amgettuple-based index scan + * callbacks. * ---------------- */ ItemPointer @@ -662,7 +653,7 @@ index_getnext_tid(IndexScanDesc scan, ScanDirection direction) { /* reset table AM state */ if (scan->xs_heapfetch) - table_index_fetch_reset(scan->xs_heapfetch); + table_index_fetch_reset(scan); return NULL; } @@ -674,97 +665,6 @@ index_getnext_tid(IndexScanDesc scan, ScanDirection direction) return &scan->xs_heaptid; } -/* ---------------- - * index_fetch_heap - get the scan's next heap tuple - * - * The result is a visible heap tuple associated with the index TID most - * recently fetched by index_getnext_tid, or NULL if no more matching tuples - * exist. (There can be more than one matching tuple because of HOT chains, - * although when using an MVCC snapshot it should be impossible for more than - * one such tuple to exist.) - * - * On success, the buffer containing the heap tup is pinned (the pin will be - * dropped in a future index_getnext_tid, index_fetch_heap or index_endscan - * call). - * - * Note: caller must check scan->xs_recheck, and perform rechecking of the - * scan keys if required. We do not do that here because we don't have - * enough information to do it efficiently in the general case. - * ---------------- - */ -bool -index_fetch_heap(IndexScanDesc scan, TupleTableSlot *slot) -{ - bool all_dead = false; - bool found; - - found = table_index_fetch_tuple(scan->xs_heapfetch, &scan->xs_heaptid, - scan->xs_snapshot, slot, - &scan->xs_heap_continue, &all_dead); - - if (found) - pgstat_count_heap_fetch(scan->indexRelation); - - /* - * If we scanned a whole HOT chain and found only dead tuples, tell index - * AM to kill its entry for that TID (this will take effect in the next - * amgettuple call, in index_getnext_tid). We do not do this when in - * recovery because it may violate MVCC to do so. See comments in - * RelationGetIndexScan(). - */ - if (!scan->xactStartedInRecovery) - scan->kill_prior_tuple = all_dead; - - return found; -} - -/* ---------------- - * index_getnext_slot - get the next tuple from a scan - * - * The result is true if a tuple satisfying the scan keys and the snapshot was - * found, false otherwise. The tuple is stored in the specified slot. - * - * On success, resources (like buffer pins) are likely to be held, and will be - * dropped by a future index_getnext_tid, index_fetch_heap or index_endscan - * call). - * - * Note: caller must check scan->xs_recheck, and perform rechecking of the - * scan keys if required. We do not do that here because we don't have - * enough information to do it efficiently in the general case. 
- * ---------------- - */ -bool -index_getnext_slot(IndexScanDesc scan, ScanDirection direction, TupleTableSlot *slot) -{ - for (;;) - { - if (!scan->xs_heap_continue) - { - ItemPointer tid; - - /* Time to fetch the next TID from the index */ - tid = index_getnext_tid(scan, direction); - - /* If we're out of index entries, we're done */ - if (tid == NULL) - break; - - Assert(ItemPointerEquals(tid, &scan->xs_heaptid)); - } - - /* - * Fetch the next (or only) visible heap tuple for this index entry. - * If we don't find anything, loop around and grab the next TID from - * the index. - */ - Assert(ItemPointerIsValid(&scan->xs_heaptid)); - if (index_fetch_heap(scan, slot)) - return true; - } - - return false; -} - /* ---------------- * index_getbitmap - get all tuples at once from an index scan * diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index c8af97dd2..f1b55fb20 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -560,9 +560,9 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, * with optimizations like heap's HOT, we have just a single * index entry for the entire chain. */ - else if (table_index_fetch_tuple_check(heapRel, &htid, - &SnapshotDirty, - &all_dead)) + else if (table_fetch_tid_check(heapRel, &htid, + &SnapshotDirty, + &all_dead)) { TransactionId xwait; @@ -618,8 +618,8 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, * entry. */ htid = itup->t_tid; - if (table_index_fetch_tuple_check(heapRel, &htid, - SnapshotSelf, NULL)) + if (table_fetch_tid_check(heapRel, &htid, + SnapshotSelf, NULL)) { /* Normal case --- it's still live */ } diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c index 68ff0966f..0ac9f0143 100644 --- a/src/backend/access/table/tableam.c +++ b/src/backend/access/table/tableam.c @@ -228,32 +228,20 @@ table_beginscan_parallel_tidrange(Relation relation, */ /* - * To perform that check simply start an index scan, create the necessary - * slot, do the heap lookup, and shut everything down again. This could be - * optimized, but is unlikely to matter from a performance POV. If there - * frequently are live index pointers also matching a unique index key, the - * CPU overhead of this routine is unlikely to matter. - * - * Note that *tid may be modified when we return true if the AM supports - * storing multiple row versions reachable via a single index entry (like - * heap's HOT). 
+ * Caller should note the table_fetch_tid warning about *tid being modified + * when we return true in some cases */ bool -table_index_fetch_tuple_check(Relation rel, - ItemPointer tid, - Snapshot snapshot, - bool *all_dead) +table_fetch_tid_check(Relation rel, + ItemPointer tid, + Snapshot snapshot, + bool *all_dead) { - IndexFetchTableData *scan; TupleTableSlot *slot; - bool call_again = false; bool found; slot = table_slot_create(rel, NULL); - scan = table_index_fetch_begin(rel, SO_NONE); - found = table_index_fetch_tuple(scan, tid, snapshot, slot, &call_again, - all_dead); - table_index_fetch_end(scan); + found = table_fetch_tid(rel, tid, snapshot, slot, all_dead); ExecDropSingleTupleTableSlot(slot); return found; diff --git a/src/backend/access/table/tableamapi.c b/src/backend/access/table/tableamapi.c index 5450a27fa..c09473b97 100644 --- a/src/backend/access/table/tableamapi.c +++ b/src/backend/access/table/tableamapi.c @@ -53,8 +53,8 @@ GetTableAmRoutine(Oid amhandler) Assert(routine->index_fetch_begin != NULL); Assert(routine->index_fetch_reset != NULL); Assert(routine->index_fetch_end != NULL); - Assert(routine->index_fetch_tuple != NULL); + Assert(routine->fetch_tid != NULL); Assert(routine->tuple_fetch_row_version != NULL); Assert(routine->tuple_tid_valid != NULL); Assert(routine->tuple_get_latest_tid != NULL); diff --git a/src/backend/commands/constraint.c b/src/backend/commands/constraint.c index 421d8c359..7aff48124 100644 --- a/src/backend/commands/constraint.c +++ b/src/backend/commands/constraint.c @@ -105,23 +105,14 @@ unique_key_recheck(PG_FUNCTION_ARGS) * removed. */ tmptid = checktid; + if (!table_fetch_tid(trigdata->tg_relation, &tmptid, SnapshotSelf, + slot, NULL)) { - IndexFetchTableData *scan = table_index_fetch_begin(trigdata->tg_relation, - SO_NONE); - bool call_again = false; - - if (!table_index_fetch_tuple(scan, &tmptid, SnapshotSelf, slot, - &call_again, NULL)) - { - /* - * All rows referenced by the index entry are dead, so skip the - * check. - */ - ExecDropSingleTupleTableSlot(slot); - table_index_fetch_end(scan); - return PointerGetDatum(NULL); - } - table_index_fetch_end(scan); + /* + * All rows referenced by the index entry are dead, so skip the check + */ + ExecDropSingleTupleTableSlot(slot); + return PointerGetDatum(NULL); } /* @@ -168,9 +159,8 @@ unique_key_recheck(PG_FUNCTION_ARGS) /* * Note: this is not a real insert; it is a check that the index entry * that has already been inserted is unique. Passing the tuple's tid - * (i.e. unmodified by table_index_fetch_tuple()) is correct even if - * the row is now dead, because that is the TID the index will know - * about. + * (i.e. unmodified by table_fetch_tid()) is correct even if the row + * is now dead, because that is the TID the index will know about. 
 	 */
 	index_insert(indexRel, values, isnull, &checktid,
 				 trigdata->tg_relation, UNIQUE_CHECK_EXISTING,
diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index e4b70166b..18cf422ad 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -136,7 +136,7 @@ static void show_recursive_union_info(RecursiveUnionState *rstate,
 static void show_memoize_info(MemoizeState *mstate, List *ancestors,
 							  ExplainState *es);
 static void show_hashagg_info(AggState *aggstate, ExplainState *es);
-static void show_indexsearches_info(PlanState *planstate, ExplainState *es);
+static void show_indexscan_info(PlanState *planstate, ExplainState *es);
 static void show_tidbitmap_info(BitmapHeapScanState *planstate,
 								ExplainState *es);
 static void show_instrumentation_count(const char *qlabel, int which,
@@ -1974,7 +1974,7 @@ ExplainNode(PlanState *planstate, List *ancestors,
 			if (plan->qual)
 				show_instrumentation_count("Rows Removed by Filter", 1,
 										   planstate, es);
-			show_indexsearches_info(planstate, es);
+			show_indexscan_info(planstate, es);
 			break;
 		case T_IndexOnlyScan:
 			show_scan_qual(((IndexOnlyScan *) plan)->indexqual,
@@ -1988,15 +1988,12 @@ ExplainNode(PlanState *planstate, List *ancestors,
 			if (plan->qual)
 				show_instrumentation_count("Rows Removed by Filter", 1,
 										   planstate, es);
-			if (es->analyze)
-				ExplainPropertyFloat("Heap Fetches", NULL,
-									 planstate->instrument->ntuples2, 0, es);
-			show_indexsearches_info(planstate, es);
+			show_indexscan_info(planstate, es);
 			break;
 		case T_BitmapIndexScan:
 			show_scan_qual(((BitmapIndexScan *) plan)->indexqualorig,
 						   "Index Cond", planstate, ancestors, es);
-			show_indexsearches_info(planstate, es);
+			show_indexscan_info(planstate, es);
 			break;
 		case T_BitmapHeapScan:
 			show_scan_qual(((BitmapHeapScan *) plan)->bitmapqualorig,
@@ -3860,15 +3857,16 @@ show_hashagg_info(AggState *aggstate, ExplainState *es)
 }
 
 /*
- * Show the total number of index searches for a
+ * Show index scan related executor instrumentation for an
  * IndexScan/IndexOnlyScan/BitmapIndexScan node
 */
 static void
-show_indexsearches_info(PlanState *planstate, ExplainState *es)
+show_indexscan_info(PlanState *planstate, ExplainState *es)
 {
 	Plan	   *plan = planstate->plan;
 	SharedIndexScanInstrumentation *SharedInfo = NULL;
-	uint64		nsearches = 0;
+	uint64		nsearches = 0,
+				ntabletuplefetches = 0;
 
 	if (!es->analyze)
 		return;
@@ -3889,6 +3887,7 @@ show_indexsearches_info(PlanState *planstate, ExplainState *es)
 				IndexOnlyScanState *indexstate = ((IndexOnlyScanState *) planstate);
 
 				nsearches = indexstate->ioss_Instrument->nsearches;
+				ntabletuplefetches = indexstate->ioss_Instrument->ntabletuplefetches;
 				SharedInfo = indexstate->ioss_SharedInfo;
 				break;
 			}
@@ -3912,9 +3911,13 @@ show_indexsearches_info(PlanState *planstate, ExplainState *es)
 			IndexScanInstrumentation *winstrument = &SharedInfo->winstrument[i];
 
 			nsearches += winstrument->nsearches;
+			ntabletuplefetches += winstrument->ntabletuplefetches;
 		}
 	}
 
+	if (nodeTag(plan) == T_IndexOnlyScan)
+		ExplainPropertyUInteger("Heap Fetches", NULL, ntabletuplefetches, es);
+
 	ExplainPropertyUInteger("Index Searches", NULL, nsearches, es);
 }
 
diff --git a/src/backend/executor/execIndexing.c b/src/backend/executor/execIndexing.c
index 4363e154c..84c8c3403 100644
--- a/src/backend/executor/execIndexing.c
+++ b/src/backend/executor/execIndexing.c
@@ -815,12 +815,13 @@ check_exclusion_or_unique_constraint(Relation heap, Relation index,
 retry:
 	conflict = false;
 	found_self = false;
-	index_scan = index_beginscan(heap, index,
+	index_scan = index_beginscan(heap, index, false,
 								 &DirtySnapshot, NULL,
 								 indnkeyatts, 0, SO_NONE);
 	index_rescan(index_scan, scankeys, indnkeyatts, NULL, 0);
 
-	while (index_getnext_slot(index_scan, ForwardScanDirection, existing_slot))
+	while (table_index_getnext_slot(index_scan, ForwardScanDirection,
+									existing_slot))
 	{
 		TransactionId xwait;
 		XLTW_Oper	reason_wait;
diff --git a/src/backend/executor/execReplication.c b/src/backend/executor/execReplication.c
index b2ca5cbf1..fd9efd947 100644
--- a/src/backend/executor/execReplication.c
+++ b/src/backend/executor/execReplication.c
@@ -205,7 +205,7 @@ RelationFindReplTupleByIndex(Relation rel, Oid idxoid,
 	skey_attoff = build_replindex_scan_key(skey, rel, idxrel, searchslot);
 
 	/* Start an index scan. */
-	scan = index_beginscan(rel, idxrel,
+	scan = index_beginscan(rel, idxrel, false,
 						   &snap, NULL,
 						   skey_attoff, 0, SO_NONE);
 
retry:
@@ -214,7 +214,7 @@ retry:
 	index_rescan(scan, skey, skey_attoff, NULL, 0);
 
 	/* Try to find the tuple */
-	while (index_getnext_slot(scan, ForwardScanDirection, outslot))
+	while (table_index_getnext_slot(scan, ForwardScanDirection, outslot))
 	{
 		/*
 		 * Avoid expensive equality check if the index is primary key or
@@ -669,13 +669,13 @@ RelationFindDeletedTupleInfoByIndex(Relation rel, Oid idxoid,
 	 * not yet committed or those just committed prior to the scan are
 	 * excluded in update_most_recent_deletion_info().
 	 */
-	scan = index_beginscan(rel, idxrel,
+	scan = index_beginscan(rel, idxrel, false,
 						   SnapshotAny, NULL,
 						   skey_attoff, 0, SO_NONE);
 	index_rescan(scan, skey, skey_attoff, NULL, 0);
 
 	/* Try to find the tuple */
-	while (index_getnext_slot(scan, ForwardScanDirection, scanslot))
+	while (table_index_getnext_slot(scan, ForwardScanDirection, scanslot))
 	{
 		/*
 		 * Avoid expensive equality check if the index is primary key or
diff --git a/src/backend/executor/nodeBitmapIndexscan.c b/src/backend/executor/nodeBitmapIndexscan.c
index 70c55ee6d..a9a3d2fb1 100644
--- a/src/backend/executor/nodeBitmapIndexscan.c
+++ b/src/backend/executor/nodeBitmapIndexscan.c
@@ -204,6 +204,7 @@ ExecEndBitmapIndexScan(BitmapIndexScanState *node)
 		 * which will have a new BitmapIndexScanState and zeroed stats.
 		 */
 		winstrument->nsearches += node->biss_Instrument->nsearches;
+		Assert(node->biss_Instrument->ntabletuplefetches == 0);
 	}
 
 	/*
diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c
index de6154fd5..b6b9dbd10 100644
--- a/src/backend/executor/nodeIndexonlyscan.c
+++ b/src/backend/executor/nodeIndexonlyscan.c
@@ -34,7 +34,6 @@
 #include "access/relscan.h"
 #include "access/tableam.h"
 #include "access/tupdesc.h"
-#include "access/visibilitymap.h"
 #include "catalog/pg_type.h"
 #include "executor/executor.h"
 #include "executor/instrument.h"
@@ -42,7 +41,6 @@
 #include "executor/nodeIndexscan.h"
 #include "miscadmin.h"
 #include "storage/bufmgr.h"
-#include "storage/predicate.h"
 #include "utils/builtins.h"
 #include "utils/rel.h"
 
@@ -66,7 +64,6 @@ IndexOnlyNext(IndexOnlyScanState *node)
 	ScanDirection direction;
 	IndexScanDesc scandesc;
 	TupleTableSlot *slot;
-	ItemPointer tid;
 
 	/*
 	 * extract necessary information from index scan node
@@ -92,6 +89,7 @@ IndexOnlyNext(IndexOnlyScanState *node)
 		 */
 		scandesc = index_beginscan(node->ss.ss_currentRelation,
 								   node->ioss_RelationDesc,
+								   true,
 								   estate->es_snapshot,
 								   node->ioss_Instrument,
 								   node->ioss_NumScanKeys,
@@ -100,11 +98,7 @@ IndexOnlyNext(IndexOnlyScanState *node)
 								   SO_HINT_REL_READ_ONLY : SO_NONE);
 
 		node->ioss_ScanDesc = scandesc;
-
-
-		/* Set it up for index-only scan */
-		node->ioss_ScanDesc->xs_want_itup = true;
-		node->ioss_VMBuffer = InvalidBuffer;
+		Assert(node->ioss_ScanDesc->xs_want_itup);
 
 		/*
 		 * If no run-time keys to calculate or they are ready, go ahead and
@@ -121,78 +115,11 @@ IndexOnlyNext(IndexOnlyScanState *node)
 	/*
 	 * OK, now that we have what we need, fetch the next tuple.
 	 */
-	while ((tid = index_getnext_tid(scandesc, direction)) != NULL)
+	while (table_index_getnext_slot(scandesc, direction,
+									node->ioss_TableSlot))
 	{
-		bool		tuple_from_heap = false;
-
 		CHECK_FOR_INTERRUPTS();
 
-		/*
-		 * We can skip the heap fetch if the TID references a heap page on
-		 * which all tuples are known visible to everybody. In any case,
-		 * we'll use the index tuple not the heap tuple as the data source.
-		 *
-		 * Note on Memory Ordering Effects: visibilitymap_get_status does not
-		 * lock the visibility map buffer, and therefore the result we read
-		 * here could be slightly stale. However, it can't be stale enough to
-		 * matter.
-		 *
-		 * We need to detect clearing a VM bit due to an insert right away,
-		 * because the tuple is present in the index page but not visible. The
-		 * reading of the TID by this scan (using a shared lock on the index
-		 * buffer) is serialized with the insert of the TID into the index
-		 * (using an exclusive lock on the index buffer). Because the VM bit
-		 * is cleared before updating the index, and locking/unlocking of the
-		 * index page acts as a full memory barrier, we are sure to see the
-		 * cleared bit if we see a recently-inserted TID.
-		 *
-		 * Deletes do not update the index page (only VACUUM will clear out
-		 * the TID), so the clearing of the VM bit by a delete is not
-		 * serialized with this test below, and we may see a value that is
-		 * significantly stale. However, we don't care about the delete right
-		 * away, because the tuple is still visible until the deleting
-		 * transaction commits or the statement ends (if it's our
-		 * transaction). In either case, the lock on the VM buffer will have
-		 * been released (acting as a write barrier) after clearing the bit.
-		 * And for us to have a snapshot that includes the deleting
-		 * transaction (making the tuple invisible), we must have acquired
-		 * ProcArrayLock after that time, acting as a read barrier.
-		 *
-		 * It's worth going through this complexity to avoid needing to lock
-		 * the VM buffer, which could cause significant contention.
-		 */
-		if (!VM_ALL_VISIBLE(scandesc->heapRelation,
-							ItemPointerGetBlockNumber(tid),
-							&node->ioss_VMBuffer))
-		{
-			/*
-			 * Rats, we have to visit the heap to check visibility.
-			 */
-			InstrCountTuples2(node, 1);
-			if (!index_fetch_heap(scandesc, node->ioss_TableSlot))
-				continue;		/* no visible tuple, try next index entry */
-
-			ExecClearTuple(node->ioss_TableSlot);
-
-			/*
-			 * Only MVCC snapshots are supported here, so there should be no
-			 * need to keep following the HOT chain once a visible entry has
-			 * been found. If we did want to allow that, we'd need to keep
-			 * more state to remember not to call index_getnext_tid next time.
-			 */
-			if (scandesc->xs_heap_continue)
-				elog(ERROR, "non-MVCC snapshots are not supported in index-only scans");
-
-			/*
-			 * Note: at this point we are holding a pin on the heap page, as
-			 * recorded in scandesc->xs_cbuf. We could release that pin now,
-			 * but it's not clear whether it's a win to do so. The next index
-			 * entry might require a visit to the same heap page.
-			 */
-
-			tuple_from_heap = true;
-		}
-
 		/*
 		 * Fill the scan tuple slot with data from the index. This might be
 		 * provided in either HeapTuple or IndexTuple format. Conceivably an
@@ -241,16 +168,6 @@ IndexOnlyNext(IndexOnlyScanState *node)
 			ereport(ERROR,
 					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 					 errmsg("lossy distance functions are not supported in index-only scans")));
-
-		/*
-		 * If we didn't access the heap, then we'll need to take a predicate
-		 * lock explicitly, as if we had. For now we do that at page level.
-		 */
-		if (!tuple_from_heap)
-			PredicateLockPage(scandesc->heapRelation,
-							  ItemPointerGetBlockNumber(tid),
-							  estate->es_snapshot);
-
 		return slot;
 	}
 
@@ -410,13 +327,6 @@ ExecEndIndexOnlyScan(IndexOnlyScanState *node)
 	indexRelationDesc = node->ioss_RelationDesc;
 	indexScanDesc = node->ioss_ScanDesc;
 
-	/* Release VM buffer pin, if any. */
-	if (node->ioss_VMBuffer != InvalidBuffer)
-	{
-		ReleaseBuffer(node->ioss_VMBuffer);
-		node->ioss_VMBuffer = InvalidBuffer;
-	}
-
 	/*
 	 * When ending a parallel worker, copy the statistics gathered by the
 	 * worker back into shared memory so that it can be picked up by the main
@@ -436,6 +346,7 @@ ExecEndIndexOnlyScan(IndexOnlyScanState *node)
 		 * which will have a new IndexOnlyScanState and zeroed stats.
 		 */
 		winstrument->nsearches += node->ioss_Instrument->nsearches;
+		winstrument->ntabletuplefetches += node->ioss_Instrument->ntabletuplefetches;
 	}
 
 	/*
@@ -793,14 +704,14 @@ ExecIndexOnlyScanInitializeDSM(IndexOnlyScanState *node,
 	node->ioss_ScanDesc =
 		index_beginscan_parallel(node->ss.ss_currentRelation,
 								 node->ioss_RelationDesc,
+								 true,
 								 node->ioss_Instrument,
 								 node->ioss_NumScanKeys,
 								 node->ioss_NumOrderByKeys,
 								 piscan,
 								 ScanRelIsReadOnly(&node->ss) ?
 								 SO_HINT_REL_READ_ONLY : SO_NONE);
-	node->ioss_ScanDesc->xs_want_itup = true;
-	node->ioss_VMBuffer = InvalidBuffer;
+	Assert(node->ioss_ScanDesc->xs_want_itup);
 
 	/*
 	 * If no run-time keys to calculate or they are ready, go ahead and pass
@@ -861,13 +772,14 @@ ExecIndexOnlyScanInitializeWorker(IndexOnlyScanState *node,
 	node->ioss_ScanDesc =
 		index_beginscan_parallel(node->ss.ss_currentRelation,
 								 node->ioss_RelationDesc,
+								 true,
 								 node->ioss_Instrument,
 								 node->ioss_NumScanKeys,
 								 node->ioss_NumOrderByKeys,
 								 piscan,
 								 ScanRelIsReadOnly(&node->ss) ?
 								 SO_HINT_REL_READ_ONLY : SO_NONE);
-	node->ioss_ScanDesc->xs_want_itup = true;
+	Assert(node->ioss_ScanDesc->xs_want_itup);
 
 	/*
 	 * If no run-time keys to calculate or they are ready, go ahead and pass
diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c
index 1620d1460..2ac854da4 100644
--- a/src/backend/executor/nodeIndexscan.c
+++ b/src/backend/executor/nodeIndexscan.c
@@ -110,6 +110,7 @@ IndexNext(IndexScanState *node)
 		 */
 		scandesc = index_beginscan(node->ss.ss_currentRelation,
 								   node->iss_RelationDesc,
+								   false,
 								   estate->es_snapshot,
 								   node->iss_Instrument,
 								   node->iss_NumScanKeys,
@@ -132,7 +133,7 @@ IndexNext(IndexScanState *node)
 	/*
 	 * ok, now that we have what we need, fetch the next tuple.
 	 */
-	while (index_getnext_slot(scandesc, direction, slot))
+	while (table_index_getnext_slot(scandesc, direction, slot))
 	{
 		CHECK_FOR_INTERRUPTS();
 
@@ -208,6 +209,7 @@ IndexNextWithReorder(IndexScanState *node)
 		 */
 		scandesc = index_beginscan(node->ss.ss_currentRelation,
 								   node->iss_RelationDesc,
+								   false,
 								   estate->es_snapshot,
 								   node->iss_Instrument,
 								   node->iss_NumScanKeys,
@@ -266,7 +268,7 @@
 		 * Fetch next tuple from the index.
 		 */
next_indextuple:
-		if (!index_getnext_slot(scandesc, ForwardScanDirection, slot))
+		if (!table_index_getnext_slot(scandesc, ForwardScanDirection, slot))
 		{
 			/*
 			 * No more tuples from the index. But we still need to drain any
@@ -818,6 +820,7 @@ ExecEndIndexScan(IndexScanState *node)
 		 * which will have a new IndexOnlyScanState and zeroed stats.
 		 */
 		winstrument->nsearches += node->iss_Instrument->nsearches;
+		Assert(node->iss_Instrument->ntabletuplefetches == 0);
 	}
 
 	/*
@@ -1731,6 +1734,7 @@ ExecIndexScanInitializeDSM(IndexScanState *node,
 	node->iss_ScanDesc =
 		index_beginscan_parallel(node->ss.ss_currentRelation,
 								 node->iss_RelationDesc,
+								 false,
 								 node->iss_Instrument,
 								 node->iss_NumScanKeys,
 								 node->iss_NumOrderByKeys,
@@ -1797,6 +1801,7 @@ ExecIndexScanInitializeWorker(IndexScanState *node,
 	node->iss_ScanDesc =
 		index_beginscan_parallel(node->ss.ss_currentRelation,
 								 node->iss_RelationDesc,
+								 false,
 								 node->iss_Instrument,
 								 node->iss_NumScanKeys,
 								 node->iss_NumOrderByKeys,
diff --git a/src/backend/utils/adt/ri_triggers.c b/src/backend/utils/adt/ri_triggers.c
index 84f9fecdb..fceb56386 100644
--- a/src/backend/utils/adt/ri_triggers.c
+++ b/src/backend/utils/adt/ri_triggers.c
@@ -2803,7 +2803,7 @@ ri_FastPathCheck(const RI_ConstraintInfo *riinfo,
 	idx_rel = index_open(riinfo->conindid, AccessShareLock);
 	slot = table_slot_create(pk_rel, NULL);
 
-	scandesc = index_beginscan(pk_rel, idx_rel,
+	scandesc = index_beginscan(pk_rel, idx_rel, false,
 							   snapshot, NULL,
 							   riinfo->nkeys, 0, SO_NONE);
 
@@ -2918,7 +2918,7 @@ ri_FastPathBatchFlush(RI_FastPathEntry *fpentry, Relation fk_rel,
 	 */
 	oldcxt = MemoryContextSwitchTo(fpentry->flush_cxt);
 
-	scandesc = index_beginscan(pk_rel, idx_rel, snapshot, NULL,
+	scandesc = index_beginscan(pk_rel, idx_rel, false, snapshot, NULL,
 							   riinfo->nkeys, 0, SO_NONE);
 
 	GetUserIdAndSecContext(&saved_userid, &saved_sec_context);
@@ -3112,7 +3112,7 @@ ri_FastPathFlushArray(RI_FastPathEntry *fpentry, TupleTableSlot *fk_slot,
 	 * Walk all matches. The index AM returns them in index order. For each
 	 * match, find which batch item(s) it satisfies.
 	 */
-	while (index_getnext_slot(scandesc, ForwardScanDirection, pk_slot))
+	while (table_index_getnext_slot(scandesc, ForwardScanDirection, pk_slot))
 	{
 		Datum		found_val;
 		bool		found_null;
@@ -3185,7 +3185,7 @@ ri_FastPathProbeOne(Relation pk_rel, Relation idx_rel,
 
 	index_rescan(scandesc, skey, nkeys, NULL, 0);
 
-	if (index_getnext_slot(scandesc, ForwardScanDirection, slot))
+	if (table_index_getnext_slot(scandesc, ForwardScanDirection, slot))
 	{
 		bool		concurrently_updated;
 
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index 4160d2d6e..6a1dfab51 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -102,7 +102,6 @@
 #include "access/gin.h"
 #include "access/table.h"
 #include "access/tableam.h"
-#include "access/visibilitymap.h"
 #include "catalog/pg_collation.h"
 #include "catalog/pg_operator.h"
 #include "catalog/pg_statistic.h"
@@ -7121,10 +7120,6 @@ get_actual_variable_endpoint(Relation heapRel,
 	bool		have_data = false;
 	SnapshotData SnapshotNonVacuumable;
 	IndexScanDesc index_scan;
-	Buffer		vmbuffer = InvalidBuffer;
-	BlockNumber last_heap_block = InvalidBlockNumber;
-	int			n_visited_heap_pages = 0;
-	ItemPointer tid;
 	Datum		values[INDEX_MAX_KEYS];
 	bool		isnull[INDEX_MAX_KEYS];
 	MemoryContext oldcontext;
@@ -7172,62 +7167,26 @@ get_actual_variable_endpoint(Relation heapRel,
 	 * a huge amount of time here, so we give up once we've read too many heap
 	 * pages. When we fail for that reason, the caller will end up using
 	 * whatever extremal value is recorded in pg_statistic.
+	 *
+	 * We set xs_visited_pages_limit to tell the table AM to count distinct
+	 * heap pages visited for non-visible tuples and give up after the limit
+	 * is exceeded.
 	 */
+#define VISITED_PAGES_LIMIT 100
 	InitNonVacuumableSnapshot(SnapshotNonVacuumable,
 							  GlobalVisTestFor(heapRel));
-	index_scan = index_beginscan(heapRel, indexRel,
+	index_scan = index_beginscan(heapRel, indexRel, true,
 								 &SnapshotNonVacuumable, NULL,
 								 1, 0, SO_NONE);
-	/* Set it up for index-only scan */
-	index_scan->xs_want_itup = true;
+	Assert(index_scan->xs_want_itup);
+	index_scan->xs_visited_pages_limit = VISITED_PAGES_LIMIT;
 	index_rescan(index_scan, scankeys, 1, NULL, 0);
 
 	/* Fetch first/next tuple in specified direction */
-	while ((tid = index_getnext_tid(index_scan, indexscandir)) != NULL)
+	while (table_index_getnext_slot(index_scan, indexscandir, tableslot))
 	{
-		BlockNumber block = ItemPointerGetBlockNumber(tid);
-
-		if (!VM_ALL_VISIBLE(heapRel,
-							block,
-							&vmbuffer))
-		{
-			/* Rats, we have to visit the heap to check visibility */
-			if (!index_fetch_heap(index_scan, tableslot))
-			{
-				/*
-				 * No visible tuple for this index entry, so we need to
-				 * advance to the next entry. Before doing so, count heap
-				 * page fetches and give up if we've done too many.
-				 *
-				 * We don't charge a page fetch if this is the same heap page
-				 * as the previous tuple. This is on the conservative side,
-				 * since other recently-accessed pages are probably still in
-				 * buffers too; but it's good enough for this heuristic.
-				 */
-#define VISITED_PAGES_LIMIT 100
-
-				if (block != last_heap_block)
-				{
-					last_heap_block = block;
-					n_visited_heap_pages++;
-					if (n_visited_heap_pages > VISITED_PAGES_LIMIT)
-						break;
-				}
-
-				continue;		/* no visible tuple, try next index entry */
-			}
-
-			/* We don't actually need the heap tuple for anything */
-			ExecClearTuple(tableslot);
-
-			/*
-			 * We don't care whether there's more than one visible tuple in
-			 * the HOT chain; if any are visible, that's good enough.
-			 */
-		}
-
 		/*
 		 * We expect that the index will return data in IndexTuple not
 		 * HeapTuple format.
@@ -7259,8 +7218,6 @@ get_actual_variable_endpoint(Relation heapRel,
 		break;
 	}
 
-	if (vmbuffer != InvalidBuffer)
-		ReleaseBuffer(vmbuffer);
 	index_endscan(index_scan);
 
 	return have_data;
-- 
2.53.0
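
For reviewers who want to try the new interface: below is a minimal sketch of the
caller pattern that this patch establishes, distilled from the executor hunks
above (it is not part of the patch). The function and its arguments (heapRel,
indexRel, skey, nkeys) are hypothetical stand-ins, and error handling and
resource-ownership details are elided. The same pattern applies to index-only
scans: pass true as index_beginscan's new index_only_scan argument, keep passing
a slot, and read results from the scan descriptor's xs_itup.

#include "postgres.h"

#include "access/genam.h"
#include "access/tableam.h"
#include "executor/tuptable.h"

/*
 * Illustrative only: drive a plain index scan through the slot-based
 * interface added by this patch.
 */
static void
example_slot_based_scan(Relation heapRel, Relation indexRel,
						Snapshot snapshot, ScanKey skey, int nkeys)
{
	IndexScanDesc scan;
	TupleTableSlot *slot = table_slot_create(heapRel, NULL);

	/* third argument is the new index_only_scan flag */
	scan = index_beginscan(heapRel, indexRel, false,
						   snapshot, NULL, nkeys, 0, SO_NONE);
	index_rescan(scan, skey, nkeys, NULL, 0);

	/*
	 * The table AM performs the heap fetch (and, for index-only scans, any
	 * visibility map checks) behind this call; scan->xs_heaptid is still
	 * set on each iteration.
	 */
	while (table_index_getnext_slot(scan, ForwardScanDirection, slot))
	{
		/* process the tuple in "slot" */
	}

	index_endscan(scan);
	ExecDropSingleTupleTableSlot(slot);
}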