From c113d0c4f7d4bd7209c06fa73f2153140d8afb9c Mon Sep 17 00:00:00 2001
From: Matthias van de Meent
Date: Fri, 7 Mar 2025 17:39:23 +0100
Subject: [PATCH v10 1/5] IOS/TableAM: Support AM-specific fast visibility tests

Previously, we assumed VM_ALL_VISIBLE is universal across all AMs. This is
probably not the case, so we introduce a new table AM method,
"table_index_vischeck_tuples", which lets callers ask the AM whether a tuple
is definitely visible to everyone or might be invisible to someone.

The API is intended to replace direct calls to VM_ALL_VISIBLE and as such
doesn't include "definitely dead to everyone": the heap AM's visibility map
can't report *definitely dead* from its lookups, so it would be too expensive
for the heap AM to produce such results.

A future commit will use this inside GiST and SP-GiST to fix a race condition
between IOS and VACUUM that causes incorrect tuple visibility results, and a
further patch will add support for this to nbtree.
---
 src/include/access/heapam.h              |  2 +
 src/include/access/relscan.h             |  5 ++
 src/include/access/tableam.h             | 73 +++++++++++++++++++++
 src/backend/access/heap/heapam.c         | 64 ++++++++++++++++++
 src/backend/access/heap/heapam_handler.c |  1 +
 src/backend/access/index/indexam.c       |  6 ++
 src/backend/access/table/tableamapi.c    |  1 +
 src/backend/executor/nodeIndexonlyscan.c | 83 ++++++++++++++++--------
 src/backend/utils/adt/selfuncs.c         | 76 +++++++++++++---------
 9 files changed, 254 insertions(+), 57 deletions(-)

diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h
index 1640d9c32f7..a820f150509 100644
--- a/src/include/access/heapam.h
+++ b/src/include/access/heapam.h
@@ -378,6 +378,8 @@ extern void simple_heap_update(Relation relation, ItemPointer otid,
 
 extern TransactionId heap_index_delete_tuples(Relation rel,
 											  TM_IndexDeleteOp *delstate);
+extern void heap_index_vischeck_tuples(Relation rel,
+									   TM_IndexVisibilityCheckOp *checkop);
 
 /* in heap/pruneheap.c */
 struct GlobalVisState;
diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h
index b5e0fb386c0..93a6f65ab0e 100644
--- a/src/include/access/relscan.h
+++ b/src/include/access/relscan.h
@@ -26,6 +26,9 @@
 
 struct ParallelTableScanDescData;
 
+enum TMVC_Result;
+
+
 /*
  * Generic descriptor for table scans. This is the base-class for table scans,
  * which needs to be embedded in the scans of individual AMs.
@@ -176,6 +179,8 @@ typedef struct IndexScanDescData
 
 	bool		xs_recheck;		/* T means scan keys must be rechecked */
 
+	int			xs_visrecheck;	/* TMVC_Result from tableam.h */
+
 	/*
 	 * When fetching with an ordering operator, the values of the ORDER BY
 	 * expressions of the last returned tuple, according to the index. If
diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h
index b8cb1e744ad..2dbdb9287f1 100644
--- a/src/include/access/tableam.h
+++ b/src/include/access/tableam.h
@@ -255,6 +255,49 @@ typedef struct TM_IndexDeleteOp
 	TM_IndexStatus *status;
 } TM_IndexDeleteOp;
 
+/*
+ * State used when calling table_index_vischeck_tuples()
+ *
+ * Index-only scans need to know the visibility of the associated table tuples
+ * before they can return the index tuple. If the index tuple is known to be
+ * visible with a cheap check, we can return it directly without requesting
+ * the visibility info from the table AM.
+ *
+ * This AM API exposes a cheap visibility-checking facility to indexes,
+ * allowing an index to check the visibility of many tuples at once and to
+ * store the results.  This improves the pinning ergonomics of index AMs, as
+ * a scan can cache index tuples in memory without having to hold pins on
+ * those tuples' index pages until the index tuples are returned.
+ *
+ * The AM is called with a list of TIDs, and its output indicates the
+ * visibility state of each tuple: Unchecked, MaybeVisible, or Visible.
+ *
+ * The heap AM's visibility map only allows cheap checks for *definitely
+ * visible*; all other results are reported as *maybe visible*.  A result of
+ * *definitely not visible* (i.e. dead) is currently not provided, as no
+ * table AM supports such visibility lookups cheaply.
+ */
+typedef enum TMVC_Result
+{
+	TMVC_Unchecked,
+	TMVC_MaybeVisible,
+	TMVC_Visible,
+} TMVC_Result;
+
+typedef struct TM_VisCheck
+{
+	ItemPointerData tid;		/* table TID from index tuple */
+	OffsetNumber idxoffnum;		/* identifier for the TID in this call */
+	TMVC_Result vischeckresult; /* output of the visibility check */
+} TM_VisCheck;
+
+typedef struct TM_IndexVisibilityCheckOp
+{
+	int			nchecktids;		/* number of TIDs to check */
+	Buffer	   *vmbuf;			/* pointer to VM buffer to reuse across calls */
+	TM_VisCheck *checktids;		/* the checks to execute */
+} TM_IndexVisibilityCheckOp;
+
 /* "options" flag bits for table_tuple_insert */
 /* TABLE_INSERT_SKIP_WAL was 0x0001; RelationNeedsWAL() now governs */
 #define TABLE_INSERT_SKIP_FSM	0x0002
@@ -501,6 +544,10 @@ typedef struct TableAmRoutine
 	TransactionId (*index_delete_tuples) (Relation rel,
 										  TM_IndexDeleteOp *delstate);
 
+	/* see table_index_vischeck_tuples() */
+	void		(*index_vischeck_tuples) (Relation rel,
+										  TM_IndexVisibilityCheckOp *checkop);
+
 
 	/* ------------------------------------------------------------------------
 	 * Manipulations of physical tuples.
@@ -1328,6 +1375,32 @@ table_index_delete_tuples(Relation rel, TM_IndexDeleteOp *delstate)
 	return rel->rd_tableam->index_delete_tuples(rel, delstate);
 }
 
+static inline void
+table_index_vischeck_tuples(Relation rel, TM_IndexVisibilityCheckOp *checkop)
+{
+	rel->rd_tableam->index_vischeck_tuples(rel, checkop);
+}
+
+static inline TMVC_Result
+table_index_vischeck_tuple(Relation rel, Buffer *vmbuffer, ItemPointer tid)
+{
+	TM_IndexVisibilityCheckOp checkOp;
+	TM_VisCheck op;
+
+	op.idxoffnum = 0;
+	op.tid = *tid;
+	op.vischeckresult = TMVC_Unchecked;
+	checkOp.checktids = &op;
+	checkOp.nchecktids = 1;
+	checkOp.vmbuf = vmbuffer;
+
+	rel->rd_tableam->index_vischeck_tuples(rel, &checkOp);
+
+	Assert(op.vischeckresult != TMVC_Unchecked);
+
+	return op.vischeckresult;
+}
+
 /* ----------------------------------------------------------------------------
  * Functions for manipulations of physical tuples.
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index b12b583c4d9..a58f19761aa 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -102,6 +102,7 @@ static bool ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status
 									   bool logLockFailure);
 static void index_delete_sort(TM_IndexDeleteOp *delstate);
 static int	bottomup_sort_and_shrink(TM_IndexDeleteOp *delstate);
+static int	heap_cmp_index_vischeck(const void *a, const void *b);
 static XLogRecPtr log_heap_new_cid(Relation relation, HeapTuple tup);
 static HeapTuple ExtractReplicaIdentity(Relation relation, HeapTuple tp,
 										bool key_required, bool *copy);
@@ -8775,6 +8776,69 @@ bottomup_sort_and_shrink(TM_IndexDeleteOp *delstate)
 	return nblocksfavorable;
 }
 
+/*
+ * heapam implementation of tableam's index_vischeck_tuples interface.
+ *
+ * This helper function is called by index AMs during index-only scans,
+ * to do VM-based visibility checks on individual tuples, so that the AM
+ * can hold the tuples in memory (e.g. for reordering) for extended periods
+ * of time without holding thousands of pins that would conflict with VACUUM.
+ *
+ * It's possible for this to generate a fair amount of I/O, since we may be
+ * checking hundreds of tuples from a single index block, but that is
+ * preferred over holding thousands of pins.
+ */
+void
+heap_index_vischeck_tuples(Relation rel, TM_IndexVisibilityCheckOp *checkop)
+{
+	BlockNumber prevBlk = InvalidBlockNumber;
+	TMVC_Result lastResult = TMVC_Unchecked;
+	Buffer	   *vmbuf = checkop->vmbuf;
+	TM_VisCheck *checkTids = checkop->checktids;
+
+	/*
+	 * Sort the TIDs into heap order, so that we need to visit every VM page
+	 * at most once.
+	 */
+	if (checkop->nchecktids > 1)
+		qsort(checkTids, checkop->nchecktids, sizeof(TM_VisCheck),
+			  heap_cmp_index_vischeck);
+
+	for (int i = 0; i < checkop->nchecktids; i++)
+	{
+		TM_VisCheck *check = &checkop->checktids[i];
+		ItemPointer tid = &check->tid;
+		BlockNumber blkno = ItemPointerGetBlockNumber(tid);
+
+		/* Visibility should be checked just once per tuple. */
+		Assert(check->vischeckresult == TMVC_Unchecked);
+
+		if (blkno != prevBlk)
+		{
+			if (VM_ALL_VISIBLE(rel, blkno, vmbuf))
+				lastResult = TMVC_Visible;
+			else
+				lastResult = TMVC_MaybeVisible;
+
+			prevBlk = blkno;
+		}
+
+		check->vischeckresult = lastResult;
+	}
+}
+
+/*
+ * Compare TM_VisChecks by TID, to get an efficient heap visit order.
+ */
+static int
+heap_cmp_index_vischeck(const void *a, const void *b)
+{
+	const TM_VisCheck *visa = (const TM_VisCheck *) a;
+	const TM_VisCheck *visb = (const TM_VisCheck *) b;
+	return ItemPointerCompare(unconstify(ItemPointerData *, &visa->tid),
+							  unconstify(ItemPointerData *, &visb->tid));
+}
+
 /*
  * Perform XLogInsert for a heap-visible operation.  'block' is the block
  * being marked all-visible, and vm_buffer is the buffer containing the
diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index 4da4dc84580..65c23ff6658 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -2676,6 +2676,7 @@ static const TableAmRoutine heapam_methods = {
 	.tuple_tid_valid = heapam_tuple_tid_valid,
 	.tuple_satisfies_snapshot = heapam_tuple_satisfies_snapshot,
 	.index_delete_tuples = heap_index_delete_tuples,
+	.index_vischeck_tuples = heap_index_vischeck_tuples,
 
 	.relation_set_new_filelocator = heapam_relation_set_new_filelocator,
 	.relation_nontransactional_truncate = heapam_relation_nontransactional_truncate,
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index 55ec4c10352..370e442e24e 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -627,6 +627,12 @@ index_getnext_tid(IndexScanDesc scan, ScanDirection direction)
 	/* XXX: we should assert that a snapshot is pushed or registered */
 	Assert(TransactionIdIsValid(RecentXmin));
 
+	/*
+	 * Reset xs_visrecheck, so we don't confuse the next tuple's visibility
+	 * state with that of the previous.
+	 */
+	scan->xs_visrecheck = TMVC_Unchecked;
+
 	/*
 	 * The AM's amgettuple proc finds the next index entry matching the scan
 	 * keys, and puts the TID into scan->xs_heaptid. It should also set
diff --git a/src/backend/access/table/tableamapi.c b/src/backend/access/table/tableamapi.c
index 476663b66aa..b3ce90ceaea 100644
--- a/src/backend/access/table/tableamapi.c
+++ b/src/backend/access/table/tableamapi.c
@@ -61,6 +61,7 @@ GetTableAmRoutine(Oid amhandler)
 	Assert(routine->tuple_get_latest_tid != NULL);
 	Assert(routine->tuple_satisfies_snapshot != NULL);
 	Assert(routine->index_delete_tuples != NULL);
+	Assert(routine->index_vischeck_tuples != NULL);
 
 	Assert(routine->tuple_insert != NULL);
 
diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c
index f464cca9507..e02fc1652ff 100644
--- a/src/backend/executor/nodeIndexonlyscan.c
+++ b/src/backend/executor/nodeIndexonlyscan.c
@@ -121,6 +121,7 @@ IndexOnlyNext(IndexOnlyScanState *node)
 	while ((tid = index_getnext_tid(scandesc, direction)) != NULL)
 	{
 		bool		tuple_from_heap = false;
+		TMVC_Result vischeck = scandesc->xs_visrecheck;
 
 		CHECK_FOR_INTERRUPTS();
 
@@ -128,6 +129,9 @@
 		 * We can skip the heap fetch if the TID references a heap page on
 		 * which all tuples are known visible to everybody. In any case,
 		 * we'll use the index tuple not the heap tuple as the data source.
+		 * The index may have already pre-checked the visibility of the tuple
+		 * for us, and stored the result in xs_visrecheck, in which case we
+		 * can skip the call.
 		 *
 		 * Note on Memory Ordering Effects: visibilitymap_get_status does not
 		 * lock the visibility map buffer, and therefore the result we read
@@ -157,37 +161,60 @@
 		 *
 		 * It's worth going through this complexity to avoid needing to lock
 		 * the VM buffer, which could cause significant contention.
+		 *
+		 * The index doing these checks for us doesn't materially change these
+		 * considerations.
 		 */
-		if (!VM_ALL_VISIBLE(scandesc->heapRelation,
-							ItemPointerGetBlockNumber(tid),
-							&node->ioss_VMBuffer))
-		{
-			/*
-			 * Rats, we have to visit the heap to check visibility.
-			 */
-			InstrCountTuples2(node, 1);
-			if (!index_fetch_heap(scandesc, node->ioss_TableSlot))
-				continue;		/* no visible tuple, try next index entry */
+		if (vischeck == TMVC_Unchecked)
+			vischeck = table_index_vischeck_tuple(scandesc->heapRelation,
+												  &node->ioss_VMBuffer,
+												  tid);
 
-			ExecClearTuple(node->ioss_TableSlot);
-
-			/*
-			 * Only MVCC snapshots are supported here, so there should be no
-			 * need to keep following the HOT chain once a visible entry has
-			 * been found.  If we did want to allow that, we'd need to keep
-			 * more state to remember not to call index_getnext_tid next time.
-			 */
-			if (scandesc->xs_heap_continue)
-				elog(ERROR, "non-MVCC snapshots are not supported in index-only scans");
+		Assert(vischeck != TMVC_Unchecked);
 
-			/*
-			 * Note: at this point we are holding a pin on the heap page, as
-			 * recorded in scandesc->xs_cbuf.  We could release that pin now,
-			 * but it's not clear whether it's a win to do so.  The next index
-			 * entry might require a visit to the same heap page.
-			 */
-
-			tuple_from_heap = true;
+		switch (vischeck)
+		{
+			case TMVC_Unchecked:
+				elog(ERROR, "failed to check visibility for tuple");
+				/*
+				 * In case of compilers that don't understand that elog(ERROR)
+				 * doesn't return, and which have -Wimplicit-fallthrough:
+				 */
+				/* fallthrough */
+			case TMVC_MaybeVisible:
+			{
+				/*
+				 * Rats, we have to visit the heap to check visibility.
+				 */
+				InstrCountTuples2(node, 1);
+				if (!index_fetch_heap(scandesc, node->ioss_TableSlot))
+					continue;	/* no visible tuple, try next index entry */
+
+				ExecClearTuple(node->ioss_TableSlot);
+
+				/*
+				 * Only MVCC snapshots are supported here, so there should be
+				 * no need to keep following the HOT chain once a visible
+				 * entry has been found.  If we did want to allow that, we'd
+				 * need to keep more state to remember not to call
+				 * index_getnext_tid next time.
+				 */
+				if (scandesc->xs_heap_continue)
+					elog(ERROR, "non-MVCC snapshots are not supported in index-only scans");
+
+				/*
+				 * Note: at this point we are holding a pin on the heap page,
+				 * as recorded in scandesc->xs_cbuf.  We could release that
+				 * pin now, but it's not clear whether it's a win to do so.
+				 * The next index entry might require a visit to the same heap
+				 * page.
+				 */
+
+				tuple_from_heap = true;
+				break;
+			}
+			case TMVC_Visible:
+				break;
 		}
 
 		/*
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index 5b35debc8ff..7e9ca616a67 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -6561,44 +6561,62 @@ get_actual_variable_endpoint(Relation heapRel,
 	while ((tid = index_getnext_tid(index_scan, indexscandir)) != NULL)
 	{
 		BlockNumber block = ItemPointerGetBlockNumber(tid);
+		TMVC_Result visres = index_scan->xs_visrecheck;
 
-		if (!VM_ALL_VISIBLE(heapRel,
-							block,
-							&vmbuffer))
+		if (visres == TMVC_Unchecked)
+			visres = table_index_vischeck_tuple(heapRel, &vmbuffer, tid);
+
+		Assert(visres != TMVC_Unchecked);
+
+		switch (visres)
 		{
-			/* Rats, we have to visit the heap to check visibility */
-			if (!index_fetch_heap(index_scan, tableslot))
-			{
+			case TMVC_Unchecked:
+				elog(ERROR, "failed to check visibility for tuple");
 				/*
-				 * No visible tuple for this index entry, so we need to
-				 * advance to the next entry.  Before doing so, count heap
-				 * page fetches and give up if we've done too many.
-				 *
-				 * We don't charge a page fetch if this is the same heap page
-				 * as the previous tuple.  This is on the conservative side,
-				 * since other recently-accessed pages are probably still in
-				 * buffers too; but it's good enough for this heuristic.
+				 * In case of compilers that don't understand that elog(ERROR)
+				 * doesn't return, and which have -Wimplicit-fallthrough:
 				 */
+				/* fallthrough */
+			case TMVC_MaybeVisible:
+			{
+				/* Rats, we have to visit the heap to check visibility */
+				if (!index_fetch_heap(index_scan, tableslot))
+				{
+					/*
+					 * No visible tuple for this index entry, so we need to
+					 * advance to the next entry.  Before doing so, count heap
+					 * page fetches and give up if we've done too many.
+					 *
+					 * We don't charge a page fetch if this is the same heap
+					 * page as the previous tuple.  This is on the
+					 * conservative side, since other recently-accessed pages
+					 * are probably still in buffers too; but it's good enough
+					 * for this heuristic.
+					 */
 #define VISITED_PAGES_LIMIT 100
 
-				if (block != last_heap_block)
-				{
-					last_heap_block = block;
-					n_visited_heap_pages++;
-					if (n_visited_heap_pages > VISITED_PAGES_LIMIT)
-						break;
-				}
+					if (block != last_heap_block)
+					{
+						last_heap_block = block;
+						n_visited_heap_pages++;
+						if (n_visited_heap_pages > VISITED_PAGES_LIMIT)
+							break;
+					}
 
-				continue;		/* no visible tuple, try next index entry */
-			}
+					continue;	/* no visible tuple, try next index entry */
+				}
 
-			/* We don't actually need the heap tuple for anything */
-			ExecClearTuple(tableslot);
+				/* We don't actually need the heap tuple for anything */
+				ExecClearTuple(tableslot);
 
-			/*
-			 * We don't care whether there's more than one visible tuple in
-			 * the HOT chain; if any are visible, that's good enough.
-			 */
+				/*
+				 * We don't care whether there's more than one visible tuple in
+				 * the HOT chain; if any are visible, that's good enough.
+				 */
+				break;
+			}
+			case TMVC_Visible:
+				break;
 		}
 
 		/*
-- 
2.45.2
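
For illustration, here is a minimal sketch of how an index AM might batch
visibility checks through the new API while it still holds a pin on the index
page, caching the results alongside its matches so the pin can be released
before the tuples are returned. Only TM_VisCheck, TM_IndexVisibilityCheckOp,
TMVC_Unchecked and table_index_vischeck_tuples() come from this patch;
ExampleMatch, ExampleScanOpaque and example_vischeck_matches() are
hypothetical index-AM-side names used purely for the sketch.

#include "postgres.h"

#include "access/itup.h"
#include "access/tableam.h"
#include "storage/bufmgr.h"

/* Hypothetical per-scan state; not part of this patch. */
typedef struct ExampleMatch
{
	ItemPointerData heaptid;	/* table TID copied from the index tuple */
	TMVC_Result visres;			/* cached visibility check result */
} ExampleMatch;

typedef struct ExampleScanOpaque
{
	Buffer		vmbuf;			/* VM buffer, InvalidBuffer initially,
								 * reused across calls */
	int			nmatches;		/* matches collected from the current page */
	ExampleMatch matches[MaxIndexTuplesPerPage];
} ExampleScanOpaque;

/*
 * Check the visibility of all matches collected from the current index page
 * in one call, before the index page's pin is dropped.  TMVC_Visible results
 * can later be returned without a heap fetch; other results are reported via
 * xs_visrecheck so the executor falls back to index_fetch_heap().
 */
static void
example_vischeck_matches(Relation heapRel, ExampleScanOpaque *so)
{
	TM_IndexVisibilityCheckOp checkop;
	TM_VisCheck *checks;

	checks = palloc(sizeof(TM_VisCheck) * so->nmatches);
	for (int i = 0; i < so->nmatches; i++)
	{
		checks[i].tid = so->matches[i].heaptid;
		checks[i].idxoffnum = i;	/* lets us match results back up */
		checks[i].vischeckresult = TMVC_Unchecked;
	}

	checkop.checktids = checks;
	checkop.nchecktids = so->nmatches;
	checkop.vmbuf = &so->vmbuf;

	/* One call per index page; the table AM may reorder checktids[]. */
	table_index_vischeck_tuples(heapRel, &checkop);

	/* Copy results back by idxoffnum, since checktids[] may be sorted. */
	for (int i = 0; i < so->nmatches; i++)
		so->matches[checks[i].idxoffnum].visres = checks[i].vischeckresult;

	pfree(checks);
}

The idxoffnum field is what lets the caller match results back up after the
table AM has sorted checktids[] into heap order, which is why the sketch fills
it with the match array index before the call.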