From e7432523481b9b6184d25848a3a722a3cce296cd Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Fri, 19 Dec 2025 23:58:40 +0100 Subject: [PATCH v13 3/8] TableAM: Support AM-specific fast visibility tests Previously, we assumed VM_ALL_VISIBLE(...) is universal across all AMs. This is probably not the case, so we introduce a new table method called "table_index_vischeck_tuples" which allows anyone to ask the AM whether a tuple (or list of tuples) is definitely visible to us, or might be deleted or otherwise invisible. We implement that method directly for HeapAM; usage of the facility will follow in later commits. --- src/backend/access/heap/heapam.c | 124 ++++++++++++++++++++++ src/backend/access/heap/heapam_handler.c | 1 + src/backend/access/table/tableamapi.c | 1 + src/include/access/heapam.h | 2 + src/include/access/tableam.h | 125 +++++++++++++++++++++++ 5 files changed, 253 insertions(+) diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 6daf4a87dec..d29346a2fee 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -106,6 +106,20 @@ static XLogRecPtr log_heap_new_cid(Relation relation, HeapTuple tup); static HeapTuple ExtractReplicaIdentity(Relation relation, HeapTuple tp, bool key_required, bool *copy); +/* sort template definitions for index visibility checks */ +#define ST_SORT heap_ivc_sortby_tidheapblk +#define ST_ELEMENT_TYPE TM_VisCheck +#define ST_DECLARE +#define ST_DEFINE +#define ST_SCOPE static inline +#define ST_COMPARE(a, b) ( \ + a->tidblkno < b->tidblkno ? -1 : ( \ + a->tidblkno > b->tidblkno ? 1 : 0 \ + ) \ +) + +#include "lib/sort_template.h" + /* * Each tuple lock mode has a corresponding heavyweight lock, and one or two @@ -8813,6 +8827,116 @@ bottomup_sort_and_shrink(TM_IndexDeleteOp *delstate) return nblocksfavorable; } +/* + * heapam implementation of tableam's index_vischeck_tuples interface. 
+ * + * This helper function is called by index AMs during index-only scans, + * to do VM-based visibility checks on individual tuples, so that the AM + * can hold the tuple in memory for e.g. reordering for extended periods of + * time without holding thousands of pins to conflict with VACUUM. + * + * It's possible for this to generate a fair amount of I/O, since we may be + * checking hundreds of tuples from a single index block, but that is + * preferred over holding thousands of pins. + * + * We use heuristics to balance the costs of sorting TIDs with VM page + * lookups. + */ +void +heap_index_vischeck_tuples(Relation rel, TM_IndexVisibilityCheckOp *checkop) +{ + TM_VisCheck *checks = checkop->checktids; + int checkntids = checkop->checkntids; + int nblocks = 1; + BlockNumber *blknos; + uint8 *status; + TMVC_Result res; + + if (checkntids == 0) + return; + + /* + * Order the TIDs to heap order, so that we will only need to visit every + * VM page at most once. + */ + heap_ivc_sortby_tidheapblk(checks, checkntids); + + for (int i = 0; i < checkntids - 1; i++) + { + if (checks[i].tidblkno != checks[i + 1].tidblkno) + { + Assert(checks[i].tidblkno < checks[i + 1].tidblkno); + nblocks++; + } + } + + /* + * No need to allocate arrays or do other (comparatively expensive) + * bookkeeping when we have only one block to check. 
+ */ + if (nblocks == 1) + { + if (VM_ALL_VISIBLE(rel, checks[0].tidblkno, checkop->vmbuf)) + res = TMVC_Visible; + else + res = TMVC_MaybeVisible; + + for (int i = 0; i < checkntids; i++) + checks[i].vischeckresult = res; + + return; + } + + blknos = palloc_array(BlockNumber, nblocks); + status = palloc_array(uint8, nblocks); + + blknos[0] = checks[0].tidblkno; + + /* fill in the rest of the blknos array with unique block numbers */ + for (int i = 0, j = 0; i < checkntids; i++) + { + Assert(BlockNumberIsValid(checks[i].tidblkno)); + + if (checks[i].tidblkno != blknos[j]) + blknos[++j] = checks[i].tidblkno; + } + + /* do the actual visibility checks */ + visibilitymap_get_statusv(rel, blknos, status, nblocks, checkop->vmbuf); + + /* + * 'res' is the current TMVC value for blknos[j] below. It is updated + * inside the loop, but only when j is updated, so we must initialize it + * here, or we'll store uninitialized data instead of a TMVC value for + * the first block's result. + */ + if (status[0] & VISIBILITYMAP_ALL_VISIBLE) + res = TMVC_Visible; + else + res = TMVC_MaybeVisible; + + /* copy the results of blknos into the TM_VisChecks */ + for (int i = 0, j = 0; i < checkntids; i++) + { + if (checks[i].tidblkno != blknos[j]) + { + j += 1; + Assert(checks[i].tidblkno == blknos[j]); + + if (status[j] & VISIBILITYMAP_ALL_VISIBLE) + res = TMVC_Visible; + else + res = TMVC_MaybeVisible; + } + + checks[i].vischeckresult = res; + } + + /* and clean up the resources we'd used */ + pfree(status); + pfree(blknos); +} + /* * Perform XLogInsert for a heap-visible operation. 
'block' is the block * being marked all-visible, and vm_buffer is the buffer containing the diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index dd4fe6bf62f..6189557cbbb 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -2648,6 +2648,7 @@ static const TableAmRoutine heapam_methods = { .tuple_tid_valid = heapam_tuple_tid_valid, .tuple_satisfies_snapshot = heapam_tuple_satisfies_snapshot, .index_delete_tuples = heap_index_delete_tuples, + .index_vischeck_tuples = heap_index_vischeck_tuples, .relation_set_new_filelocator = heapam_relation_set_new_filelocator, .relation_nontransactional_truncate = heapam_relation_nontransactional_truncate, diff --git a/src/backend/access/table/tableamapi.c b/src/backend/access/table/tableamapi.c index 476663b66aa..b3ce90ceaea 100644 --- a/src/backend/access/table/tableamapi.c +++ b/src/backend/access/table/tableamapi.c @@ -61,6 +61,7 @@ GetTableAmRoutine(Oid amhandler) Assert(routine->tuple_get_latest_tid != NULL); Assert(routine->tuple_satisfies_snapshot != NULL); Assert(routine->index_delete_tuples != NULL); + Assert(routine->index_vischeck_tuples != NULL); Assert(routine->tuple_insert != NULL); diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index f7e4ae3843c..faf4f3a585a 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -407,6 +407,8 @@ extern void simple_heap_update(Relation relation, const ItemPointerData *otid, extern TransactionId heap_index_delete_tuples(Relation rel, TM_IndexDeleteOp *delstate); +extern void heap_index_vischeck_tuples(Relation rel, + TM_IndexVisibilityCheckOp *checkop); /* in heap/pruneheap.c */ extern void heap_page_prune_opt(Relation relation, Buffer buffer); diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 2fa790b6bf5..52acf8c1985 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -254,6 +254,69 
@@ typedef struct TM_IndexDeleteOp TM_IndexStatus *status; } TM_IndexDeleteOp; +/* + * State used when calling table_index_vischeck_tuples() + * + * Index-only scans need to know the visibility of the associated table tuples + * before they can return the index tuple. If the index tuple is known to be + * visible with a cheap check, we can return it directly without requesting + * the visibility info from the table AM directly. + * + * This AM API exposes a cheap bulk visibility checking API to indexes, + * allowing these indexes to check multiple tuples worth of visibility info at + * once, and allows the AM to store these checks. This improves the pinning + * ergonomics of index AMs by allowing a scan to cache index tuples in memory + * without holding pins on these index tuple pages until the index tuples are + * returned. + * + * The method is called with a list of TIDs, and its output will indicate the + * visibility state of each tuple: MaybeVisible or Visible. + * + * HeapAM's implementation of visibility maps only allows for cheap checks of + * *definitely visible*; all other results are *maybe visible*. A result for + * *definitely not visible* aka dead is currently not accounted for by lack of + * Table AMs which support such visibility lookups cheaply. However, if a + * Table AM were to implement this, it could be used to quickly skip the + * current tuple in index scans, without having to ask the Table AM for that + * TID's data. 
+ */ +typedef enum TMVC_Result +{ + TMVC_Unchecked = 0, + TMVC_Visible = 1, + TMVC_MaybeVisible = 2, + +#define TMVC_MAX TMVC_MaybeVisible +} TMVC_Result; + +typedef struct TM_VisCheck +{ + /* TID from index tuple; deformed to not waste time during sort ops */ + BlockNumber tidblkno; + uint16 tidoffset; + /* identifier for the TID in this visibility check operation context */ + OffsetNumber idxoffnum; + /* the result of the visibility check operation */ + TMVC_Result vischeckresult; +} TM_VisCheck; + +static inline void +PopulateTMVischeck(TM_VisCheck *check, ItemPointer tid, OffsetNumber idxoff) +{ + Assert(ItemPointerIsValid(tid)); + check->tidblkno = ItemPointerGetBlockNumberNoCheck(tid); + check->tidoffset = ItemPointerGetOffsetNumberNoCheck(tid); + check->idxoffnum = idxoff; + check->vischeckresult = TMVC_Unchecked; +} + +typedef struct TM_IndexVisibilityCheckOp +{ + int checkntids; /* number of TIDs to check */ + Buffer *vmbuf; /* pointer to VM buffer to reuse across calls */ + TM_VisCheck *checktids; /* the checks to execute */ +} TM_IndexVisibilityCheckOp; + /* "options" flag bits for table_tuple_insert */ /* TABLE_INSERT_SKIP_WAL was 0x0001; RelationNeedsWAL() now governs */ #define TABLE_INSERT_SKIP_FSM 0x0002 @@ -500,6 +563,10 @@ typedef struct TableAmRoutine TransactionId (*index_delete_tuples) (Relation rel, TM_IndexDeleteOp *delstate); + /* see table_index_vischeck_tuples() */ + void (*index_vischeck_tuples) (Relation rel, + TM_IndexVisibilityCheckOp *checkop); + /* ------------------------------------------------------------------------ * Manipulations of physical tuples. @@ -1333,6 +1400,64 @@ table_index_delete_tuples(Relation rel, TM_IndexDeleteOp *delstate) return rel->rd_tableam->index_delete_tuples(rel, delstate); } +/* + * Determine rough visibility information of index tuples based on each TID. 
+ * + * Determines which entries from index AM caller's TM_IndexVisibilityCheckOp + * state point to TMVC_Visible or TMVC_MaybeVisible table tuples, at low IO + * overhead. For the heap AM, the implementation is effectively a wrapper + * around VM_ALL_VISIBLE. + * + * On return, all TM_VisChecks indicated by checkop->checktids will have been + * updated with the correct visibility status. + * + * Note that there is no value for "definitely dead" tuples, as the Heap AM + * doesn't have an efficient method to determine that a tuple is dead to all + * users, as it would have to go into the heap. If and when AMs are built + * that would support VM checks with an equivalent to VM_ALL_DEAD this + * decision can be reconsidered. + */ +static inline void +table_index_vischeck_tuples(Relation rel, TM_IndexVisibilityCheckOp *checkop) +{ + rel->rd_tableam->index_vischeck_tuples(rel, checkop); + +#ifdef USE_ASSERT_CHECKING + for (int i = 0; i < checkop->checkntids; i++) + { + TMVC_Result res = checkop->checktids[i].vischeckresult; + + if (res <= TMVC_Unchecked || res > TMVC_MAX) + { + elog(PANIC, "Unexpected vischeckresult %d at offset %d/%d, expected value between %d and %d inclusive", + checkop->checktids[i].vischeckresult, + i, checkop->checkntids, + TMVC_Visible, + TMVC_MaybeVisible); + } + } +#endif +} + +static inline TMVC_Result +table_index_vischeck_tuple(Relation rel, Buffer *vmbuffer, ItemPointer tid) +{ + TM_IndexVisibilityCheckOp checkOp; + TM_VisCheck op; + + PopulateTMVischeck(&op, tid, 0); + + checkOp.checktids = &op; + checkOp.checkntids = 1; + checkOp.vmbuf = vmbuffer; + + table_index_vischeck_tuples(rel, &checkOp); + + Assert(op.vischeckresult != TMVC_Unchecked); + + return op.vischeckresult; +} + /* ---------------------------------------------------------------------------- * Functions for manipulations of physical tuples. -- 2.50.1 (Apple Git-155)