From d7e8f76144cb27e761e2d4bc9c687dd0a2de203e Mon Sep 17 00:00:00 2001
From: Amit Langote
Date: Thu, 12 Mar 2026 09:18:04 +0900
Subject: [PATCH v6 1/5] heapam: store full HeapTupleData in rs_vistuples[]
 for pagemode scans

page_collect_tuples() builds full HeapTupleData headers for every
visible tuple on a page -- t_data, t_len, t_self, t_tableOid -- but
previously discarded them immediately after writing just the
OffsetNumber of each survivor into rs_vistuples[].
heapgettup_pagemode() then re-derived those same values on every call
from the saved OffsetNumber via PageGetItemId() and PageGetItem().

Change rs_vistuples[]'s element type from OffsetNumber to HeapTupleData
and populate the entries inside page_collect_tuples() while lpp,
lineoff, page, and block are already in scope, so no additional page
reads are needed.  For the all_visible path (the common case on a
primary not under active modification) the write piggy-backs on the
existing per-lineoff loop.  For the !all_visible path, HeapTupleData
entries are written during the visibility loop and compacted to
visible survivors afterwards using batchmvcc.visible[], avoiding a
return to pd_linp[] via PageGetItemId().

With rs_vistuples[] populated, heapgettup_pagemode() replaces the
per-tuple PageGetItemId/PageGetItem calls with a single struct copy:

    *tuple = scan->rs_vistuples[lineindex];

The stack-local HeapTupleData array in BatchMVCCState is eliminated by
passing rs_vistuples[] directly to HeapTupleSatisfiesMVCCBatch(),
saving MaxHeapTuplesPerPage * 24 bytes of stack per
page_collect_tuples() call.  HeapTupleSatisfiesMVCCBatch() loses its
vistuples_dense parameter since compaction is now handled by the
caller.

t_tableOid is pre-initialized for all rs_vistuples[] entries at scan
start in heap_beginscan(), eliminating a store per visible tuple from
the fill loop.  The raw ItemId word is read once per tuple, with
lp_off and lp_len extracted via mask and shift rather than separate
ItemIdGetOffset() and ItemIdGetLength() calls, avoiding a potential
second load from the same address in the inner loop.

Having pre-built HeapTupleData headers available at the scan
descriptor level also lays groundwork for a batched tuple interface,
where an AM can serve multiple tuples per call without repeating the
line pointer traversal.
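For reference, a sketch of that mask-and-shift extraction (variable
names are illustrative; the constants follow ItemIdData's declared
bit-fields -- 15-bit lp_off, 2-bit lp_flags, 15-bit lp_len -- and,
like the code, assume bit-fields are allocated starting at the least
significant bit):

    uint32  lp_val = *(uint32 *) lpp;   /* one load of the ItemId word */
    char   *tup_start = (char *) page + (lp_val & 0x7fff);  /* bits 0-14: lp_off */
    uint32  tup_len = lp_val >> 17;     /* bits 17-31: lp_len (15-16: lp_flags) */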
Suggested-by: Andres Freund
---
 src/backend/access/heap/heapam.c            | 73 ++++++++++++---------
 src/backend/access/heap/heapam_handler.c    | 19 ++----
 src/backend/access/heap/heapam_visibility.c | 21 +++---
 src/include/access/heapam.h                 |  5 +-
 4 files changed, 58 insertions(+), 60 deletions(-)

diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index e5bd062de77..c6d0aacc5c9 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -524,7 +524,6 @@ page_collect_tuples(HeapScanDesc scan, Snapshot snapshot,
                     BlockNumber block, int lines,
                     bool all_visible, bool check_serializable)
 {
-    Oid         relid = RelationGetRelid(scan->rs_base.rs_rd);
     int         ntup = 0;
     int         nvis = 0;
     BatchMVCCState batchmvcc;
@@ -536,7 +535,7 @@ page_collect_tuples(HeapScanDesc scan, Snapshot snapshot,
     for (OffsetNumber lineoff = FirstOffsetNumber; lineoff <= lines; lineoff++)
     {
         ItemId      lpp = PageGetItemId(page, lineoff);
-        HeapTuple   tup;
+        HeapTuple   tup = &scan->rs_vistuples[ntup];
 
         if (unlikely(!ItemIdIsNormal(lpp)))
             continue;
@@ -549,25 +548,33 @@ page_collect_tuples(HeapScanDesc scan, Snapshot snapshot,
          */
         if (!all_visible || check_serializable)
         {
-            tup = &batchmvcc.tuples[ntup];
+            uint32      lp_val = *(uint32 *) lpp;
 
-            tup->t_data = (HeapTupleHeader) PageGetItem(page, lpp);
-            tup->t_len = ItemIdGetLength(lpp);
-            tup->t_tableOid = relid;
+            tup->t_data = (HeapTupleHeader) ((char *) page + (lp_val & 0x7fff));
+            tup->t_len = lp_val >> 17;
+            Assert(tup->t_tableOid == RelationGetRelid(scan->rs_base.rs_rd));
             ItemPointerSet(&(tup->t_self), block, lineoff);
         }
 
-        /*
-         * If the page is all visible, these fields otherwise won't be
-         * populated in loop below.
-         */
         if (all_visible)
         {
             if (check_serializable)
-            {
                 batchmvcc.visible[ntup] = true;
+
+            /*
+             * In the all_visible && !check_serializable path, the block
+             * above was skipped, so tup's fields have not been set yet.
+             * Fill them here while lpp is still in hand.
+             */
+            if (!check_serializable)
+            {
+                uint32      lp_val = *(uint32 *) lpp;
+
+                tup->t_data = (HeapTupleHeader) ((char *) page + (lp_val & 0x7fff));
+                tup->t_len = lp_val >> 17;
+                Assert(tup->t_tableOid == RelationGetRelid(scan->rs_base.rs_rd));
+                ItemPointerSet(&tup->t_self, block, lineoff);
             }
-            scan->rs_vistuples[ntup] = lineoff;
         }
 
         ntup++;
@@ -598,11 +605,24 @@ page_collect_tuples(HeapScanDesc scan, Snapshot snapshot,
         {
             HeapCheckForSerializableConflictOut(batchmvcc.visible[i],
                                                 scan->rs_base.rs_rd,
-                                                &batchmvcc.tuples[i],
+                                                &scan->rs_vistuples[i],
                                                 buffer, snapshot);
         }
     }
 
+    /* Now compact rs_vistuples[] to visible survivors only */
+    if (!all_visible)
+    {
+        int         dst = 0;
+
+        for (int i = 0; i < ntup; i++)
+        {
+            if (batchmvcc.visible[i])
+                scan->rs_vistuples[dst++] = scan->rs_vistuples[i];
+        }
+        Assert(dst == nvis);
+    }
+
     return nvis;
 }
 
@@ -1073,14 +1093,13 @@ heapgettup_pagemode(HeapScanDesc scan,
                     ScanKey key)
 {
     HeapTuple   tuple = &(scan->rs_ctup);
-    Page        page;
     uint32      lineindex;
     uint32      linesleft;
 
     if (likely(scan->rs_inited))
     {
         /* continue from previously returned page/tuple */
-        page = BufferGetPage(scan->rs_cbuf);
+        Assert(BufferIsValid(scan->rs_cbuf));
 
         lineindex = scan->rs_cindex + dir;
         if (ScanDirectionIsForward(dir))
@@ -1108,29 +1127,21 @@
 
         /* prune the page and determine visible tuple offsets */
         heap_prepare_pagescan((TableScanDesc) scan);
-        page = BufferGetPage(scan->rs_cbuf);
         linesleft = scan->rs_ntuples;
         lineindex = ScanDirectionIsForward(dir) ? 0 : linesleft - 1;
 
-        /* block is the same for all tuples, set it once outside the loop */
-        ItemPointerSetBlockNumber(&tuple->t_self, scan->rs_cblock);
-
         /* lineindex now references the next or previous visible tid */
 continue_page:
 
         for (; linesleft > 0; linesleft--, lineindex += dir)
         {
-            ItemId      lpp;
-            OffsetNumber lineoff;
-
-            Assert(lineindex < scan->rs_ntuples);
-            lineoff = scan->rs_vistuples[lineindex];
-            lpp = PageGetItemId(page, lineoff);
-            Assert(ItemIdIsNormal(lpp));
-
-            tuple->t_data = (HeapTupleHeader) PageGetItem(page, lpp);
-            tuple->t_len = ItemIdGetLength(lpp);
-            ItemPointerSetOffsetNumber(&tuple->t_self, lineoff);
+            /*
+             * Headers were pre-built by page_collect_tuples() into
+             * rs_vistuples[].  Copy the entry; t_data still points into the
+             * pinned page, which is safe for the lifetime of the current
+             * page scan.
+             */
+            *tuple = scan->rs_vistuples[lineindex];
 
             /* skip any tuples that don't match the scan key */
             if (key != NULL &&
@@ -1244,6 +1255,8 @@ heap_beginscan(Relation relation, Snapshot snapshot,
 
     /* we only need to set this up once */
     scan->rs_ctup.t_tableOid = RelationGetRelid(relation);
+    for (int i = 0; i < MaxHeapTuplesPerPage; i++)
+        scan->rs_vistuples[i].t_tableOid = RelationGetRelid(relation);
 
     /*
      * Allocate memory to keep track of page allocation for parallel workers
diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index 253a735b6c1..2fd120028bb 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -2153,9 +2153,6 @@ heapam_scan_bitmap_next_tuple(TableScanDesc scan,
 {
     BitmapHeapScanDesc bscan = (BitmapHeapScanDesc) scan;
     HeapScanDesc hscan = (HeapScanDesc) bscan;
-    OffsetNumber targoffset;
-    Page        page;
-    ItemId      lp;
 
     /*
      * Out of range?  If so, nothing more to look at on this page
@@ -2170,15 +2167,7 @@ heapam_scan_bitmap_next_tuple(TableScanDesc scan,
         return false;
     }
 
-    targoffset = hscan->rs_vistuples[hscan->rs_cindex];
-    page = BufferGetPage(hscan->rs_cbuf);
-    lp = PageGetItemId(page, targoffset);
-    Assert(ItemIdIsNormal(lp));
-
-    hscan->rs_ctup.t_data = (HeapTupleHeader) PageGetItem(page, lp);
-    hscan->rs_ctup.t_len = ItemIdGetLength(lp);
-    hscan->rs_ctup.t_tableOid = scan->rs_rd->rd_id;
-    ItemPointerSet(&hscan->rs_ctup.t_self, hscan->rs_cblock, targoffset);
+    hscan->rs_ctup = hscan->rs_vistuples[hscan->rs_cindex];
 
     pgstat_count_heap_fetch(scan->rs_rd);
 
@@ -2456,7 +2445,7 @@ SampleHeapTupleVisible(TableScanDesc scan, Buffer buffer,
         while (start < end)
         {
             uint32      mid = start + (end - start) / 2;
-            OffsetNumber curoffset = hscan->rs_vistuples[mid];
+            OffsetNumber curoffset = hscan->rs_vistuples[mid].t_self.ip_posid;
 
             if (tupoffset == curoffset)
                 return true;
@@ -2575,7 +2564,7 @@ BitmapHeapScanNextBlock(TableScanDesc scan,
             ItemPointerSet(&tid, block, offnum);
             if (heap_hot_search_buffer(&tid, scan->rs_rd, buffer, snapshot,
                                        &heapTuple, NULL, true))
-                hscan->rs_vistuples[ntup++] = ItemPointerGetOffsetNumber(&tid);
+                hscan->rs_vistuples[ntup++] = heapTuple;
         }
     }
     else
@@ -2604,7 +2593,7 @@ BitmapHeapScanNextBlock(TableScanDesc scan,
                 valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
                 if (valid)
                 {
-                    hscan->rs_vistuples[ntup++] = offnum;
+                    hscan->rs_vistuples[ntup++] = loctup;
                     PredicateLockTID(scan->rs_rd, &loctup.t_self, snapshot,
                                      HeapTupleHeaderGetXmin(loctup.t_data));
                 }
diff --git a/src/backend/access/heap/heapam_visibility.c b/src/backend/access/heap/heapam_visibility.c
index fc64f4343ce..cd6cd4d8d69 100644
--- a/src/backend/access/heap/heapam_visibility.c
+++ b/src/backend/access/heap/heapam_visibility.c
@@ -1670,16 +1670,16 @@ HeapTupleSatisfiesHistoricMVCC(HeapTuple htup, Snapshot snapshot,
 }
 
 /*
- * Perform HeaptupleSatisfiesMVCC() on each passed in tuple. This is more
+ * Perform HeapTupleSatisfiesMVCC() on each passed in tuple. This is more
  * efficient than doing HeapTupleSatisfiesMVCC() one-by-one.
  *
- * To be checked tuples are passed via BatchMVCCState->tuples. Each tuple's
- * visibility is stored in batchmvcc->visible[]. In addition,
- * ->vistuples_dense is set to contain the offsets of visible tuples.
+ * Each tuple's visibility is stored in batchmvcc->visible[]. The caller
+ * is responsible for compacting the tuples array to contain only visible
+ * survivors after this function returns.
  *
- * The reason this is more efficient than HeapTupleSatisfiesMVCC() is that it
- * avoids a cross-translation-unit function call for each tuple, allows the
- * compiler to optimize across calls to HeapTupleSatisfiesMVCC and allows
+ * The reason this is more efficient than HeapTupleSatisfiesMVCC() is that
+ * it avoids a cross-translation-unit function call for each tuple, allows
+ * the compiler to optimize across calls to HeapTupleSatisfiesMVCC and allows
  * setting hint bits more efficiently (see the one BufferFinishSetHintBits()
  * call below).
  *
@@ -1689,7 +1689,7 @@ int
 HeapTupleSatisfiesMVCCBatch(Snapshot snapshot, Buffer buffer,
                             int ntups,
                             BatchMVCCState *batchmvcc,
-                            OffsetNumber *vistuples_dense)
+                            HeapTupleData *tuples)
 {
     int         nvis = 0;
     SetHintBitsState state = SHB_INITIAL;
@@ -1699,16 +1699,13 @@ HeapTupleSatisfiesMVCCBatch(Snapshot snapshot, Buffer buffer,
     for (int i = 0; i < ntups; i++)
     {
         bool        valid;
-        HeapTuple   tup = &batchmvcc->tuples[i];
+        HeapTuple   tup = &tuples[i];
 
         valid = HeapTupleSatisfiesMVCC(tup, snapshot, buffer, &state);
         batchmvcc->visible[i] = valid;
 
         if (likely(valid))
-        {
-            vistuples_dense[nvis] = tup->t_self.ip_posid;
             nvis++;
-        }
     }
 
     if (state == SHB_ENABLED)
diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h
index 2fdc50b865b..09b9566d0ac 100644
--- a/src/include/access/heapam.h
+++ b/src/include/access/heapam.h
@@ -103,7 +103,7 @@ typedef struct HeapScanDescData
     /* these fields only used in page-at-a-time mode and for bitmap scans */
     uint32      rs_cindex;      /* current tuple's index in vistuples */
     uint32      rs_ntuples;     /* number of visible tuples on page */
-    OffsetNumber rs_vistuples[MaxHeapTuplesPerPage];    /* their offsets */
+    HeapTupleData rs_vistuples[MaxHeapTuplesPerPage];   /* tuples */
 } HeapScanDescData;
 
 typedef struct HeapScanDescData *HeapScanDesc;
@@ -483,14 +483,13 @@ extern bool HeapTupleIsSurelyDead(HeapTuple htup,
  */
 typedef struct BatchMVCCState
 {
-    HeapTupleData tuples[MaxHeapTuplesPerPage];
     bool        visible[MaxHeapTuplesPerPage];
 } BatchMVCCState;
 
 extern int  HeapTupleSatisfiesMVCCBatch(Snapshot snapshot, Buffer buffer,
                                         int ntups,
                                         BatchMVCCState *batchmvcc,
-                                        OffsetNumber *vistuples_dense);
+                                        HeapTupleData *tuples);
 
 /*
  * To avoid leaking too much knowledge about reorderbuffer implementation
-- 
2.47.3