From e85497578afa4b3031050652daeec6fc6c8953c4 Mon Sep 17 00:00:00 2001 From: Melanie Plageman Date: Thu, 20 Feb 2025 12:24:49 -0500 Subject: [PATCH v31 1/4] Delay extraction of TIDBitmap per page offsets Pages from the bitmap created by the TIDBitmap API can be exact or lossy. Exact pages extract all the tuple offsets from the bitmap and put them in an array for the convenience of the caller. This was done in tbm_private|shared_iterate(). However, as long as tbm_private|shared_iterate() set a reference to the PagetableEntry in the TBMIterateResult, the offset extraction can be done later. Waiting to extract the tuple offsets has a few benefits. For the shared iterator case, it allows us to extract the offsets after dropping the shared iterator state lock, reducing time spent holding a contended lock. Separating the iteration step and extracting the offsets later also allows us to avoid extracting the offsets for prefetched blocks. Those offsets were never used, so the overhead of extracting and storing them was wasted. The real motivation for this change, however, is that future commits will make bitmap heap scan use the read stream API. This requires a TBMIterateResult per issued block. By removing the array of tuple offsets from the TBMIterateResult and only extracting the offsets when they are used, we reduce the memory required for per buffer data substantially. Suggested-by: Thomas Munro Discussion: https://postgr.es/m/CA%2BhUKGLHbKP3jwJ6_%2BhnGi37Pw3BD5j2amjV3oSk7j-KyCnY7Q%40mail.gmail.com --- src/backend/access/gin/ginget.c | 16 ++++++-- src/backend/access/heap/heapam_handler.c | 10 ++++- src/backend/nodes/tidbitmap.c | 52 +++++++++--------------- src/include/access/gin_private.h | 1 + src/include/nodes/tidbitmap.h | 42 ++++++++++++++++--- 5 files changed, 80 insertions(+), 41 deletions(-) diff --git a/src/backend/access/gin/ginget.c b/src/backend/access/gin/ginget.c index 63dd1f3679f..9fbe178ad47 100644 --- a/src/backend/access/gin/ginget.c +++ b/src/backend/access/gin/ginget.c @@ -845,6 +845,15 @@ entryGetItem(GinState *ginstate, GinScanEntry entry, break; } + /* + * Exact pages need their tuple offsets extracted. + * tbm_extract_page_tuple() will set ntuples to the correct + * number. + */ + if (entry->matchResult->ntuples == -2) + tbm_extract_page_tuple(entry->matchResult, + entry->matchTupleOffsets); + /* * Reset counter to the beginning of entry->matchResult. Note: * entry->offset is still greater than matchResult->ntuples if @@ -884,20 +893,21 @@ entryGetItem(GinState *ginstate, GinScanEntry entry, * page. If that's > advancePast, so are all the other * offsets, so just go back to the top to get the next page. */ - if (entry->matchResult->offsets[entry->matchResult->ntuples - 1] <= advancePastOff) + if (entry->matchTupleOffsets[entry->matchResult->ntuples - 1] <= + advancePastOff) { entry->offset = entry->matchResult->ntuples; continue; } /* Otherwise scan to find the first item > advancePast */ - while (entry->matchResult->offsets[entry->offset] <= advancePastOff) + while (entry->matchTupleOffsets[entry->offset] <= advancePastOff) entry->offset++; } ItemPointerSet(&entry->curItem, entry->matchResult->blockno, - entry->matchResult->offsets[entry->offset]); + entry->matchTupleOffsets[entry->offset]); entry->offset++; /* Done unless we need to reduce the result */ diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index c0bec014154..408d4b44240 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -2127,6 +2127,7 @@ heapam_scan_bitmap_next_block(TableScanDesc scan, Snapshot snapshot; int ntup; TBMIterateResult *tbmres; + TBMOffsets offsets; Assert(scan->rs_flags & SO_TYPE_BITMAPSCAN); @@ -2145,6 +2146,13 @@ heapam_scan_bitmap_next_block(TableScanDesc scan, if (tbmres == NULL) return false; + /* + * Exact pages need their tuple offsets extracted. + * tbm_extract_page_tuple() will set ntuples to the correct number. + */ + if (tbmres->ntuples == -2) + tbm_extract_page_tuple(tbmres, offsets); + /* * Ignore any claimed entries past what we think is the end of the * relation. It may have been extended after the start of our scan (we @@ -2218,7 +2226,7 @@ heapam_scan_bitmap_next_block(TableScanDesc scan, for (curslot = 0; curslot < tbmres->ntuples; curslot++) { - OffsetNumber offnum = tbmres->offsets[curslot]; + OffsetNumber offnum = offsets[curslot]; ItemPointerData tid; HeapTupleData heapTuple; diff --git a/src/backend/nodes/tidbitmap.c b/src/backend/nodes/tidbitmap.c index 66b3c387d53..13ba247824d 100644 --- a/src/backend/nodes/tidbitmap.c +++ b/src/backend/nodes/tidbitmap.c @@ -40,7 +40,6 @@ #include -#include "access/htup_details.h" #include "common/hashfn.h" #include "common/int.h" #include "nodes/bitmapset.h" @@ -48,14 +47,6 @@ #include "storage/lwlock.h" #include "utils/dsa.h" -/* - * The maximum number of tuples per page is not large (typically 256 with - * 8K pages, or 1024 with 32K pages). So there's not much point in making - * the per-page bitmaps variable size. We just legislate that the size - * is this: - */ -#define MAX_TUPLES_PER_PAGE MaxHeapTuplesPerPage - /* * When we have to switch over to lossy storage, we use a data structure * with one bit per page, where all pages having the same number DIV @@ -67,7 +58,7 @@ * table, using identical data structures. (This is because the memory * management for hashtables doesn't easily/efficiently allow space to be * transferred easily from one hashtable to another.) Therefore it's best - * if PAGES_PER_CHUNK is the same as MAX_TUPLES_PER_PAGE, or at least not + * if PAGES_PER_CHUNK is the same as TBM_MAX_TUPLES_PER_PAGE, or at least not * too different. But we also want PAGES_PER_CHUNK to be a power of 2 to * avoid expensive integer remainder operations. So, define it like this: */ @@ -79,7 +70,7 @@ #define BITNUM(x) ((x) % BITS_PER_BITMAPWORD) /* number of active words for an exact page: */ -#define WORDS_PER_PAGE ((MAX_TUPLES_PER_PAGE - 1) / BITS_PER_BITMAPWORD + 1) +#define WORDS_PER_PAGE ((TBM_MAX_TUPLES_PER_PAGE - 1) / BITS_PER_BITMAPWORD + 1) /* number of active words for a lossy chunk: */ #define WORDS_PER_CHUNK ((PAGES_PER_CHUNK - 1) / BITS_PER_BITMAPWORD + 1) @@ -181,7 +172,7 @@ struct TBMPrivateIterator int spageptr; /* next spages index */ int schunkptr; /* next schunks index */ int schunkbit; /* next bit to check in current schunk */ - TBMIterateResult output; /* MUST BE LAST (because variable-size) */ + TBMIterateResult output; }; /* @@ -222,7 +213,7 @@ struct TBMSharedIterator PTEntryArray *ptbase; /* pagetable element array */ PTIterationArray *ptpages; /* sorted exact page index list */ PTIterationArray *ptchunks; /* sorted lossy page index list */ - TBMIterateResult output; /* MUST BE LAST (because variable-size) */ + TBMIterateResult output; }; /* Local function prototypes */ @@ -390,7 +381,7 @@ tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids, bitnum; /* safety check to ensure we don't overrun bit array bounds */ - if (off < 1 || off > MAX_TUPLES_PER_PAGE) + if (off < 1 || off > TBM_MAX_TUPLES_PER_PAGE) elog(ERROR, "tuple offset out of range: %u", off); /* @@ -696,9 +687,7 @@ tbm_begin_private_iterate(TIDBitmap *tbm) * Create the TBMPrivateIterator struct, with enough trailing space to * serve the needs of the TBMIterateResult sub-struct. */ - iterator = (TBMPrivateIterator *) palloc(sizeof(TBMPrivateIterator) + - MAX_TUPLES_PER_PAGE * - sizeof(OffsetNumber)); + iterator = (TBMPrivateIterator *) palloc(sizeof(TBMPrivateIterator)); iterator->tbm = tbm; /* @@ -906,11 +895,13 @@ tbm_prepare_shared_iterate(TIDBitmap *tbm) /* * tbm_extract_page_tuple - extract the tuple offsets from a page * - * The extracted offsets are stored into TBMIterateResult. + * Sets ntuples in the TBMIterateResult */ -static inline int -tbm_extract_page_tuple(PagetableEntry *page, TBMIterateResult *output) +void +tbm_extract_page_tuple(TBMIterateResult *iteritem, + TBMOffsets offsets) { + PagetableEntry *page = iteritem->internal_page; int wordnum; int ntuples = 0; @@ -925,14 +916,14 @@ tbm_extract_page_tuple(PagetableEntry *page, TBMIterateResult *output) while (w != 0) { if (w & 1) - output->offsets[ntuples++] = (OffsetNumber) off; + offsets[ntuples++] = (OffsetNumber) off; off++; w >>= 1; } } } - return ntuples; + iteritem->ntuples = ntuples; } /* @@ -1021,7 +1012,6 @@ tbm_private_iterate(TBMPrivateIterator *iterator) if (iterator->spageptr < tbm->npages) { PagetableEntry *page; - int ntuples; /* In TBM_ONE_PAGE state, we don't allocate an spages[] array */ if (tbm->status == TBM_ONE_PAGE) @@ -1029,10 +1019,10 @@ tbm_private_iterate(TBMPrivateIterator *iterator) else page = tbm->spages[iterator->spageptr]; - /* scan bitmap to extract individual offset numbers */ - ntuples = tbm_extract_page_tuple(page, output); + output->internal_page = page; + /* ntuples will be calculated later */ + output->ntuples = -2; output->blockno = page->blockno; - output->ntuples = ntuples; output->recheck = page->recheck; iterator->spageptr++; return output; @@ -1116,12 +1106,11 @@ tbm_shared_iterate(TBMSharedIterator *iterator) if (istate->spageptr < istate->npages) { PagetableEntry *page = &ptbase[idxpages[istate->spageptr]]; - int ntuples; - /* scan bitmap to extract individual offset numbers */ - ntuples = tbm_extract_page_tuple(page, output); + output->internal_page = page; + /* ntuples will be calculated later */ + output->ntuples = -2; output->blockno = page->blockno; - output->ntuples = ntuples; output->recheck = page->recheck; istate->spageptr++; @@ -1468,8 +1457,7 @@ tbm_attach_shared_iterate(dsa_area *dsa, dsa_pointer dp) * Create the TBMSharedIterator struct, with enough trailing space to * serve the needs of the TBMIterateResult sub-struct. */ - iterator = (TBMSharedIterator *) palloc0(sizeof(TBMSharedIterator) + - MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber)); + iterator = (TBMSharedIterator *) palloc0(sizeof(TBMSharedIterator)); istate = (TBMSharedIteratorState *) dsa_get_address(dsa, dp); diff --git a/src/include/access/gin_private.h b/src/include/access/gin_private.h index dcd1ae3fc34..9a9461cdfd1 100644 --- a/src/include/access/gin_private.h +++ b/src/include/access/gin_private.h @@ -354,6 +354,7 @@ typedef struct GinScanEntryData TIDBitmap *matchBitmap; TBMPrivateIterator *matchIterator; TBMIterateResult *matchResult; + TBMOffsets matchTupleOffsets; /* used for Posting list and one page in Posting tree */ ItemPointerData *list; diff --git a/src/include/nodes/tidbitmap.h b/src/include/nodes/tidbitmap.h index a6ffeac90be..17462fb554f 100644 --- a/src/include/nodes/tidbitmap.h +++ b/src/include/nodes/tidbitmap.h @@ -22,9 +22,22 @@ #ifndef TIDBITMAP_H #define TIDBITMAP_H +#include "access/htup_details.h" #include "storage/itemptr.h" #include "utils/dsa.h" +/* + * The maximum number of tuples per page is not large (typically 256 with + * 8K pages, or 1024 with 32K pages). So there's not much point in making + * the per-page bitmaps variable size. We just legislate that the size + * is this: + */ +#define TBM_MAX_TUPLES_PER_PAGE MaxHeapTuplesPerPage + +/* + * The tuple OffsetNumbers extracted from a single page in the bitmap. + */ +typedef OffsetNumber TBMOffsets[TBM_MAX_TUPLES_PER_PAGE]; /* * Actual bitmap representation is private to tidbitmap.c. Callers can @@ -36,6 +49,7 @@ typedef struct TIDBitmap TIDBitmap; typedef struct TBMPrivateIterator TBMPrivateIterator; typedef struct TBMSharedIterator TBMSharedIterator; + /* * Callers with both private and shared implementations can use this unified * API. @@ -53,11 +67,26 @@ typedef struct TBMIterator /* Result structure for tbm_iterate */ typedef struct TBMIterateResult { - BlockNumber blockno; /* page number containing tuples */ - int ntuples; /* -1 indicates lossy result */ - bool recheck; /* should the tuples be rechecked? */ - /* Note: recheck is always true if ntuples < 0 */ - OffsetNumber offsets[FLEXIBLE_ARRAY_MEMBER]; + BlockNumber blockno; /* block number containing tuples */ + + /* + * -1 indicates a lossy page. -2 indicates an exact page whose tuple + * offsets have yet to be extracted from the bitmap. + */ + int ntuples; + + /* + * Whether or not the tuples should be rechecked. This is always true if + * ntuples < 0 but may also be true if the query requires recheck. + */ + bool recheck; + + /* + * Pointer to the page containing the bitmap for this block. It is a void * + * to avoid exposing the details of the tidbitmap PagetableEntry to API + * users. + */ + void *internal_page; } TBMIterateResult; /* function prototypes in nodes/tidbitmap.c */ @@ -74,6 +103,9 @@ extern void tbm_add_page(TIDBitmap *tbm, BlockNumber pageno); extern void tbm_union(TIDBitmap *a, const TIDBitmap *b); extern void tbm_intersect(TIDBitmap *a, const TIDBitmap *b); +extern void tbm_extract_page_tuple(TBMIterateResult *iteritem, + TBMOffsets offsets); + extern bool tbm_is_empty(const TIDBitmap *tbm); extern TBMPrivateIterator *tbm_begin_private_iterate(TIDBitmap *tbm); -- 2.34.1