From 59837dfabd306bc17dcc02bd5f63c7bf5809f9d0 Mon Sep 17 00:00:00 2001
From: Melanie Plageman
Date: Thu, 15 Apr 2021 07:01:01 -0400
Subject: [PATCH v1] Index build avoids immed fsync

Avoid an immediate fsync for just-built indexes, either by building them
in shared buffers or by leveraging the checkpointer's SyncRequest queue.
If a checkpoint begins during an index build that is not using shared
buffers, the backend has to do its own fsync.
---
 src/backend/access/nbtree/nbtree.c  |  39 +++---
 src/backend/access/nbtree/nbtsort.c | 189 +++++++++++++++++++++++-----
 src/backend/access/transam/xlog.c   |  14 +++
 src/include/access/xlog.h           |   1 +
 4 files changed, 190 insertions(+), 53 deletions(-)

diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 1360ab80c1..ed3ee8d0e3 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -150,30 +150,29 @@ void
 btbuildempty(Relation index)
 {
 	Page		metapage;
+	Buffer		metabuf;
 
-	/* Construct metapage. */
-	metapage = (Page) palloc(BLCKSZ);
-	_bt_initmetapage(metapage, P_NONE, 0, _bt_allequalimage(index, false));
-
+	/* TODO: test this. */
 	/*
-	 * Write the page and log it.  It might seem that an immediate sync would
-	 * be sufficient to guarantee that the file exists on disk, but recovery
-	 * itself might remove it while replaying, for example, an
-	 * XLOG_DBASE_CREATE or XLOG_TBLSPC_CREATE record.  Therefore, we need
-	 * this even when wal_level=minimal.
+	 * Construct the metapage.
+	 * We don't need to lock the relation for extension (no one else knows
+	 * about the index yet), and the new page doesn't need to be initialized
+	 * here, since that is done below by _bt_initmetapage(), so _bt_getbuf()
+	 * (with P_NEW and BT_WRITE) is overkill.  However, it might be worth
+	 * either modifying it or adding a new helper function instead of
+	 * calling ReadBufferExtended() directly.  We pass mode RBM_ZERO_AND_LOCK
+	 * because we want to hold an exclusive lock on the buffer content.
 	 */
-	PageSetChecksumInplace(metapage, BTREE_METAPAGE);
-	smgrwrite(index->rd_smgr, INIT_FORKNUM, BTREE_METAPAGE,
-			  (char *) metapage, true);
-	log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
-				BTREE_METAPAGE, metapage, true);
+	metabuf = ReadBufferExtended(index, INIT_FORKNUM, P_NEW, RBM_ZERO_AND_LOCK, NULL);
 
-	/*
-	 * An immediate sync is required even if we xlog'd the page, because the
-	 * write did not go through shared_buffers and therefore a concurrent
-	 * checkpoint may have moved the redo pointer past our xlog record.
-	 */
-	smgrimmedsync(index->rd_smgr, INIT_FORKNUM);
+	metapage = BufferGetPage(metabuf);
+	_bt_initmetapage(metapage, P_NONE, 0, _bt_allequalimage(index, false));
+
+	START_CRIT_SECTION();
+	MarkBufferDirty(metabuf);
+	log_newpage_buffer(metabuf, true);
+	END_CRIT_SECTION();
+	_bt_relbuf(index, metabuf);
 }
 
 /*
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c
index 2c4d7f6e25..bde02361e1 100644
--- a/src/backend/access/nbtree/nbtsort.c
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -233,6 +233,7 @@ typedef struct BTPageState
 {
 	Page		btps_page;		/* workspace for page building */
 	BlockNumber btps_blkno;		/* block # to write this page at */
+	Buffer		btps_buf;		/* buffer to write this page to */
 	IndexTuple	btps_lowkey;	/* page's strict lower bound pivot tuple */
 	OffsetNumber btps_lastoff;	/* last item offset loaded */
 	Size		btps_lastextra; /* last item's extra posting list space */
@@ -250,9 +251,11 @@ typedef struct BTWriteState
 	Relation	index;
 	BTScanInsert inskey;		/* generic insertion scankey */
 	bool		btws_use_wal;	/* dump pages to WAL? */
-	BlockNumber btws_pages_alloced; /* # pages allocated */
-	BlockNumber btws_pages_written; /* # pages written out */
+	BlockNumber btws_pages_alloced; /* # pages allocated (unbuffered builds only) */
+	BlockNumber btws_pages_written; /* # pages written out (unbuffered builds only) */
 	Page		btws_zeropage;	/* workspace for filling zeroes */
+	XLogRecPtr	redo;			/* cached redo pointer, used to decide if a backend fsync is required at end of build */
+	bool		use_shared_buffers; /* build the index in shared buffers? */
 } BTWriteState;
 
@@ -261,10 +264,11 @@ static double _bt_spools_heapscan(Relation heap, Relation index,
 static void _bt_spooldestroy(BTSpool *btspool);
 static void _bt_spool(BTSpool *btspool, ItemPointer self,
 					  Datum *values, bool *isnull);
-static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2);
+static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2,
+						  bool use_shared_buffers);
 static void _bt_build_callback(Relation index, ItemPointer tid, Datum *values,
 							   bool *isnull, bool tupleIsAlive, void *state);
-static Page _bt_blnewpage(uint32 level);
+static Page _bt_blnewpage(uint32 level, Buffer buf);
 static BTPageState *_bt_pagestate(BTWriteState *wstate, uint32 level);
 static void _bt_slideleft(Page rightmostpage);
 static void _bt_sortaddtup(Page page, Size itemsize,
@@ -275,7 +279,8 @@ static void _bt_buildadd(BTWriteState *wstate, BTPageState *state,
 static void _bt_sort_dedup_finish_pending(BTWriteState *wstate,
 										  BTPageState *state,
 										  BTDedupState dstate);
-static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state);
+static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state,
+							  Buffer metabuf, Page metapage);
 static void _bt_load(BTWriteState *wstate,
 					 BTSpool *btspool, BTSpool *btspool2);
 static void _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent,
@@ -323,13 +328,24 @@ btbuild(Relation heap, Relation index, IndexInfo *indexInfo)
 			 RelationGetRelationName(index));
 
 	reltuples = _bt_spools_heapscan(heap, index, &buildstate, indexInfo);
 
+	/*
+	 * Based on the number of tuples, either create a buffered or unbuffered
+	 * write state.
+	 * If the number of tuples is small, make a buffered
+	 * write, i.e. use shared buffers.  If the number of tuples is larger,
+	 * make an unbuffered write, and remember the redo pointer so that we
+	 * can check it at the end of the build to know whether or not we need
+	 * to fsync ourselves.
+	 */
+
 	/*
 	 * Finish the build by (1) completing the sort of the spool file, (2)
 	 * inserting the sorted tuples into btree pages and (3) building the upper
 	 * levels.  Finally, it may also be necessary to end use of parallelism.
 	 */
-	_bt_leafbuild(buildstate.spool, buildstate.spool2);
+	if (reltuples > 1000)
+		_bt_leafbuild(buildstate.spool, buildstate.spool2, false);
+	else
+		_bt_leafbuild(buildstate.spool, buildstate.spool2, true);
+
 	_bt_spooldestroy(buildstate.spool);
 	if (buildstate.spool2)
 		_bt_spooldestroy(buildstate.spool2);
@@ -535,7 +551,7 @@ _bt_spool(BTSpool *btspool, ItemPointer self, Datum *values, bool *isnull)
  * create an entire btree.
  */
 static void
-_bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
+_bt_leafbuild(BTSpool *btspool, BTSpool *btspool2, bool use_shared_buffers)
 {
 	BTWriteState wstate;
 
@@ -565,9 +581,11 @@ _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
 	wstate.btws_use_wal = RelationNeedsWAL(wstate.index);
 
-	/* reserve the metapage */
-	wstate.btws_pages_alloced = BTREE_METAPAGE + 1;
+	/* the metapage is reserved later, in _bt_load() */
+	wstate.btws_pages_alloced = 0;
 	wstate.btws_pages_written = 0;
 	wstate.btws_zeropage = NULL;	/* until needed */
+	wstate.redo = InvalidXLogRecPtr;
+	wstate.use_shared_buffers = use_shared_buffers;
 
 	pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
 								 PROGRESS_BTREE_PHASE_LEAF_LOAD);
@@ -605,14 +623,18 @@ _bt_build_callback(Relation index,
 
 /*
  * allocate workspace for a new, clean btree page, not linked to any siblings.
+ * If the index is not built in shared buffers, buf must be InvalidBuffer.
 */
 static Page
-_bt_blnewpage(uint32 level)
+_bt_blnewpage(uint32 level, Buffer buf)
 {
 	Page		page;
 	BTPageOpaque opaque;
 
-	page = (Page) palloc(BLCKSZ);
+	if (BufferIsValid(buf))
+		page = BufferGetPage(buf);
+	else
+		page = (Page) palloc(BLCKSZ);
 
 	/* Zero the page and set up standard page header info */
 	_bt_pageinit(page, BLCKSZ);
@@ -634,8 +656,20 @@ _bt_blnewpage(uint32 level)
  * emit a completed btree page, and release the working storage.
 */
 static void
-_bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno)
+_bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno, Buffer buf)
 {
+	if (wstate->use_shared_buffers)
+	{
+		Assert(BufferIsValid(buf));
+		START_CRIT_SECTION();
+		MarkBufferDirty(buf);
+		if (wstate->btws_use_wal)
+			log_newpage_buffer(buf, true);
+		END_CRIT_SECTION();
+		_bt_relbuf(wstate->index, buf);
+		return;
+	}
+
 	/* Ensure rd_smgr is open (could have been closed by relcache flush!) */
 	RelationOpenSmgr(wstate->index);
 
@@ -661,7 +695,7 @@ _bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno)
 		smgrextend(wstate->index->rd_smgr, MAIN_FORKNUM,
 				   wstate->btws_pages_written++,
 				   (char *) wstate->btws_zeropage,
-				   true);
+				   false);
 	}
 
 	PageSetChecksumInplace(page, blkno);
@@ -674,14 +708,14 @@ _bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno)
 	{
 		/* extending the file... */
 		smgrextend(wstate->index->rd_smgr, MAIN_FORKNUM, blkno,
-				   (char *) page, true);
+				   (char *) page, false);
 		wstate->btws_pages_written++;
 	}
 	else
 	{
 		/* overwriting a block we zero-filled before */
 		smgrwrite(wstate->index->rd_smgr, MAIN_FORKNUM, blkno,
-				  (char *) page, true);
+				  (char *) page, false);
 	}
 
 	pfree(page);
@@ -694,13 +728,37 @@ _bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno)
 static BTPageState *
 _bt_pagestate(BTWriteState *wstate, uint32 level)
 {
+	Buffer		buf;
+	BlockNumber blkno;
+
 	BTPageState *state = (BTPageState *) palloc0(sizeof(BTPageState));
 
-	/* create initial page for level */
-	state->btps_page = _bt_blnewpage(level);
+	if (wstate->use_shared_buffers)
+	{
+		/*
+		 * We don't need to lock the relation for extension (no one else
+		 * knows about the index yet), and the new page doesn't need to be
+		 * initialized here, since that is done below by _bt_blnewpage(), so
+		 * _bt_getbuf() (with P_NEW and BT_WRITE) is overkill.  However, it
+		 * might be worth either modifying it or adding a new helper function
+		 * instead of calling ReadBufferExtended() directly.  We pass mode
+		 * RBM_ZERO_AND_LOCK because we want to hold an exclusive lock on the
+		 * buffer content.
+		 */
+		buf = ReadBufferExtended(wstate->index, MAIN_FORKNUM, P_NEW, RBM_ZERO_AND_LOCK, NULL);
+
+		blkno = BufferGetBlockNumber(buf);
+	}
+	else
+	{
+		buf = InvalidBuffer;
+		blkno = wstate->btws_pages_alloced++;
+	}
+
+	/* create initial page for level */
+	state->btps_page = _bt_blnewpage(level, buf);
 
 	/* and assign it a page position */
-	state->btps_blkno = wstate->btws_pages_alloced++;
+	state->btps_blkno = blkno;
+	state->btps_buf = buf;
 
 	state->btps_lowkey = NULL;
 	/* initialize lastoff so first item goes into P_FIRSTKEY */
@@ -835,6 +893,7 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup,
 {
 	Page		npage;
 	BlockNumber nblkno;
+	Buffer		nbuf;
 	OffsetNumber last_off;
 	Size		last_truncextra;
 	Size		pgspc;
@@ -849,6 +908,7 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup,
 
 	npage = state->btps_page;
 	nblkno = state->btps_blkno;
+	nbuf = state->btps_buf;
 	last_off = state->btps_lastoff;
 	last_truncextra = state->btps_lastextra;
 	state->btps_lastextra = truncextra;
@@ -905,16 +965,37 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup,
 		 */
 		Page		opage = npage;
 		BlockNumber oblkno = nblkno;
+		Buffer		obuf = nbuf;
 		ItemId		ii;
 		ItemId		hii;
 		IndexTuple	oitup;
 
-		/* Create new page of same level */
-		npage = _bt_blnewpage(state->btps_level);
+		if (wstate->use_shared_buffers)
+		{
+			/*
+			 * Get a new shared buffer.
+			 * We don't need to lock the relation for extension (no one else
+			 * knows about the index yet), and the new page doesn't need to
+			 * be initialized here, since that is done below by
+			 * _bt_blnewpage(), so _bt_getbuf() (with P_NEW and BT_WRITE) is
+			 * overkill.  However, it might be worth either modifying it or
+			 * adding a new helper function instead of calling
+			 * ReadBufferExtended() directly.
+			 * We pass mode RBM_ZERO_AND_LOCK because we want to hold an
+			 * exclusive lock on the buffer content.
+			 */
+			nbuf = ReadBufferExtended(wstate->index, MAIN_FORKNUM, P_NEW, RBM_ZERO_AND_LOCK, NULL);
 
-		/* and assign it a page position */
-		nblkno = wstate->btws_pages_alloced++;
+			/* assign a page position */
+			nblkno = BufferGetBlockNumber(nbuf);
+		}
+		else
+		{
+			nbuf = InvalidBuffer;
+			/* assign a page position */
+			nblkno = wstate->btws_pages_alloced++;
+		}
+
+		/* Create new page of same level */
+		npage = _bt_blnewpage(state->btps_level, nbuf);
 
 		/*
 		 * We copy the last item on the page into the new page, and then
 		 * rearrange the old page so that the 'last item' becomes its high key
@@ -1023,10 +1104,10 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup,
 
 		/*
 		 * Write out the old page.  We never need to touch it again, so we can
-		 * free the opage workspace too.
+		 * free the opage workspace too.  _bt_blwritepage() releases obuf, so
+		 * obuf must not be used after this call.
 		 */
-		_bt_blwritepage(wstate, opage, oblkno);
-
+		_bt_blwritepage(wstate, opage, oblkno, obuf);
 		/*
 		 * Reset last_off to point to new page
 		 */
@@ -1060,6 +1141,7 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup,
 
 	state->btps_page = npage;
 	state->btps_blkno = nblkno;
+	state->btps_buf = nbuf;
 	state->btps_lastoff = last_off;
 }
 
@@ -1105,12 +1187,11 @@ _bt_sort_dedup_finish_pending(BTWriteState *wstate, BTPageState *state,
  * Finish writing out the completed btree.
  */
 static void
-_bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
+_bt_uppershutdown(BTWriteState *wstate, BTPageState *state, Buffer metabuf, Page metapage)
 {
 	BTPageState *s;
 	BlockNumber rootblkno = P_NONE;
 	uint32		rootlevel = 0;
-	Page		metapage;
 
 	/*
 	 * Each iteration of this loop completes one more level of the tree.
@@ -1156,20 +1237,22 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
 		 * back one slot.  Then we can dump out the page.
 		 */
 		_bt_slideleft(s->btps_page);
-		_bt_blwritepage(wstate, s->btps_page, s->btps_blkno);
+		_bt_blwritepage(wstate, s->btps_page, s->btps_blkno, s->btps_buf);
+		s->btps_buf = InvalidBuffer;
 		s->btps_page = NULL;	/* writepage freed the workspace */
 	}
 
 	/*
-	 * As the last step in the process, construct the metapage and make it
+	 * As the last step in the process, initialize the metapage and make it
 	 * point to the new root (unless we had no data at all, in which case it's
 	 * set to point to "P_NONE").  This changes the index to the "valid" state
 	 * by filling in a valid magic number in the metapage.
+	 * After this, the metapage workspace will have been freed or, if it came
+	 * from a shared buffer, metabuf will have been released.
 	 */
-	metapage = (Page) palloc(BLCKSZ);
 	_bt_initmetapage(metapage, rootblkno, rootlevel,
 					 wstate->inskey->allequalimage);
-	_bt_blwritepage(wstate, metapage, BTREE_METAPAGE);
+	_bt_blwritepage(wstate, metapage, BTREE_METAPAGE, metabuf);
 }
 
 /*
@@ -1190,10 +1273,47 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
 	SortSupport sortKeys;
 	int64		tuples_done = 0;
 	bool		deduplicate;
+	Buffer		metabuf;
+	Page		metapage;
 
 	deduplicate = wstate->inskey->allequalimage && !btspool->isunique &&
 		BTGetDeduplicateItems(wstate->index);
 
+	/*
+	 * Extend the index relation upfront to reserve the metapage.
+	 */
+	if (wstate->use_shared_buffers)
+	{
+		/*
+		 * We should not need LockRelationForExtension() here, as no one else
+		 * knows about this index yet.
+		 * Extend the index relation by one block for the metapage.
+		 * _bt_getbuf() is not used here because it does _bt_pageinit(),
+		 * which is done later by _bt_initmetapage().
+		 * We will fill in the metapage and write it out at the end of the
+		 * index build, when we have all of the information required for the
+		 * metapage.  However, we extend the relation for it now, so that the
+		 * metapage occupies block 0, since it is much easier when using
+		 * shared buffers to extend the relation with block numbers that
+		 * always increase by 1.  Also, by passing RBM_ZERO_AND_LOCK, we hold
+		 * LW_EXCLUSIVE on the buffer content and thus don't need _bt_lockbuf().
+		 */
+		metabuf = ReadBufferExtended(wstate->index, MAIN_FORKNUM, P_NEW, RBM_ZERO_AND_LOCK, NULL);
+		metapage = BufferGetPage(metabuf);
+	}
+	else
+	{
+		wstate->redo = GetRedoRecPtr();
+		metabuf = InvalidBuffer;
+		metapage = (Page) palloc0(BLCKSZ);
+		RelationOpenSmgr(wstate->index);
+
+		/* extending the file... */
+		smgrextend(wstate->index->rd_smgr, MAIN_FORKNUM, BTREE_METAPAGE,
+				   (char *) metapage, false);
+		wstate->btws_pages_written++;
+		wstate->btws_pages_alloced++;
+	}
+
 	if (merge)
 	{
 		/*
@@ -1415,7 +1535,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
 	}
 
 	/* Close down final pages and write the metapage */
-	_bt_uppershutdown(wstate, state);
+	_bt_uppershutdown(wstate, state, metabuf, metapage);
 
 	/*
 	 * When we WAL-logged index pages, we must nonetheless fsync index files.
@@ -1428,8 +1548,11 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
 	 */
 	if (wstate->btws_use_wal)
 	{
-		RelationOpenSmgr(wstate->index);
-		smgrimmedsync(wstate->index->rd_smgr, MAIN_FORKNUM);
+		if (!wstate->use_shared_buffers && RedoRecPtrChanged(wstate->redo))
+		{
+			RelationOpenSmgr(wstate->index);
+			smgrimmedsync(wstate->index->rd_smgr, MAIN_FORKNUM);
+		}
 	}
 }
 
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index adfc6f67e2..d3b6c60278 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -8546,6 +8546,20 @@ GetRedoRecPtr(void)
 	return RedoRecPtr;
 }
 
+bool
+RedoRecPtrChanged(XLogRecPtr comparator_ptr)
+{
+	XLogRecPtr	ptr;
+
+	SpinLockAcquire(&XLogCtl->info_lck);
+	ptr = XLogCtl->RedoRecPtr;
+	SpinLockRelease(&XLogCtl->info_lck);
+
+	if (RedoRecPtr < ptr)
+		RedoRecPtr = ptr;
+	return RedoRecPtr != comparator_ptr;
+}
+
 /*
  * Return information needed to decide whether a modified block needs a
  * full-page image to be included in the WAL record.
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index f542af0a26..44e4b01559 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -346,6 +346,7 @@ extern XLogRecPtr XLogRestorePoint(const char *rpName);
 extern void UpdateFullPageWrites(void);
 extern void GetFullPageWriteInfo(XLogRecPtr *RedoRecPtr_p, bool *doPageWrites_p);
 extern XLogRecPtr GetRedoRecPtr(void);
+extern bool RedoRecPtrChanged(XLogRecPtr comparator_ptr);
 extern XLogRecPtr GetInsertRecPtr(void);
 extern XLogRecPtr GetFlushRecPtr(void);
 extern XLogRecPtr GetLastImportantRecPtr(void);
-- 
2.27.0
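
A note on the unbuffered path, for readers unfamiliar with the sync
machinery: every smgrextend()/smgrwrite() in the patch now passes
skipFsync = false, so each write registers a sync request in the
checkpointer's queue.  The backend only needs its own smgrimmedsync() if
a checkpoint began while the build was in progress (that is, if the redo
pointer moved), because such a checkpoint may have completed without
seeing those requests.  The sketch below restates that end-of-build
decision in isolation.  The function name unbuffered_build_finish and the
parameter redo_at_start are illustrative only, not part of the patch; in
the patch this logic lives at the end of _bt_load(), with the redo
pointer cached in BTWriteState.redo before the first page is written.

    #include "postgres.h"

    #include "access/xlog.h"
    #include "storage/smgr.h"
    #include "utils/rel.h"

    /*
     * Hypothetical helper, not in the patch: finish an index build done
     * outside shared buffers.  redo_at_start must have been obtained from
     * GetRedoRecPtr() before the first page of the index was written.
     */
    static void
    unbuffered_build_finish(Relation index, XLogRecPtr redo_at_start)
    {
        /*
         * RedoRecPtrChanged() returns true if a checkpoint has begun since
         * redo_at_start was taken.  Only then must this backend fsync the
         * index itself; otherwise the sync requests queued by its writes
         * are guaranteed to be processed by the next checkpoint.
         */
        if (RedoRecPtrChanged(redo_at_start))
        {
            /* rd_smgr may have been closed by a relcache flush */
            RelationOpenSmgr(index);
            smgrimmedsync(index->rd_smgr, MAIN_FORKNUM);
        }
    }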