From 029ae568865d36c6267f5aa963b6f0817c154aba Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Thu, 17 Oct 2024 14:14:35 -0400 Subject: [PATCH v10 7/8] WIP: bufmgr: Don't copy pages while writing out After the series of preceding commits introducing and using BufferBeginSetHintBits()/BufferSetHintBits16() hint bits are not set anymore while IO is going on. Therefore we do not need to copy pages while they are being written out anymore. TODO: Update comments Author: Reviewed-by: Discussion: https://postgr.es/m/ Backpatch: --- src/include/storage/bufpage.h | 3 +- src/backend/access/hash/hashpage.c | 2 +- src/backend/access/transam/xloginsert.c | 43 ++++++---------------- src/backend/storage/buffer/bufmgr.c | 21 +++++------ src/backend/storage/buffer/localbuf.c | 2 +- src/backend/storage/page/bufpage.c | 48 ++++--------------------- src/backend/storage/smgr/bulk_write.c | 2 +- src/test/modules/test_aio/test_aio.c | 2 +- 8 files changed, 33 insertions(+), 90 deletions(-) diff --git a/src/include/storage/bufpage.h b/src/include/storage/bufpage.h index ae3725b3b81..31ec9a8a047 100644 --- a/src/include/storage/bufpage.h +++ b/src/include/storage/bufpage.h @@ -504,7 +504,6 @@ extern void PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems); extern void PageIndexTupleDeleteNoCompact(Page page, OffsetNumber offnum); extern bool PageIndexTupleOverwrite(Page page, OffsetNumber offnum, const void *newtup, Size newsize); -extern char *PageSetChecksumCopy(Page page, BlockNumber blkno); -extern void PageSetChecksumInplace(Page page, BlockNumber blkno); +extern void PageSetChecksum(Page page, BlockNumber blkno); #endif /* BUFPAGE_H */ diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c index 8e220a3ae16..52c20208c66 100644 --- a/src/backend/access/hash/hashpage.c +++ b/src/backend/access/hash/hashpage.c @@ -1029,7 +1029,7 @@ _hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks) zerobuf.data, true); - PageSetChecksumInplace(page, lastblock); + PageSetChecksum(page, lastblock); smgrextend(RelationGetSmgr(rel), MAIN_FORKNUM, lastblock, zerobuf.data, false); diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index 92c48e768c3..4bab484fd5d 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -261,8 +261,11 @@ XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags) */ #ifdef USE_ASSERT_CHECKING if (!(flags & REGBUF_NO_CHANGE)) - Assert(BufferIsLockedByMeInMode(buffer, BUFFER_LOCK_EXCLUSIVE) && - BufferIsDirty(buffer)); + { + Assert(BufferIsDirty(buffer)); + Assert(BufferIsLockedByMeInMode(buffer, BUFFER_LOCK_EXCLUSIVE) || + BufferIsLockedByMeInMode(buffer, BUFFER_LOCK_SHARE_EXCLUSIVE)); + } #endif if (block_id >= max_registered_block_id) @@ -1066,7 +1069,7 @@ XLogCheckBufferNeedsBackup(Buffer buffer) * Write a backup block if needed when we are setting a hint. Note that * this may be called for a variety of page types, not just heaps. * - * Callable while holding just share lock on the buffer content. + * Callable while holding just share-exclusive lock on the buffer content. * * We can't use the plain backup block mechanism since that relies on the * Buffer being exclusively locked. Since some modifications (setting LSN, hint @@ -1074,6 +1077,8 @@ XLogCheckBufferNeedsBackup(Buffer buffer) * failures. So instead we copy the page and insert the copied data as normal * record data. * + * FIXME: outdated + * * We only need to do something if page has not yet been full page written in * this checkpoint round. The LSN of the inserted wal record is returned if we * had to write, InvalidXLogRecPtr otherwise. @@ -1102,46 +1107,20 @@ XLogSaveBufferForHint(Buffer buffer, bool buffer_std) /* * We assume page LSN is first data on *every* page that can be passed to - * XLogInsert, whether it has the standard page layout or not. Since we're - * only holding a share-lock on the page, we must take the buffer header - * lock when we look at the LSN. + * XLogInsert, whether it has the standard page layout or not. */ lsn = BufferGetLSNAtomic(buffer); if (lsn <= RedoRecPtr) { - int flags = 0; - PGAlignedBlock copied_buffer; - char *origdata = (char *) BufferGetBlock(buffer); - RelFileLocator rlocator; - ForkNumber forkno; - BlockNumber blkno; - - /* - * Copy buffer so we don't have to worry about concurrent hint bit or - * lsn updates. We assume pd_lower/upper cannot be changed without an - * exclusive lock, so the contents bkp are not racy. - */ - if (buffer_std) - { - /* Assume we can omit data between pd_lower and pd_upper */ - Page page = BufferGetPage(buffer); - uint16 lower = ((PageHeader) page)->pd_lower; - uint16 upper = ((PageHeader) page)->pd_upper; - - memcpy(copied_buffer.data, origdata, lower); - memcpy(copied_buffer.data + upper, origdata + upper, BLCKSZ - upper); - } - else - memcpy(copied_buffer.data, origdata, BLCKSZ); + int flags = REGBUF_NO_CHANGE; XLogBeginInsert(); if (buffer_std) flags |= REGBUF_STANDARD; - BufferGetTag(buffer, &rlocator, &forkno, &blkno); - XLogRegisterBlock(0, &rlocator, forkno, blkno, copied_buffer.data, flags); + XLogRegisterBuffer(0, buffer, flags); recptr = XLogInsert(RM_XLOG_ID, XLOG_FPI_FOR_HINT); } diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 9574baa36cb..e114e64fdd9 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -4416,7 +4416,6 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object, ErrorContextCallback errcallback; instr_time io_start; Block bufBlock; - char *bufToWrite; uint64 buf_state; Assert(BufferLockHeldByMeInMode(buf, BUFFER_LOCK_EXCLUSIVE) || @@ -4487,12 +4486,8 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object, */ bufBlock = BufHdrGetBlock(buf); - /* - * Update page checksum if desired. Since we have only shared lock on the - * buffer, other processes might be updating hint bits in it, so we must - * copy the page to private storage if we do checksumming. - */ - bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum); + /* Update page checksum if desired. */ + PageSetChecksum((Page) bufBlock, buf->tag.blockNum); io_start = pgstat_prepare_io_time(track_io_timing); @@ -4502,7 +4497,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object, smgrwrite(reln, BufTagGetForkNum(&buf->tag), buf->tag.blockNum, - bufToWrite, + bufBlock, false); /* @@ -4626,8 +4621,8 @@ BufferIsPermanent(Buffer buffer) /* * BufferGetLSNAtomic * Retrieves the LSN of the buffer atomically using a buffer header lock. - * This is necessary for some callers who may not have an exclusive lock - * on the buffer. + * This is necessary for some callers who may not have a (share-)exclusive + * lock on the buffer. */ XLogRecPtr BufferGetLSNAtomic(Buffer buffer) @@ -5679,6 +5674,12 @@ MarkSharedBufferDirtyHint(Buffer buffer, BufferDesc *bufHdr, uint64 lockstate, b * It's possible we may enter here without an xid, so it is * essential that CreateCheckPoint waits for virtual transactions * rather than full transactionids. + * + * FIXME: I think we now should simply mark the page dirty before + * WAL logging the hint bit - afaikt it then should work just like + * any other buffer write (due to SyncBuffers()/SyncOneBuffer() + * seeing the dirty bit and trying to lock the page + * share-exclusive, and thus having to wait). */ Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0); MyProc->delayChkptFlags |= DELAY_CHKPT_START; diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c index 04a540379a2..55e17e03acb 100644 --- a/src/backend/storage/buffer/localbuf.c +++ b/src/backend/storage/buffer/localbuf.c @@ -199,7 +199,7 @@ FlushLocalBuffer(BufferDesc *bufHdr, SMgrRelation reln) reln = smgropen(BufTagGetRelFileLocator(&bufHdr->tag), MyProcNumber); - PageSetChecksumInplace(localpage, bufHdr->tag.blockNum); + PageSetChecksum(localpage, bufHdr->tag.blockNum); io_start = pgstat_prepare_io_time(track_io_timing); diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c index de85911e3ac..2072bb1c72c 100644 --- a/src/backend/storage/page/bufpage.c +++ b/src/backend/storage/page/bufpage.c @@ -1494,51 +1494,15 @@ PageIndexTupleOverwrite(Page page, OffsetNumber offnum, /* * Set checksum for a page in shared buffers. * - * If checksums are disabled, or if the page is not initialized, just return - * the input. Otherwise, we must make a copy of the page before calculating - * the checksum, to prevent concurrent modifications (e.g. setting hint bits) - * from making the final checksum invalid. It doesn't matter if we include or - * exclude hints during the copy, as long as we write a valid page and - * associated checksum. + * If checksums are disabled, or if the page is not initialized, just + * return. Otherwise compute and set the checksum. * - * Returns a pointer to the block-sized data that needs to be written. Uses - * statically-allocated memory, so the caller must immediately write the - * returned page and not refer to it again. - */ -char * -PageSetChecksumCopy(Page page, BlockNumber blkno) -{ - static char *pageCopy = NULL; - - /* If we don't need a checksum, just return the passed-in data */ - if (PageIsNew(page) || !DataChecksumsEnabled()) - return page; - - /* - * We allocate the copy space once and use it over on each subsequent - * call. The point of palloc'ing here, rather than having a static char - * array, is first to ensure adequate alignment for the checksumming code - * and second to avoid wasting space in processes that never call this. - */ - if (pageCopy == NULL) - pageCopy = MemoryContextAllocAligned(TopMemoryContext, - BLCKSZ, - PG_IO_ALIGN_SIZE, - 0); - - memcpy(pageCopy, page, BLCKSZ); - ((PageHeader) pageCopy)->pd_checksum = pg_checksum_page(pageCopy, blkno); - return pageCopy; -} - -/* - * Set checksum for a page in private memory. - * - * This must only be used when we know that no other process can be modifying - * the page buffer. + * In the past this needed to be done on a copy of the page, due to the + * possibility of e.g. hint bits being set concurrently. However, this is not + * necessary anymore as hint bits won't be set while IO is going on. */ void -PageSetChecksumInplace(Page page, BlockNumber blkno) +PageSetChecksum(Page page, BlockNumber blkno) { /* If we don't need a checksum, just return */ if (PageIsNew(page) || !DataChecksumsEnabled()) diff --git a/src/backend/storage/smgr/bulk_write.c b/src/backend/storage/smgr/bulk_write.c index 36b28824ec8..f3c24082a69 100644 --- a/src/backend/storage/smgr/bulk_write.c +++ b/src/backend/storage/smgr/bulk_write.c @@ -279,7 +279,7 @@ smgr_bulk_flush(BulkWriteState *bulkstate) BlockNumber blkno = pending_writes[i].blkno; Page page = pending_writes[i].buf->data; - PageSetChecksumInplace(page, blkno); + PageSetChecksum(page, blkno); if (blkno >= bulkstate->relsize) { diff --git a/src/test/modules/test_aio/test_aio.c b/src/test/modules/test_aio/test_aio.c index b1aa8af9ec0..2ae4a559fab 100644 --- a/src/test/modules/test_aio/test_aio.c +++ b/src/test/modules/test_aio/test_aio.c @@ -288,7 +288,7 @@ modify_rel_block(PG_FUNCTION_ARGS) } else { - PageSetChecksumInplace(page, blkno); + PageSetChecksum(page, blkno); } smgrwrite(RelationGetSmgr(rel), -- 2.48.1.76.g4e746b1a31.dirty