From 9a3bdf2bf08bf617432135da306e0002fdeedba3 Mon Sep 17 00:00:00 2001 From: Thomas Munro Date: Fri, 23 Nov 2018 17:13:56 +1300 Subject: [PATCH 2/2] Fix deadlock by sending without content lock, but still marked BM_DIRTY. --- src/backend/postmaster/checkpointer.c | 32 +++++++++++++++++---------- src/backend/storage/buffer/bufmgr.c | 23 +++++++++++++++++++ src/backend/storage/smgr/md.c | 19 ++++++++++++++++ src/backend/storage/smgr/smgr.c | 14 +++++++++++- src/include/storage/smgr.h | 6 ++++- 5 files changed, 80 insertions(+), 14 deletions(-) diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c index 892654dc053..65e7dde5760 100644 --- a/src/backend/postmaster/checkpointer.c +++ b/src/backend/postmaster/checkpointer.c @@ -111,7 +111,7 @@ typedef struct uint32 type; SmgrFileTag tag; bool contains_fd; - int ckpt_started; + uint64 sync_cycle; uint64 open_seq; /* might add a real request-type field later; not needed yet */ } CheckpointerRequest; @@ -175,7 +175,7 @@ static bool IsCheckpointOnSchedule(double progress); static bool ImmediateCheckpointRequested(void); static void UpdateSharedMemoryConfig(void); static void SendFsyncRequest(CheckpointerRequest *request, int fd); -static bool AbsorbFsyncRequest(bool stop_at_current_cycle); +static bool AbsorbFsyncRequest(uint64 max_cycle); /* Signal handlers */ @@ -1143,13 +1143,10 @@ ForwardFsyncRequest(const SmgrFileTag *tag, File file) request.open_seq = request.contains_fd ? FileGetOpenSeq(file) : (uint64) -1; /* - * We read ckpt_started without synchronization. It is used to prevent - * AbsorbAllFsyncRequests() from reading new values from after a - * checkpoint began. A slightly out-of-date value here will only cause - * it to do a little bit more work than strictly necessary, but that's - * OK. + * Include the current sync cycle. This is used to prevent + * AbsorbAllFsyncRequests() from consuming messages sent after it began. */ - request.ckpt_started = CheckpointerShmem->ckpt_started; + request.sync_cycle = GetCheckpointSyncCycle(); SendFsyncRequest(&request, request.contains_fd ? FileGetRawDesc(file) : -1); @@ -1198,6 +1195,8 @@ AbsorbFsyncRequests(void) void AbsorbAllFsyncRequests(void) { + uint64 max_cycle; + if (!AmCheckpointerProcess()) return; @@ -1207,12 +1206,20 @@ AbsorbAllFsyncRequests(void) BgWriterStats.m_buf_fsync_backend += pg_atomic_exchange_u32(&CheckpointerShmem->num_backend_fsync, 0); + /* + * The highest cycle number we normally expect to see is the current cycle + * number. Even though we only want to consume messages from the previous + * cycle, they may be hiding behind other messages, so we consume until + * the pipe is empty or until we see a future cycle (caused by running out + * of fds while we're in the loop). + */ + max_cycle = GetCheckpointSyncCycle(); for (;;) { if (!FlushFsyncRequestQueueIfNecessary()) elog(FATAL, "may not happen"); - if (!AbsorbFsyncRequest(true)) + if (!AbsorbFsyncRequest(max_cycle)) break; } } @@ -1220,9 +1227,11 @@ AbsorbAllFsyncRequests(void) /* * AbsorbFsyncRequest * Retrieve one queued fsync request and pass them to local smgr. + * Return false if there is nothing to absorb or we see a message + * with a sync cycle higher than max_cycle. */ static bool -AbsorbFsyncRequest(bool stop_at_current_cycle) +AbsorbFsyncRequest(uint64 max_cycle) { static CheckpointerRequest req; int fd = -1; @@ -1306,8 +1315,7 @@ AbsorbFsyncRequest(bool stop_at_current_cycle) RememberFsyncRequest(&req.tag, fd, req.open_seq); END_CRIT_SECTION(); - if (stop_at_current_cycle && - req.ckpt_started == CheckpointerShmem->ckpt_started) + if (req.sync_cycle > max_cycle) return false; return true; diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 256cc5e0217..8a73d4fb384 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -178,6 +178,7 @@ static PrivateRefCountEntry *NewPrivateRefCountEntry(Buffer buffer); static PrivateRefCountEntry *GetPrivateRefCountEntry(Buffer buffer, bool do_move); static inline int32 GetPrivateRefCount(Buffer buffer); static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref); +static void ScheduleBufferTagForFsync(const BufferTag *tag, SMgrRelation reln); /* * Ensure that the PrivateRefCountArray has sufficient space to store one more @@ -1142,6 +1143,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, FlushBuffer(buf, NULL); LWLockRelease(BufferDescriptorGetContentLock(buf)); + ScheduleBufferTagForFsync(&buf->tag, NULL); ScheduleBufferTagForWriteback(&BackendWritebackContext, &buf->tag); @@ -2401,6 +2403,7 @@ SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context) UnpinBuffer(bufHdr, true); + ScheduleBufferTagForFsync(&tag, NULL); ScheduleBufferTagForWriteback(wb_context, &tag); return result | BUF_WRITTEN; @@ -2662,6 +2665,11 @@ BufferGetTag(Buffer buffer, RelFileNode *rnode, ForkNumber *forknum, * * If the caller has an smgr reference for the buffer's relation, pass it * as the second parameter. If not, pass NULL. + * + * The caller must call ScheduleBufferTagForFsync() after releasing the + * content lock, but before clearing the BM_DIRTY flag. This ensures that a + * concurrent checkpoint will either receive the fsync request, or consider it + * dirty and flush it (again) itself. */ static void FlushBuffer(BufferDesc *buf, SMgrRelation reln) @@ -3223,6 +3231,7 @@ FlushRelationBuffers(Relation rel) FlushBuffer(bufHdr, rel->rd_smgr); LWLockRelease(BufferDescriptorGetContentLock(bufHdr)); UnpinBuffer(bufHdr, true); + ScheduleBufferTagForFsync(&bufHdr->tag, rel->rd_smgr); } else UnlockBufHdr(bufHdr, buf_state); @@ -3277,6 +3286,7 @@ FlushDatabaseBuffers(Oid dbid) FlushBuffer(bufHdr, NULL); LWLockRelease(BufferDescriptorGetContentLock(bufHdr)); UnpinBuffer(bufHdr, true); + ScheduleBufferTagForFsync(&bufHdr->tag, NULL); } else UnlockBufHdr(bufHdr, buf_state); @@ -4239,6 +4249,19 @@ WritebackContextInit(WritebackContext *context, int *max_pending) context->nr_pending = 0; } +/* + * Register a block that is dirty in the kernel page cache, for later fsync. + */ +static void +ScheduleBufferTagForFsync(const BufferTag *tag, SMgrRelation reln) +{ + /* Open if not already passed in. */ + if (reln == NULL) + reln = smgropen(tag->rnode, InvalidBackendId); + + smgrregdirtyblock(reln, tag->forkNum, tag->blockNum); +} + /* * Add buffer to list of pending writeback requests. */ diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index 344e0e12d6f..56c4d15fa60 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -967,6 +967,25 @@ mdpath(const SmgrFileTag *tag, char *out) pfree(path); } +/* + * Register the file behind a dirty block for syncing. + */ +void +mdregdirtyblock(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) +{ + MdfdVec *seg; + int segno; + + /* Find the segment. */ + segno = blocknum / RELSEG_SIZE; + if (segno >= reln->md_num_open_segs[forknum]) + elog(ERROR, "block number past end of relation"); + seg = &reln->md_seg_fds[forknum][segno]; + + /* Register it as dirty. */ + register_dirty_segment(reln, forknum, seg); +} + /* * register_dirty_segment() -- Mark a relation segment as needing fsync */ diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index c36ba4298b7..95794b3a945 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -61,6 +61,8 @@ typedef struct f_smgr BlockNumber nblocks); void (*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum); void (*smgr_path) (const SmgrFileTag *tag, char *out); + void (*smgr_regdirtyblock) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum); } f_smgr; @@ -81,7 +83,8 @@ static const f_smgr smgrsw[] = { .smgr_nblocks = mdnblocks, .smgr_truncate = mdtruncate, .smgr_immedsync = mdimmedsync, - .smgr_path = mdpath + .smgr_path = mdpath, + .smgr_regdirtyblock = mdregdirtyblock } }; @@ -768,6 +771,15 @@ smgrpath(const SmgrFileTag *tag, char *out) smgrsw[which_for_file_tag(tag)].smgr_path(tag, out); } +/* + * smgrregdirtblock() -- Register a dirty block for later fsync. + */ +void +smgrregdirtyblock(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) +{ + smgrsw[reln->smgr_which].smgr_regdirtyblock(reln, forknum, blocknum); +} + /* * AtEOXact_SMgr * diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index dc22efbe0a8..c8afb6ee1ca 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -118,7 +118,9 @@ extern BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum); extern void smgrtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks); extern void smgrimmedsync(SMgrRelation reln, ForkNumber forknum); -extern void smgrpath(const SmgrFileTag *tag, char *out); +extern void smgrpath(const SmgrFileTag *file_tag, char *out); +extern void smgrregdirtyblock(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum); extern void AtEOXact_SMgr(void); @@ -145,6 +147,8 @@ extern void mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks); extern void mdimmedsync(SMgrRelation reln, ForkNumber forknum); extern void mdpath(const SmgrFileTag *tag, char *out); +extern void mdregdirtyblock(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum); extern bool FlushFsyncRequestQueueIfNecessary(void); extern void ForgetRelationFsyncRequests(RelFileNode rnode, ForkNumber forknum); -- 2.19.1