commit aa7a7db17aa794fa583dc4bee6e0eacf009fa076 Author: Alexander Korotkov Date: Fri Apr 19 12:18:14 2019 +0300 Improve relation truncation algorithm Make relation truncation use ExclusiveLock instead of AccessExclusiveLock lock. In order to implement that make dropping of relation buffers two-phase. First phase happens before actual file truncation and prevents any dirty buffers past truncation point from being written. After file truncation second phase wipes out past truncation point buffers. TODO: * Tolerate past-truncation point reads, which might happen concurrently to truncation. diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index 8dc76fa8583..cb1907da578 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -1920,7 +1920,7 @@ lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats) lock_retry = 0; while (true) { - if (ConditionalLockRelation(onerel, AccessExclusiveLock)) + if (ConditionalLockRelation(onerel, ExclusiveLock)) break; /* @@ -1961,7 +1961,7 @@ lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats) * numbers alone amounts to assuming that the new pages have the * same tuple density as existing ones, which is less unlikely. */ - UnlockRelation(onerel, AccessExclusiveLock); + UnlockRelation(onerel, ExclusiveLock); return; } @@ -1976,7 +1976,7 @@ lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats) if (new_rel_pages >= old_rel_pages) { /* can't do anything after all */ - UnlockRelation(onerel, AccessExclusiveLock); + UnlockRelation(onerel, ExclusiveLock); return; } @@ -1992,7 +1992,7 @@ lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats) * that should happen as part of standard invalidation processing once * they acquire lock on the relation. */ - UnlockRelation(onerel, AccessExclusiveLock); + UnlockRelation(onerel, ExclusiveLock); /* * Update statistics. 
Here, it *is* correct to adjust rel_pages diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 887023fc8a5..7b19a84d562 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -1108,7 +1108,8 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, * happens to be trying to split the page the first one got from * StrategyGetBuffer.) */ - if (LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf), + if (!(oldFlags & BM_DIRTY_BARRIER) && + LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf), LW_SHARED)) { /* @@ -1305,6 +1306,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, * just like permanent relations. */ buf->tag = newTag; + Assert((buf_state & BM_DIRTY_BARRIER) == 0); buf_state &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT | BUF_USAGECOUNT_MASK); @@ -1483,6 +1485,7 @@ MarkBufferDirty(Buffer buffer) buf_state = old_buf_state; Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0); + Assert((buf_state & BM_DIRTY_BARRIER) == 0); buf_state |= BM_DIRTY | BM_JUST_DIRTIED; if (pg_atomic_compare_exchange_u32(&bufHdr->state, &old_buf_state, @@ -2371,6 +2374,29 @@ SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context) */ buf_state = LockBufHdr(bufHdr); + if (buf_state & BM_DIRTY_BARRIER) + { + if (skip_recently_used) + { + UnlockBufHdr(bufHdr, buf_state); + return result; + } + else + { + /* + * Can't sync buffer if it's BM_DIRTY_BARRIER. So, wait till this + * flag is cleared. + */ + while (buf_state & BM_DIRTY_BARRIER) + { + UnlockBufHdr(bufHdr, buf_state); + pg_usleep(10000L); + buf_state = LockBufHdr(bufHdr); + } + } + } + + if (BUF_STATE_GET_REFCOUNT(buf_state) == 0 && BUF_STATE_GET_USAGECOUNT(buf_state) == 0) { @@ -2859,6 +2885,204 @@ BufferGetLSNAtomic(Buffer buffer) return lsn; } +/* + * Set BM_DIRTY_BARRIER flag to the buffer. 
+ */ +static bool +SetBufferDirtyBarrier(BufferDesc *buf, uint32 buf_state) +{ + bool result = false; + + while (buf_state & BM_IO_IN_PROGRESS) + { + UnlockBufHdr(buf, buf_state); + WaitIO(buf); + buf_state = LockBufHdr(buf); + } + + if (buf_state & BM_DIRTY) + { + buf_state |= BM_DIRTY_BARRIER; + result = true; + } + + UnlockBufHdr(buf, buf_state); + + return result; +} + +/* + * Clear BM_DIRTY_BARRIER flag and also optionally BM_DIRTY and + * BM_JUST_DIRTIED flags. + */ +static void +UnsetBufferDirtyBarrier(BufferDesc *buf, bool clean_dirty) +{ + uint32 buf_state; + + buf_state = LockBufHdr(buf); + buf_state &= ~BM_DIRTY_BARRIER; + if (clean_dirty) + buf_state &= ~(BM_DIRTY | BM_JUST_DIRTIED); + UnlockBufHdr(buf, buf_state); +} + +typedef struct +{ + int *buf_ids; + int count; + int nallocated; + RelFileNodeBackend rnode; + ForkNumber forkNum; + BlockNumber firstDelBlock; +} BuffersTrucateState; + + +volatile BuffersTrucateState truncate_state = {NULL, 0, 0}; + +static void +init_truncate_state(void) +{ + truncate_state.count = 0; + truncate_state.nallocated = 16; + truncate_state.buf_ids = (int *) palloc(sizeof(int) * + truncate_state.nallocated); +} + +static void +extend_truncate_state_if_needed(void) +{ + int *tmp; + + if (truncate_state.count + 1 <= truncate_state.nallocated) + return; + + truncate_state.nallocated *= 2; + tmp = (int *) palloc(sizeof(int) * + truncate_state.nallocated); + memcpy(tmp, truncate_state.buf_ids, truncate_state.count * sizeof(int)); + truncate_state.buf_ids = tmp; +} + +static void +free_truncate_state(void) +{ + if (truncate_state.buf_ids) + pfree(truncate_state.buf_ids); + truncate_state.buf_ids = NULL; + truncate_state.count = 0; + truncate_state.nallocated = 0; +} + +/* + * Prepare for truncation of relfilenode buffers. Sets BM_DIRTY_BARRIER to + * every dirty buffer to be truncated. Requires ExclusiveLock on relation, + * so no more buffers should be dirtied. 
Therefore, after execution of this + * function no more past truncation point buffers will be written out. + */ +void +RelFileNodeBuffersTruncatePrepare(RelFileNodeBackend rnode, + ForkNumber forkNum, + BlockNumber firstDelBlock) +{ + int i; + WritebackContext wb_context; + bool barriers_overflow = false; + + /* If it's a local relation, it's localbuf.c's problem. */ + if (RelFileNodeBackendIsTemp(rnode)) + { + if (rnode.backend == MyBackendId) + DropRelFileNodeLocalBuffers(rnode.node, forkNum, firstDelBlock); + return; + } + + init_truncate_state(); + truncate_state.rnode = rnode; + truncate_state.forkNum = forkNum; + truncate_state.firstDelBlock = firstDelBlock; + WritebackContextInit(&wb_context, &checkpoint_flush_after); + + /* Look for past truncation point buffers */ + for (i = 0; i < NBuffers; i++) + { + BufferDesc *bufHdr = GetBufferDescriptor(i); + uint32 buf_state; + + if (!RelFileNodeEquals(bufHdr->tag.rnode, rnode.node)) + continue; + + buf_state = LockBufHdr(bufHdr); + if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node) && + bufHdr->tag.forkNum == forkNum && + bufHdr->tag.blockNum >= firstDelBlock) + { + extend_truncate_state_if_needed(); + + /* + * Mark dirty buffers as BM_DIRTY_BARRIER. IncDirtyBarriers() + * ensures that there are no more than NBuffers/2 barriered + * buffers. If we exceed the limit, then just write out dirty + * buffers (unlikely to happen). + */ + if (!barriers_overflow) + { + START_CRIT_SECTION(); + if (SetBufferDirtyBarrier(bufHdr, buf_state)) + { + if (IncDirtyBarriers()) + { + truncate_state.buf_ids[truncate_state.count++] = i; + } + else + { + UnsetBufferDirtyBarrier(bufHdr, false); + barriers_overflow = true; + } + } + END_CRIT_SECTION(); + } + + if (barriers_overflow) + { + UnlockBufHdr(bufHdr, buf_state); + SyncOneBuffer(i, false, &wb_context); + } + } + else + { + UnlockBufHdr(bufHdr, buf_state); + } + } + + IssuePendingWritebacks(&wb_context); +} + +/* + * Finish truncation of node buffers.
When commit == true, all the buffers + * past truncation point are removed. When commit == false, just removes + * BM_DIRTY_BARRIER flag. + */ +void +RelFileNodeBuffersTruncateFinish(bool commit) +{ + int i; + + START_CRIT_SECTION(); + for (i = 0; i < truncate_state.count; i++) + { + BufferDesc *buf = GetBufferDescriptor(truncate_state.buf_ids[i]); + UnsetBufferDirtyBarrier(buf, commit); + } + SubDirtyBarriers(truncate_state.count); + if (commit) + DropRelFileNodeBuffers(truncate_state.rnode, + truncate_state.forkNum, + truncate_state.firstDelBlock); + free_truncate_state(); + END_CRIT_SECTION(); +} + /* --------------------------------------------------------------------- * DropRelFileNodeBuffers * @@ -3173,6 +3397,8 @@ FlushRelationBuffers(Relation rel) ErrorContextCallback errcallback; Page localpage; + Assert((buf_state & BM_DIRTY_BARRIER) == 0); + localpage = (char *) LocalBufHdrGetBlock(bufHdr); /* Setup error traceback support for ereport() */ @@ -3273,6 +3499,7 @@ FlushDatabaseBuffers(Oid dbid) ReservePrivateRefCountEntry(); buf_state = LockBufHdr(bufHdr); + Assert((buf_state & BM_DIRTY_BARRIER) == 0); if (bufHdr->tag.rnode.dbNode == dbid && (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY)) { @@ -3894,6 +4121,8 @@ StartBufferIO(BufferDesc *buf, bool forInput) buf_state = LockBufHdr(buf); + Assert((buf_state & BM_DIRTY_BARRIER) == 0); + if (!(buf_state & BM_IO_IN_PROGRESS)) break; @@ -3953,6 +4182,7 @@ TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits) buf_state = LockBufHdr(buf); + Assert((buf_state & BM_DIRTY_BARRIER) == 0); Assert(buf_state & BM_IO_IN_PROGRESS); buf_state &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR); @@ -3994,6 +4224,7 @@ AbortBufferIO(void) LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE); buf_state = LockBufHdr(buf); + Assert((buf_state & BM_DIRTY_BARRIER) == 0); Assert(buf_state & BM_IO_IN_PROGRESS); if (IsForInput) { diff --git a/src/backend/storage/buffer/freelist.c 
b/src/backend/storage/buffer/freelist.c index 03caceaf7b0..bece4878506 100644 --- a/src/backend/storage/buffer/freelist.c +++ b/src/backend/storage/buffer/freelist.c @@ -58,6 +58,8 @@ typedef struct * StrategyNotifyBgWriter. */ int bgwprocno; + + pg_atomic_uint32 numDirtyBarriers; } BufferStrategyControl; /* Pointers to shared state */ @@ -300,7 +302,8 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state) */ local_buf_state = LockBufHdr(buf); if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0 - && BUF_STATE_GET_USAGECOUNT(local_buf_state) == 0) + && BUF_STATE_GET_USAGECOUNT(local_buf_state) == 0 + && (local_buf_state & BM_DIRTY_BARRIER) == 0) { if (strategy != NULL) AddBufferToRing(strategy, buf); @@ -324,7 +327,8 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state) */ local_buf_state = LockBufHdr(buf); - if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0) + if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0 && + (local_buf_state & BM_DIRTY_BARRIER) == 0) { if (BUF_STATE_GET_USAGECOUNT(local_buf_state) != 0) { @@ -519,6 +523,8 @@ StrategyInitialize(bool init) StrategyControl->completePasses = 0; pg_atomic_init_u32(&StrategyControl->numBufferAllocs, 0); + pg_atomic_init_u32(&StrategyControl->numDirtyBarriers, 0); + /* No pending notification */ StrategyControl->bgwprocno = -1; } @@ -643,7 +649,8 @@ GetBufferFromRing(BufferAccessStrategy strategy, uint32 *buf_state) buf = GetBufferDescriptor(bufnum - 1); local_buf_state = LockBufHdr(buf); if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0 - && BUF_STATE_GET_USAGECOUNT(local_buf_state) <= 1) + && BUF_STATE_GET_USAGECOUNT(local_buf_state) <= 1 + && (local_buf_state & BM_DIRTY_BARRIER) == 0) { strategy->current_was_in_ring = true; *buf_state = local_buf_state; @@ -702,3 +709,30 @@ StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf) return true; } + +bool +IncDirtyBarriers(void) +{ + uint32 num; + + num = pg_atomic_read_u32(&StrategyControl->numDirtyBarriers); + + do + { + if 
(num >= NBuffers / 2) + return false; + + if (pg_atomic_compare_exchange_u32(&StrategyControl->numDirtyBarriers, + &num, + num + 1)) + return true; + + } + while (true); +} + +void +SubDirtyBarriers(uint32 sub) +{ + (void) pg_atomic_fetch_sub_u32(&StrategyControl->numDirtyBarriers, sub); +} diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index 8191118b619..ea9739ca13c 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -645,28 +645,38 @@ smgrnblocks(SMgrRelation reln, ForkNumber forknum) void smgrtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) { - /* - * Get rid of any buffers for the about-to-be-deleted blocks. bufmgr will - * just drop them without bothering to write the contents. - */ - DropRelFileNodeBuffers(reln->smgr_rnode, forknum, nblocks); - - /* - * Send a shared-inval message to force other backends to close any smgr - * references they may have for this rel. This is useful because they - * might have open file pointers to segments that got removed, and/or - * smgr_targblock variables pointing past the new rel end. (The inval - * message will come back to our backend, too, causing a - * probably-unnecessary local smgr flush. But we don't expect that this - * is a performance-critical path.) As in the unlink code, we want to be - * sure the message is sent before we start changing things on-disk. - */ - CacheInvalidateSmgr(reln->smgr_rnode); - - /* - * Do the truncation. - */ - smgrsw[reln->smgr_which].smgr_truncate(reln, forknum, nblocks); + PG_TRY(); + { + /* + * Get rid of any buffers for the about-to-be-deleted blocks. bufmgr will + * just drop them without bothering to write the contents. + */ + RelFileNodeBuffersTruncatePrepare(reln->smgr_rnode, forknum, nblocks); + + /* + * Send a shared-inval message to force other backends to close any smgr + * references they may have for this rel. 
This is useful because they + * might have open file pointers to segments that got removed, and/or + * smgr_targblock variables pointing past the new rel end. (The inval + * message will come back to our backend, too, causing a + * probably-unnecessary local smgr flush. But we don't expect that this + * is a performance-critical path.) As in the unlink code, we want to be + * sure the message is sent before we start changing things on-disk. + */ + CacheInvalidateSmgr(reln->smgr_rnode); + + /* + * Do the truncation. + */ + smgrsw[reln->smgr_which].smgr_truncate(reln, forknum, nblocks); + } + PG_CATCH(); + { + RelFileNodeBuffersTruncateFinish(false); + PG_RE_THROW(); + } + PG_END_TRY(); + RelFileNodeBuffersTruncateFinish(true); } /* diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h index ba1b5463fc3..fb86b3fbb82 100644 --- a/src/include/storage/buf_internals.h +++ b/src/include/storage/buf_internals.h @@ -29,9 +29,9 @@ /* * Buffer state is a single 32-bit variable where following data is combined. * - * - 18 bits refcount + * - 17 bits refcount * - 4 bits usage count - * - 10 bits of flags + * - 11 bits of flags * * Combining these values allows to perform some operations without locking * the buffer header, by modifying them together with a CAS loop. @@ -39,11 +39,11 @@ * The definition of buffer state components is below. 
*/ #define BUF_REFCOUNT_ONE 1 -#define BUF_REFCOUNT_MASK ((1U << 18) - 1) -#define BUF_USAGECOUNT_MASK 0x003C0000U -#define BUF_USAGECOUNT_ONE (1U << 18) -#define BUF_USAGECOUNT_SHIFT 18 -#define BUF_FLAG_MASK 0xFFC00000U +#define BUF_REFCOUNT_MASK ((1U << 17) - 1) +#define BUF_USAGECOUNT_MASK 0x001E0000U +#define BUF_USAGECOUNT_ONE (1U << 17) +#define BUF_USAGECOUNT_SHIFT 17 +#define BUF_FLAG_MASK 0xFFE00000U /* Get refcount and usagecount from buffer state */ #define BUF_STATE_GET_REFCOUNT(state) ((state) & BUF_REFCOUNT_MASK) @@ -55,8 +55,10 @@ * Note: TAG_VALID essentially means that there is a buffer hashtable * entry associated with the buffer's tag. */ -#define BM_LOCKED (1U << 22) /* buffer header is locked */ -#define BM_DIRTY (1U << 23) /* data needs writing */ +#define BM_LOCKED (1U << 21) /* buffer header is locked */ +#define BM_DIRTY (1U << 22) /* data needs writing */ +#define BM_DIRTY_BARRIER (1U << 23) /* data writing is temporarily + * forbidden */ #define BM_VALID (1U << 24) /* data is valid */ #define BM_TAG_VALID (1U << 25) /* tag is assigned */ #define BM_IO_IN_PROGRESS (1U << 26) /* read or write in progress */ diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index c5826f691de..08f018804e6 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -190,8 +190,13 @@ extern BlockNumber RelationGetNumberOfBlocksInFork(Relation relation, extern void FlushOneBuffer(Buffer buffer); extern void FlushRelationBuffers(Relation rel); extern void FlushDatabaseBuffers(Oid dbid); -extern void DropRelFileNodeBuffers(RelFileNodeBackend rnode, - ForkNumber forkNum, BlockNumber firstDelBlock); + +extern void RelFileNodeBuffersTruncatePrepare(RelFileNodeBackend rnode, + ForkNumber forkNum, + BlockNumber firstDelBlock); +extern void RelFileNodeBuffersTruncateFinish(bool commit); +extern void DropRelFileNodeBuffers(RelFileNodeBackend rnode, ForkNumber forkNum, + BlockNumber firstDelBlock); extern void
DropRelFileNodesAllBuffers(RelFileNodeBackend *rnodes, int nnodes); extern void DropDatabaseBuffers(Oid dbid); @@ -230,6 +235,8 @@ extern void TestForOldSnapshot_impl(Snapshot snapshot, Relation relation); /* in freelist.c */ extern BufferAccessStrategy GetAccessStrategy(BufferAccessStrategyType btype); extern void FreeAccessStrategy(BufferAccessStrategy strategy); +extern bool IncDirtyBarriers(void); +extern void SubDirtyBarriers(uint32 sub); /* inline functions */