From 57e54857cb815f479016dfbf95b796159368ed2c Mon Sep 17 00:00:00 2001 From: Dilip Kumar Date: Fri, 17 Nov 2023 14:42:25 +0530 Subject: [PATCH v7 3/3] Remove the centralized control lock and LRU counter The previous patch has divided SLRU buffer pool into associative banks. This patch is further optimizing it by introducing multiple SLRU locks instead of a common centralized lock this will reduce the contention on the slru control lock. Basically, we will have at max 128 bank locks and if the number of banks is <= 128 then each lock will cover exactly one bank otherwise they will cover multiple banks we will find the bank-to-lock mapping by (bankno % 128). This patch also removes the centralized lru counter and now we will have bank-wise lru counters that will help in frequent cache invalidation while modifying this counter. Dilip Kumar based on design inputs from Robert Haas, Andrey M. Borodin, and Alvaro Herrera --- src/backend/access/transam/clog.c | 122 ++++++++---- src/backend/access/transam/commit_ts.c | 43 ++-- src/backend/access/transam/multixact.c | 175 +++++++++++----- src/backend/access/transam/slru.c | 244 +++++++++++++++++------ src/backend/access/transam/subtrans.c | 58 ++++-- src/backend/commands/async.c | 43 ++-- src/backend/storage/lmgr/lwlock.c | 14 ++ src/backend/storage/lmgr/lwlocknames.txt | 14 +- src/backend/storage/lmgr/predicate.c | 33 +-- src/include/access/slru.h | 62 ++++-- src/include/storage/lwlock.h | 7 + src/test/modules/test_slru/test_slru.c | 32 +-- 12 files changed, 597 insertions(+), 250 deletions(-) diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c index 44008222da..18ec2a47b5 100644 --- a/src/backend/access/transam/clog.c +++ b/src/backend/access/transam/clog.c @@ -275,15 +275,20 @@ TransactionIdSetPageStatus(TransactionId xid, int nsubxids, XLogRecPtr lsn, int pageno, bool all_xact_same_page) { + LWLock *lock; + /* Can't use group update when PGPROC overflows. 
*/ StaticAssertDecl(THRESHOLD_SUBTRANS_CLOG_OPT <= PGPROC_MAX_CACHED_SUBXIDS, "group clog threshold less than PGPROC cached subxids"); + /* Get the SLRU bank lock for the page we are going to access. */ + lock = SimpleLruGetBankLock(XactCtl, pageno); + /* - * When there is contention on XactSLRULock, we try to group multiple + * When there is contention on Xact SLRU lock, we try to group multiple * updates; a single leader process will perform transaction status - * updates for multiple backends so that the number of times XactSLRULock - * needs to be acquired is reduced. + * updates for multiple backends so that the number of times the Xact SLRU + * lock needs to be acquired is reduced. * * For this optimization to be safe, the XID and subxids in MyProc must be * the same as the ones for which we're setting the status. Check that @@ -301,17 +306,17 @@ TransactionIdSetPageStatus(TransactionId xid, int nsubxids, nsubxids * sizeof(TransactionId)) == 0)) { /* - * If we can immediately acquire XactSLRULock, we update the status of + * If we can immediately acquire SLRU lock, we update the status of * our own XID and release the lock. If not, try use group XID * update. If that doesn't work out, fall back to waiting for the * lock to perform an update for this transaction only. */ - if (LWLockConditionalAcquire(XactSLRULock, LW_EXCLUSIVE)) + if (LWLockConditionalAcquire(lock, LW_EXCLUSIVE)) { /* Got the lock without waiting! Do the update. */ TransactionIdSetPageStatusInternal(xid, nsubxids, subxids, status, lsn, pageno); - LWLockRelease(XactSLRULock); + LWLockRelease(lock); return; } else if (TransactionGroupUpdateXidStatus(xid, status, lsn, pageno)) @@ -324,10 +329,10 @@ TransactionIdSetPageStatus(TransactionId xid, int nsubxids, } /* Group update not applicable, or couldn't accept this page number. 
*/ - LWLockAcquire(XactSLRULock, LW_EXCLUSIVE); + LWLockAcquire(lock, LW_EXCLUSIVE); TransactionIdSetPageStatusInternal(xid, nsubxids, subxids, status, lsn, pageno); - LWLockRelease(XactSLRULock); + LWLockRelease(lock); } /* @@ -346,7 +351,8 @@ TransactionIdSetPageStatusInternal(TransactionId xid, int nsubxids, Assert(status == TRANSACTION_STATUS_COMMITTED || status == TRANSACTION_STATUS_ABORTED || (status == TRANSACTION_STATUS_SUB_COMMITTED && !TransactionIdIsValid(xid))); - Assert(LWLockHeldByMeInMode(XactSLRULock, LW_EXCLUSIVE)); + Assert(LWLockHeldByMeInMode(SimpleLruGetBankLock(XactCtl, pageno), + LW_EXCLUSIVE)); /* * If we're doing an async commit (ie, lsn is valid), then we must wait @@ -397,14 +403,13 @@ TransactionIdSetPageStatusInternal(TransactionId xid, int nsubxids, } /* - * When we cannot immediately acquire XactSLRULock in exclusive mode at + * When we cannot immediately acquire SLRU bank lock in exclusive mode at * commit time, add ourselves to a list of processes that need their XIDs * status update. The first process to add itself to the list will acquire - * XactSLRULock in exclusive mode and set transaction status as required - * on behalf of all group members. This avoids a great deal of contention - * around XactSLRULock when many processes are trying to commit at once, - * since the lock need not be repeatedly handed off from one committing - * process to the next. + * the lock in exclusive mode and set transaction status as required on behalf + * of all group members. This avoids a great deal of contention when many + * processes are trying to commit at once, since the lock need not be + * repeatedly handed off from one committing process to the next. 
* * Returns true when transaction status has been updated in clog; returns * false if we decided against applying the optimization because the page @@ -418,6 +423,8 @@ TransactionGroupUpdateXidStatus(TransactionId xid, XidStatus status, PGPROC *proc = MyProc; uint32 nextidx; uint32 wakeidx; + int prevpageno; + LWLock *prevlock = NULL; /* We should definitely have an XID whose status needs to be updated. */ Assert(TransactionIdIsValid(xid)); @@ -498,13 +505,10 @@ TransactionGroupUpdateXidStatus(TransactionId xid, XidStatus status, return true; } - /* We are the leader. Acquire the lock on behalf of everyone. */ - LWLockAcquire(XactSLRULock, LW_EXCLUSIVE); - /* - * Now that we've got the lock, clear the list of processes waiting for - * group XID status update, saving a pointer to the head of the list. - * Trying to pop elements one at a time could lead to an ABA problem. + * We are leader so clear the list of processes waiting for group XID + * status update, saving a pointer to the head of the list. Trying to pop + * elements one at a time could lead to an ABA problem. */ nextidx = pg_atomic_exchange_u32(&procglobal->clogGroupFirst, INVALID_PGPROCNO); @@ -512,10 +516,44 @@ TransactionGroupUpdateXidStatus(TransactionId xid, XidStatus status, /* Remember head of list so we can perform wakeups after dropping lock. */ wakeidx = nextidx; + /* + * Acquire the SLRU bank lock for the first page in the group. And if + * there are multiple pages in the group which falls under different banks + * then we will release this lock and acquire the new lock before accessing + * the new page. There is rare a possibility that there may be more than + * one page in a group (for detail refer comment in above while loop) and + * that it could be from a different bank, but we are safe since we will be + * releasing the old lock before getting the new lock, so if the concurrent + * updaters lock in opposite orders, there shouldn't be any deadlocks. 
+ */ + prevpageno = ProcGlobal->allProcs[nextidx].clogGroupMemberPage; + prevlock = SimpleLruGetBankLock(XactCtl, prevpageno); + LWLockAcquire(prevlock, LW_EXCLUSIVE); + /* Walk the list and update the status of all XIDs. */ while (nextidx != INVALID_PGPROCNO) { PGPROC *nextproc = &ProcGlobal->allProcs[nextidx]; + int thispageno = nextproc->clogGroupMemberPage; + + /* + * If the SLRU bank lock for the current page is not in the same as + * that of the last page then we need to release the lock on the + * previous bank and acquire the lock on the bank for the page we + * are going to update now. + */ + if (thispageno != prevpageno) + { + LWLock *lock = SimpleLruGetBankLock(XactCtl, thispageno); + + if (prevlock != lock) + { + LWLockRelease(prevlock); + LWLockAcquire(lock, LW_EXCLUSIVE); + } + prevlock = lock; + prevpageno = thispageno; + } /* * Transactions with more than THRESHOLD_SUBTRANS_CLOG_OPT sub-XIDs @@ -535,7 +573,8 @@ TransactionGroupUpdateXidStatus(TransactionId xid, XidStatus status, } /* We're done with the lock now. */ - LWLockRelease(XactSLRULock); + if (prevlock != NULL) + LWLockRelease(prevlock); /* * Now that we've released the lock, go back and wake everybody up. We @@ -564,10 +603,11 @@ TransactionGroupUpdateXidStatus(TransactionId xid, XidStatus status, /* * Sets the commit status of a single transaction. 
* - * Must be called with XactSLRULock held + * Must be called with slot specific SLRU bank's lock held */ static void -TransactionIdSetStatusBit(TransactionId xid, XidStatus status, XLogRecPtr lsn, int slotno) +TransactionIdSetStatusBit(TransactionId xid, XidStatus status, XLogRecPtr lsn, + int slotno) { int byteno = TransactionIdToByte(xid); int bshift = TransactionIdToBIndex(xid) * CLOG_BITS_PER_XACT; @@ -656,7 +696,7 @@ TransactionIdGetStatus(TransactionId xid, XLogRecPtr *lsn) lsnindex = GetLSNIndex(slotno, xid); *lsn = XactCtl->shared->group_lsn[lsnindex]; - LWLockRelease(XactSLRULock); + LWLockRelease(SimpleLruGetBankLock(XactCtl, pageno)); return status; } @@ -690,8 +730,8 @@ CLOGShmemInit(void) { XactCtl->PagePrecedes = CLOGPagePrecedes; SimpleLruInit(XactCtl, "Xact", CLOGShmemBuffers(), CLOG_LSNS_PER_PAGE, - XactSLRULock, "pg_xact", LWTRANCHE_XACT_BUFFER, - SYNC_HANDLER_CLOG); + "pg_xact", LWTRANCHE_XACT_BUFFER, + LWTRANCHE_XACT_SLRU, SYNC_HANDLER_CLOG); SlruPagePrecedesUnitTests(XactCtl, CLOG_XACTS_PER_PAGE); } @@ -705,8 +745,9 @@ void BootStrapCLOG(void) { int slotno; + LWLock *lock = SimpleLruGetBankLock(XactCtl, 0); - LWLockAcquire(XactSLRULock, LW_EXCLUSIVE); + LWLockAcquire(lock, LW_EXCLUSIVE); /* Create and zero the first page of the commit log */ slotno = ZeroCLOGPage(0, false); @@ -715,7 +756,7 @@ BootStrapCLOG(void) SimpleLruWritePage(XactCtl, slotno); Assert(!XactCtl->shared->page_dirty[slotno]); - LWLockRelease(XactSLRULock); + LWLockRelease(lock); } /* @@ -750,14 +791,10 @@ StartupCLOG(void) TransactionId xid = XidFromFullTransactionId(ShmemVariableCache->nextXid); int pageno = TransactionIdToPage(xid); - LWLockAcquire(XactSLRULock, LW_EXCLUSIVE); - /* * Initialize our idea of the latest page number. 
*/ - XactCtl->shared->latest_page_number = pageno; - - LWLockRelease(XactSLRULock); + pg_atomic_init_u32(&XactCtl->shared->latest_page_number, pageno); } /* @@ -768,8 +805,9 @@ TrimCLOG(void) { TransactionId xid = XidFromFullTransactionId(ShmemVariableCache->nextXid); int pageno = TransactionIdToPage(xid); + LWLock *lock = SimpleLruGetBankLock(XactCtl, pageno); - LWLockAcquire(XactSLRULock, LW_EXCLUSIVE); + LWLockAcquire(lock, LW_EXCLUSIVE); /* * Zero out the remainder of the current clog page. Under normal @@ -801,7 +839,7 @@ TrimCLOG(void) XactCtl->shared->page_dirty[slotno] = true; } - LWLockRelease(XactSLRULock); + LWLockRelease(lock); } /* @@ -833,6 +871,7 @@ void ExtendCLOG(TransactionId newestXact) { int pageno; + LWLock *lock; /* * No work except at first XID of a page. But beware: just after @@ -843,13 +882,14 @@ ExtendCLOG(TransactionId newestXact) return; pageno = TransactionIdToPage(newestXact); + lock = SimpleLruGetBankLock(XactCtl, pageno); - LWLockAcquire(XactSLRULock, LW_EXCLUSIVE); + LWLockAcquire(lock, LW_EXCLUSIVE); /* Zero the page and make an XLOG entry about it */ ZeroCLOGPage(pageno, true); - LWLockRelease(XactSLRULock); + LWLockRelease(lock); } @@ -987,16 +1027,18 @@ clog_redo(XLogReaderState *record) { int pageno; int slotno; + LWLock *lock; memcpy(&pageno, XLogRecGetData(record), sizeof(int)); - LWLockAcquire(XactSLRULock, LW_EXCLUSIVE); + lock = SimpleLruGetBankLock(XactCtl, pageno); + LWLockAcquire(lock, LW_EXCLUSIVE); slotno = ZeroCLOGPage(pageno, false); SimpleLruWritePage(XactCtl, slotno); Assert(!XactCtl->shared->page_dirty[slotno]); - LWLockRelease(XactSLRULock); + LWLockRelease(lock); } else if (info == CLOG_TRUNCATE) { diff --git a/src/backend/access/transam/commit_ts.c b/src/backend/access/transam/commit_ts.c index 96810959ab..9afa3beaa7 100644 --- a/src/backend/access/transam/commit_ts.c +++ b/src/backend/access/transam/commit_ts.c @@ -219,8 +219,9 @@ SetXidCommitTsInPage(TransactionId xid, int nsubxids, { int slotno; int i; + 
LWLock *lock = SimpleLruGetBankLock(CommitTsCtl, pageno); - LWLockAcquire(CommitTsSLRULock, LW_EXCLUSIVE); + LWLockAcquire(lock, LW_EXCLUSIVE); slotno = SimpleLruReadPage(CommitTsCtl, pageno, true, xid); @@ -230,13 +231,13 @@ SetXidCommitTsInPage(TransactionId xid, int nsubxids, CommitTsCtl->shared->page_dirty[slotno] = true; - LWLockRelease(CommitTsSLRULock); + LWLockRelease(lock); } /* * Sets the commit timestamp of a single transaction. * - * Must be called with CommitTsSLRULock held + * Must be called with slot specific SLRU bank's Lock held */ static void TransactionIdSetCommitTs(TransactionId xid, TimestampTz ts, @@ -337,7 +338,7 @@ TransactionIdGetCommitTsData(TransactionId xid, TimestampTz *ts, if (nodeid) *nodeid = entry.nodeid; - LWLockRelease(CommitTsSLRULock); + LWLockRelease(SimpleLruGetBankLock(CommitTsCtl, pageno)); return *ts != 0; } @@ -527,9 +528,8 @@ CommitTsShmemInit(void) CommitTsCtl->PagePrecedes = CommitTsPagePrecedes; SimpleLruInit(CommitTsCtl, "CommitTs", CommitTsShmemBuffers(), 0, - CommitTsSLRULock, "pg_commit_ts", - LWTRANCHE_COMMITTS_BUFFER, - SYNC_HANDLER_COMMIT_TS); + "pg_commit_ts", LWTRANCHE_COMMITTS_BUFFER, + LWTRANCHE_COMMITTS_SLRU, SYNC_HANDLER_COMMIT_TS); SlruPagePrecedesUnitTests(CommitTsCtl, COMMIT_TS_XACTS_PER_PAGE); commitTsShared = ShmemInitStruct("CommitTs shared", @@ -685,9 +685,7 @@ ActivateCommitTs(void) /* * Re-Initialize our idea of the latest page number. 
*/ - LWLockAcquire(CommitTsSLRULock, LW_EXCLUSIVE); - CommitTsCtl->shared->latest_page_number = pageno; - LWLockRelease(CommitTsSLRULock); + pg_atomic_write_u32(&CommitTsCtl->shared->latest_page_number, pageno); /* * If CommitTs is enabled, but it wasn't in the previous server run, we @@ -714,12 +712,13 @@ ActivateCommitTs(void) if (!SimpleLruDoesPhysicalPageExist(CommitTsCtl, pageno)) { int slotno; + LWLock *lock = SimpleLruGetBankLock(CommitTsCtl, pageno); - LWLockAcquire(CommitTsSLRULock, LW_EXCLUSIVE); + LWLockAcquire(lock, LW_EXCLUSIVE); slotno = ZeroCommitTsPage(pageno, false); SimpleLruWritePage(CommitTsCtl, slotno); Assert(!CommitTsCtl->shared->page_dirty[slotno]); - LWLockRelease(CommitTsSLRULock); + LWLockRelease(lock); } /* Change the activation status in shared memory. */ @@ -768,9 +767,9 @@ DeactivateCommitTs(void) * be overwritten anyway when we wrap around, but it seems better to be * tidy.) */ - LWLockAcquire(CommitTsSLRULock, LW_EXCLUSIVE); + SimpleLruAcquireAllBankLock(CommitTsCtl, LW_EXCLUSIVE); (void) SlruScanDirectory(CommitTsCtl, SlruScanDirCbDeleteAll, NULL); - LWLockRelease(CommitTsSLRULock); + SimpleLruReleaseAllBankLock(CommitTsCtl); } /* @@ -802,6 +801,7 @@ void ExtendCommitTs(TransactionId newestXact) { int pageno; + LWLock *lock; /* * Nothing to do if module not enabled. 
Note we do an unlocked read of @@ -822,12 +822,14 @@ ExtendCommitTs(TransactionId newestXact) pageno = TransactionIdToCTsPage(newestXact); - LWLockAcquire(CommitTsSLRULock, LW_EXCLUSIVE); + lock = SimpleLruGetBankLock(CommitTsCtl, pageno); + + LWLockAcquire(lock, LW_EXCLUSIVE); /* Zero the page and make an XLOG entry about it */ ZeroCommitTsPage(pageno, !InRecovery); - LWLockRelease(CommitTsSLRULock); + LWLockRelease(lock); } /* @@ -981,16 +983,18 @@ commit_ts_redo(XLogReaderState *record) { int pageno; int slotno; + LWLock *lock; memcpy(&pageno, XLogRecGetData(record), sizeof(int)); + lock = SimpleLruGetBankLock(CommitTsCtl, pageno); - LWLockAcquire(CommitTsSLRULock, LW_EXCLUSIVE); + LWLockAcquire(lock, LW_EXCLUSIVE); slotno = ZeroCommitTsPage(pageno, false); SimpleLruWritePage(CommitTsCtl, slotno); Assert(!CommitTsCtl->shared->page_dirty[slotno]); - LWLockRelease(CommitTsSLRULock); + LWLockRelease(lock); } else if (info == COMMIT_TS_TRUNCATE) { @@ -1002,7 +1006,8 @@ commit_ts_redo(XLogReaderState *record) * During XLOG replay, latest_page_number isn't set up yet; insert a * suitable value to bypass the sanity test in SimpleLruTruncate. */ - CommitTsCtl->shared->latest_page_number = trunc->pageno; + pg_atomic_write_u32(&CommitTsCtl->shared->latest_page_number, + trunc->pageno); SimpleLruTruncate(CommitTsCtl, trunc->pageno); } diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c index 77511c6342..f204cb4db3 100644 --- a/src/backend/access/transam/multixact.c +++ b/src/backend/access/transam/multixact.c @@ -193,10 +193,10 @@ static SlruCtlData MultiXactMemberCtlData; /* * MultiXact state shared across all backends. All this state is protected - * by MultiXactGenLock. (We also use MultiXactOffsetSLRULock and - * MultiXactMemberSLRULock to guard accesses to the two sets of SLRU - * buffers. For concurrency's sake, we avoid holding more than one of these - * locks at a time.) + * by MultiXactGenLock. 
(We also use SLRU bank's lock of MultiXactOffset and + * MultiXactMember to guard accesses to the two sets of SLRU buffers. For + * concurrency's sake, we avoid holding more than one of these locks at a + * time.) */ typedef struct MultiXactStateData { @@ -871,12 +871,15 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, int slotno; MultiXactOffset *offptr; int i; - - LWLockAcquire(MultiXactOffsetSLRULock, LW_EXCLUSIVE); + LWLock *lock; + LWLock *prevlock = NULL; pageno = MultiXactIdToOffsetPage(multi); entryno = MultiXactIdToOffsetEntry(multi); + lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno); + LWLockAcquire(lock, LW_EXCLUSIVE); + /* * Note: we pass the MultiXactId to SimpleLruReadPage as the "transaction" * to complain about if there's any I/O error. This is kinda bogus, but @@ -892,10 +895,8 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, MultiXactOffsetCtl->shared->page_dirty[slotno] = true; - /* Exchange our lock */ - LWLockRelease(MultiXactOffsetSLRULock); - - LWLockAcquire(MultiXactMemberSLRULock, LW_EXCLUSIVE); + /* Release MultiXactOffset SLRU lock. */ + LWLockRelease(lock); prev_pageno = -1; @@ -917,6 +918,20 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, if (pageno != prev_pageno) { + /* + * MultiXactMember SLRU page is changed so check if this new page + * fall into the different SLRU bank then release the old bank's + * lock and acquire lock on the new bank. 
+ */ + lock = SimpleLruGetBankLock(MultiXactMemberCtl, pageno); + if (lock != prevlock) + { + if (prevlock != NULL) + LWLockRelease(prevlock); + + LWLockAcquire(lock, LW_EXCLUSIVE); + prevlock = lock; + } slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, multi); prev_pageno = pageno; } @@ -937,7 +952,8 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, MultiXactMemberCtl->shared->page_dirty[slotno] = true; } - LWLockRelease(MultiXactMemberSLRULock); + if (prevlock != NULL) + LWLockRelease(prevlock); } /* @@ -1240,6 +1256,8 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, MultiXactId tmpMXact; MultiXactOffset nextOffset; MultiXactMember *ptr; + LWLock *lock; + LWLock *prevlock = NULL; debug_elog3(DEBUG2, "GetMembers: asked for %u", multi); @@ -1343,11 +1361,22 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, * time on every multixact creation. */ retry: - LWLockAcquire(MultiXactOffsetSLRULock, LW_EXCLUSIVE); - pageno = MultiXactIdToOffsetPage(multi); entryno = MultiXactIdToOffsetEntry(multi); + /* + * If this page falls under a different bank, release the old bank's lock + * and acquire the lock of the new bank. + */ + lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno); + if (lock != prevlock) + { + if (prevlock != NULL) + LWLockRelease(prevlock); + LWLockAcquire(lock, LW_EXCLUSIVE); + prevlock = lock; + } + slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi); offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; offptr += entryno; @@ -1380,7 +1409,21 @@ retry: entryno = MultiXactIdToOffsetEntry(tmpMXact); if (pageno != prev_pageno) + { + /* + * Since we're going to access a different SLRU page, if this page + * falls under a different bank, release the old bank's lock and + * acquire the lock of the new bank. 
+ */ + lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno); + if (prevlock != lock) + { + LWLockRelease(prevlock); + LWLockAcquire(lock, LW_EXCLUSIVE); + prevlock = lock; + } slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, tmpMXact); + } offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; offptr += entryno; @@ -1389,7 +1432,8 @@ retry: if (nextMXOffset == 0) { /* Corner case 2: next multixact is still being filled in */ - LWLockRelease(MultiXactOffsetSLRULock); + LWLockRelease(prevlock); + prevlock = NULL; CHECK_FOR_INTERRUPTS(); pg_usleep(1000L); goto retry; @@ -1398,13 +1442,11 @@ retry: length = nextMXOffset - offset; } - LWLockRelease(MultiXactOffsetSLRULock); + LWLockRelease(prevlock); + prevlock = NULL; ptr = (MultiXactMember *) palloc(length * sizeof(MultiXactMember)); - /* Now get the members themselves. */ - LWLockAcquire(MultiXactMemberSLRULock, LW_EXCLUSIVE); - truelength = 0; prev_pageno = -1; for (i = 0; i < length; i++, offset++) @@ -1420,6 +1462,20 @@ retry: if (pageno != prev_pageno) { + /* + * Since we're going to access a different SLRU page, if this page + * falls under a different bank, release the old bank's lock and + * acquire the lock of the new bank. 
+ */ + lock = SimpleLruGetBankLock(MultiXactMemberCtl, pageno); + if (lock != prevlock) + { + if (prevlock) + LWLockRelease(prevlock); + LWLockAcquire(lock, LW_EXCLUSIVE); + prevlock = lock; + } + slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, multi); prev_pageno = pageno; } @@ -1443,7 +1499,8 @@ retry: truelength++; } - LWLockRelease(MultiXactMemberSLRULock); + if (prevlock) + LWLockRelease(prevlock); /* A multixid with zero members should not happen */ Assert(truelength > 0); @@ -1853,14 +1910,14 @@ MultiXactShmemInit(void) SimpleLruInit(MultiXactOffsetCtl, "MultiXactOffset", multixact_offsets_buffers, 0, - MultiXactOffsetSLRULock, "pg_multixact/offsets", - LWTRANCHE_MULTIXACTOFFSET_BUFFER, + "pg_multixact/offsets", LWTRANCHE_MULTIXACTOFFSET_BUFFER, + LWTRANCHE_MULTIXACTOFFSET_SLRU, SYNC_HANDLER_MULTIXACT_OFFSET); SlruPagePrecedesUnitTests(MultiXactOffsetCtl, MULTIXACT_OFFSETS_PER_PAGE); SimpleLruInit(MultiXactMemberCtl, "MultiXactMember", multixact_members_buffers, 0, - MultiXactMemberSLRULock, "pg_multixact/members", - LWTRANCHE_MULTIXACTMEMBER_BUFFER, + "pg_multixact/members", LWTRANCHE_MULTIXACTMEMBER_BUFFER, + LWTRANCHE_MULTIXACTMEMBER_SLRU, SYNC_HANDLER_MULTIXACT_MEMBER); /* doesn't call SimpleLruTruncate() or meet criteria for unit tests */ @@ -1895,8 +1952,10 @@ void BootStrapMultiXact(void) { int slotno; + LWLock *lock; - LWLockAcquire(MultiXactOffsetSLRULock, LW_EXCLUSIVE); + lock = SimpleLruGetBankLock(MultiXactOffsetCtl, 0); + LWLockAcquire(lock, LW_EXCLUSIVE); /* Create and zero the first page of the offsets log */ slotno = ZeroMultiXactOffsetPage(0, false); @@ -1905,9 +1964,10 @@ BootStrapMultiXact(void) SimpleLruWritePage(MultiXactOffsetCtl, slotno); Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]); - LWLockRelease(MultiXactOffsetSLRULock); + LWLockRelease(lock); - LWLockAcquire(MultiXactMemberSLRULock, LW_EXCLUSIVE); + lock = SimpleLruGetBankLock(MultiXactMemberCtl, 0); + LWLockAcquire(lock, LW_EXCLUSIVE); /* Create and zero 
the first page of the members log */ slotno = ZeroMultiXactMemberPage(0, false); @@ -1916,7 +1976,7 @@ BootStrapMultiXact(void) SimpleLruWritePage(MultiXactMemberCtl, slotno); Assert(!MultiXactMemberCtl->shared->page_dirty[slotno]); - LWLockRelease(MultiXactMemberSLRULock); + LWLockRelease(lock); } /* @@ -1976,10 +2036,12 @@ static void MaybeExtendOffsetSlru(void) { int pageno; + LWLock *lock; pageno = MultiXactIdToOffsetPage(MultiXactState->nextMXact); + lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno); - LWLockAcquire(MultiXactOffsetSLRULock, LW_EXCLUSIVE); + LWLockAcquire(lock, LW_EXCLUSIVE); if (!SimpleLruDoesPhysicalPageExist(MultiXactOffsetCtl, pageno)) { @@ -1994,7 +2056,7 @@ MaybeExtendOffsetSlru(void) SimpleLruWritePage(MultiXactOffsetCtl, slotno); } - LWLockRelease(MultiXactOffsetSLRULock); + LWLockRelease(lock); } /* @@ -2016,13 +2078,15 @@ StartupMultiXact(void) * Initialize offset's idea of the latest page number. */ pageno = MultiXactIdToOffsetPage(multi); - MultiXactOffsetCtl->shared->latest_page_number = pageno; + pg_atomic_init_u32(&MultiXactOffsetCtl->shared->latest_page_number, + pageno); /* * Initialize member's idea of the latest page number. */ pageno = MXOffsetToMemberPage(offset); - MultiXactMemberCtl->shared->latest_page_number = pageno; + pg_atomic_init_u32(&MultiXactMemberCtl->shared->latest_page_number, + pageno); } /* @@ -2047,13 +2111,13 @@ TrimMultiXact(void) LWLockRelease(MultiXactGenLock); /* Clean up offsets state */ - LWLockAcquire(MultiXactOffsetSLRULock, LW_EXCLUSIVE); /* * (Re-)Initialize our idea of the latest page number for offsets. */ pageno = MultiXactIdToOffsetPage(nextMXact); - MultiXactOffsetCtl->shared->latest_page_number = pageno; + pg_atomic_write_u32(&MultiXactOffsetCtl->shared->latest_page_number, + pageno); /* * Zero out the remainder of the current offsets page. 
See notes in @@ -2068,7 +2132,9 @@ TrimMultiXact(void) { int slotno; MultiXactOffset *offptr; + LWLock *lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno); + LWLockAcquire(lock, LW_EXCLUSIVE); slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, nextMXact); offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; offptr += entryno; @@ -2076,18 +2142,17 @@ TrimMultiXact(void) MemSet(offptr, 0, BLCKSZ - (entryno * sizeof(MultiXactOffset))); MultiXactOffsetCtl->shared->page_dirty[slotno] = true; + LWLockRelease(lock); } - LWLockRelease(MultiXactOffsetSLRULock); - /* And the same for members */ - LWLockAcquire(MultiXactMemberSLRULock, LW_EXCLUSIVE); /* * (Re-)Initialize our idea of the latest page number for members. */ pageno = MXOffsetToMemberPage(offset); - MultiXactMemberCtl->shared->latest_page_number = pageno; + pg_atomic_write_u32(&MultiXactMemberCtl->shared->latest_page_number, + pageno); /* * Zero out the remainder of the current members page. See notes in @@ -2099,7 +2164,9 @@ TrimMultiXact(void) int slotno; TransactionId *xidptr; int memberoff; + LWLock *lock = SimpleLruGetBankLock(MultiXactMemberCtl, pageno); + LWLockAcquire(lock, LW_EXCLUSIVE); memberoff = MXOffsetToMemberOffset(offset); slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, offset); xidptr = (TransactionId *) @@ -2114,10 +2181,9 @@ TrimMultiXact(void) */ MultiXactMemberCtl->shared->page_dirty[slotno] = true; + LWLockRelease(lock); } - LWLockRelease(MultiXactMemberSLRULock); - /* signal that we're officially up */ LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); MultiXactState->finishedStartup = true; @@ -2405,6 +2471,7 @@ static void ExtendMultiXactOffset(MultiXactId multi) { int pageno; + LWLock *lock; /* * No work except at first MultiXactId of a page. 
But beware: just after @@ -2415,13 +2482,14 @@ ExtendMultiXactOffset(MultiXactId multi) return; pageno = MultiXactIdToOffsetPage(multi); + lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno); - LWLockAcquire(MultiXactOffsetSLRULock, LW_EXCLUSIVE); + LWLockAcquire(lock, LW_EXCLUSIVE); /* Zero the page and make an XLOG entry about it */ ZeroMultiXactOffsetPage(pageno, true); - LWLockRelease(MultiXactOffsetSLRULock); + LWLockRelease(lock); } /* @@ -2454,15 +2522,17 @@ ExtendMultiXactMember(MultiXactOffset offset, int nmembers) if (flagsoff == 0 && flagsbit == 0) { int pageno; + LWLock *lock; pageno = MXOffsetToMemberPage(offset); + lock = SimpleLruGetBankLock(MultiXactMemberCtl, pageno); - LWLockAcquire(MultiXactMemberSLRULock, LW_EXCLUSIVE); + LWLockAcquire(lock, LW_EXCLUSIVE); /* Zero the page and make an XLOG entry about it */ ZeroMultiXactMemberPage(pageno, true); - LWLockRelease(MultiXactMemberSLRULock); + LWLockRelease(lock); } /* @@ -2760,7 +2830,7 @@ find_multixact_start(MultiXactId multi, MultiXactOffset *result) offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; offptr += entryno; offset = *offptr; - LWLockRelease(MultiXactOffsetSLRULock); + LWLockRelease(SimpleLruGetBankLock(MultiXactOffsetCtl, pageno)); *result = offset; return true; @@ -3242,31 +3312,33 @@ multixact_redo(XLogReaderState *record) { int pageno; int slotno; + LWLock *lock; memcpy(&pageno, XLogRecGetData(record), sizeof(int)); - - LWLockAcquire(MultiXactOffsetSLRULock, LW_EXCLUSIVE); + lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno); + LWLockAcquire(lock, LW_EXCLUSIVE); slotno = ZeroMultiXactOffsetPage(pageno, false); SimpleLruWritePage(MultiXactOffsetCtl, slotno); Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]); - LWLockRelease(MultiXactOffsetSLRULock); + LWLockRelease(lock); } else if (info == XLOG_MULTIXACT_ZERO_MEM_PAGE) { int pageno; int slotno; + LWLock *lock; memcpy(&pageno, XLogRecGetData(record), sizeof(int)); - - 
LWLockAcquire(MultiXactMemberSLRULock, LW_EXCLUSIVE); + lock = SimpleLruGetBankLock(MultiXactMemberCtl, pageno); + LWLockAcquire(lock, LW_EXCLUSIVE); slotno = ZeroMultiXactMemberPage(pageno, false); SimpleLruWritePage(MultiXactMemberCtl, slotno); Assert(!MultiXactMemberCtl->shared->page_dirty[slotno]); - LWLockRelease(MultiXactMemberSLRULock); + LWLockRelease(lock); } else if (info == XLOG_MULTIXACT_CREATE_ID) { @@ -3332,7 +3404,8 @@ multixact_redo(XLogReaderState *record) * SimpleLruTruncate. */ pageno = MultiXactIdToOffsetPage(xlrec.endTruncOff); - MultiXactOffsetCtl->shared->latest_page_number = pageno; + pg_atomic_write_u32(&MultiXactOffsetCtl->shared->latest_page_number, + pageno); PerformOffsetsTruncation(xlrec.startTruncOff, xlrec.endTruncOff); LWLockRelease(MultiXactTruncationLock); diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c index b0d90a4bd2..902f76def7 100644 --- a/src/backend/access/transam/slru.c +++ b/src/backend/access/transam/slru.c @@ -72,6 +72,21 @@ */ #define MAX_WRITEALL_BUFFERS 16 +/* + * Macro to get the index of lock for a given slotno in bank_lock array in + * SlruSharedData. + * + * Basically, the slru buffer pool is divided into banks of buffer and there is + * total SLRU_MAX_BANKLOCKS number of locks to protect access to buffer in the + * banks. Since we have max limit on the number of locks we can not always have + * one lock for each bank. So until the number of banks are + * <= SLRU_MAX_BANKLOCKS then there would be one lock protecting each bank + * otherwise one lock might protect multiple banks based on the number of + * banks. + */ +#define SLRU_SLOTNO_GET_BANKLOCKNO(slotno) \ + (((slotno) / SLRU_BANK_SIZE) % SLRU_MAX_BANKLOCKS) + typedef struct SlruWriteAllData { int num_files; /* # files actually open */ @@ -93,34 +108,6 @@ typedef struct SlruWriteAllData *SlruWriteAll; (a).segno = (xx_segno) \ ) -/* - * Macro to mark a buffer slot "most recently used". 
Note multiple evaluation - * of arguments! - * - * The reason for the if-test is that there are often many consecutive - * accesses to the same page (particularly the latest page). By suppressing - * useless increments of cur_lru_count, we reduce the probability that old - * pages' counts will "wrap around" and make them appear recently used. - * - * We allow this code to be executed concurrently by multiple processes within - * SimpleLruReadPage_ReadOnly(). As long as int reads and writes are atomic, - * this should not cause any completely-bogus values to enter the computation. - * However, it is possible for either cur_lru_count or individual - * page_lru_count entries to be "reset" to lower values than they should have, - * in case a process is delayed while it executes this macro. With care in - * SlruSelectLRUPage(), this does little harm, and in any case the absolute - * worst possible consequence is a nonoptimal choice of page to evict. The - * gain from allowing concurrent reads of SLRU pages seems worth it. 
- */ -#define SlruRecentlyUsed(shared, slotno) \ - do { \ - int new_lru_count = (shared)->cur_lru_count; \ - if (new_lru_count != (shared)->page_lru_count[slotno]) { \ - (shared)->cur_lru_count = ++new_lru_count; \ - (shared)->page_lru_count[slotno] = new_lru_count; \ - } \ - } while (0) - /* Saved info for SlruReportIOError */ typedef enum { @@ -147,6 +134,7 @@ static int SlruSelectLRUPage(SlruCtl ctl, int pageno); static bool SlruScanDirCbDeleteCutoff(SlruCtl ctl, char *filename, int segpage, void *data); static void SlruInternalDeleteSegment(SlruCtl ctl, int segno); +static inline void SlruRecentlyUsed(SlruShared shared, int slotno); /* * Initialization of shared memory @@ -156,6 +144,8 @@ Size SimpleLruShmemSize(int nslots, int nlsns) { Size sz; + int nbanks = nslots / SLRU_BANK_SIZE; + int nbanklocks = Min(nbanks, SLRU_MAX_BANKLOCKS); /* we assume nslots isn't so large as to risk overflow */ sz = MAXALIGN(sizeof(SlruSharedData)); @@ -165,6 +155,8 @@ SimpleLruShmemSize(int nslots, int nlsns) sz += MAXALIGN(nslots * sizeof(int)); /* page_number[] */ sz += MAXALIGN(nslots * sizeof(int)); /* page_lru_count[] */ sz += MAXALIGN(nslots * sizeof(LWLockPadded)); /* buffer_locks[] */ + sz += MAXALIGN(nbanklocks * sizeof(LWLockPadded)); /* bank_locks[] */ + sz += MAXALIGN(nbanks * sizeof(int)); /* bank_cur_lru_count[] */ if (nlsns > 0) sz += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr)); /* group_lsn[] */ @@ -181,16 +173,19 @@ SimpleLruShmemSize(int nslots, int nlsns) * nlsns: number of LSN groups per page (set to zero if not relevant). * ctllock: LWLock to use to control access to the shared control structure. * subdir: PGDATA-relative subdirectory that will contain the files. - * tranche_id: LWLock tranche ID to use for the SLRU's per-buffer LWLocks. + * buffer_tranche_id: tranche ID to use for the SLRU's per-buffer LWLocks. + * bank_tranche_id: tranche ID to use for the bank LWLocks. 
* sync_handler: which set of functions to use to handle sync requests */ void SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns, - LWLock *ctllock, const char *subdir, int tranche_id, + const char *subdir, int buffer_tranche_id, int bank_tranche_id, SyncRequestHandler sync_handler) { SlruShared shared; bool found; + int nbanks = nslots / SLRU_BANK_SIZE; + int nbanklocks = Min(nbanks, SLRU_MAX_BANKLOCKS); shared = (SlruShared) ShmemInitStruct(name, SimpleLruShmemSize(nslots, nlsns), @@ -202,18 +197,16 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns, char *ptr; Size offset; int slotno; + int bankno; + int banklockno; Assert(!found); memset(shared, 0, sizeof(SlruSharedData)); - shared->ControlLock = ctllock; - shared->num_slots = nslots; shared->lsn_groups_per_page = nlsns; - shared->cur_lru_count = 0; - /* shared->latest_page_number will be set later */ shared->slru_stats_idx = pgstat_get_slru_index(name); @@ -234,6 +227,10 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns, /* Initialize LWLocks */ shared->buffer_locks = (LWLockPadded *) (ptr + offset); offset += MAXALIGN(nslots * sizeof(LWLockPadded)); + shared->bank_locks = (LWLockPadded *) (ptr + offset); + offset += MAXALIGN(nbanklocks * sizeof(LWLockPadded)); + shared->bank_cur_lru_count = (int *) (ptr + offset); + offset += MAXALIGN(nbanks * sizeof(int)); if (nlsns > 0) { @@ -245,7 +242,7 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns, for (slotno = 0; slotno < nslots; slotno++) { LWLockInitialize(&shared->buffer_locks[slotno].lock, - tranche_id); + buffer_tranche_id); shared->page_buffer[slotno] = ptr; shared->page_status[slotno] = SLRU_PAGE_EMPTY; @@ -254,6 +251,15 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns, ptr += BLCKSZ; } + /* Initialize the bank locks. 
*/ + for (banklockno = 0; banklockno < nbanklocks; banklockno++) + LWLockInitialize(&shared->bank_locks[banklockno].lock, + bank_tranche_id); + + /* Initialize the bank lru counters. */ + for (bankno = 0; bankno < nbanks; bankno++) + shared->bank_cur_lru_count[bankno] = 0; + /* Should fit to estimated shmem size */ Assert(ptr - (char *) shared <= SimpleLruShmemSize(nslots, nlsns)); } @@ -307,7 +313,7 @@ SimpleLruZeroPage(SlruCtl ctl, int pageno) SimpleLruZeroLSNs(ctl, slotno); /* Assume this page is now the latest active page */ - shared->latest_page_number = pageno; + pg_atomic_write_u32(&shared->latest_page_number, pageno); /* update the stats counter of zeroed pages */ pgstat_count_slru_page_zeroed(shared->slru_stats_idx); @@ -346,12 +352,13 @@ static void SimpleLruWaitIO(SlruCtl ctl, int slotno) { SlruShared shared = ctl->shared; + int banklockno = SLRU_SLOTNO_GET_BANKLOCKNO(slotno); /* See notes at top of file */ - LWLockRelease(shared->ControlLock); + LWLockRelease(&shared->bank_locks[banklockno].lock); LWLockAcquire(&shared->buffer_locks[slotno].lock, LW_SHARED); LWLockRelease(&shared->buffer_locks[slotno].lock); - LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE); + LWLockAcquire(&shared->bank_locks[banklockno].lock, LW_EXCLUSIVE); /* * If the slot is still in an io-in-progress state, then either someone @@ -402,10 +409,14 @@ SimpleLruReadPage(SlruCtl ctl, int pageno, bool write_ok, { SlruShared shared = ctl->shared; + /* Caller must hold the bank lock for the input page. 
*/ + Assert(LWLockHeldByMe(SimpleLruGetBankLock(ctl, pageno))); + /* Outer loop handles restart if we must wait for someone else's I/O */ for (;;) { int slotno; + int banklockno; bool ok; /* See if page already is in memory; if not, pick victim slot */ @@ -448,9 +459,10 @@ SimpleLruReadPage(SlruCtl ctl, int pageno, bool write_ok, /* Acquire per-buffer lock (cannot deadlock, see notes at top) */ LWLockAcquire(&shared->buffer_locks[slotno].lock, LW_EXCLUSIVE); + banklockno = SLRU_SLOTNO_GET_BANKLOCKNO(slotno); /* Release control lock while doing I/O */ - LWLockRelease(shared->ControlLock); + LWLockRelease(&shared->bank_locks[banklockno].lock); /* Do the read */ ok = SlruPhysicalReadPage(ctl, pageno, slotno); @@ -459,7 +471,7 @@ SimpleLruReadPage(SlruCtl ctl, int pageno, bool write_ok, SimpleLruZeroLSNs(ctl, slotno); /* Re-acquire control lock and update page state */ - LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE); + LWLockAcquire(&shared->bank_locks[banklockno].lock, LW_EXCLUSIVE); Assert(shared->page_number[slotno] == pageno && shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS && @@ -503,9 +515,10 @@ SimpleLruReadPage_ReadOnly(SlruCtl ctl, int pageno, TransactionId xid) int slotno; int bankstart = (pageno & ctl->bank_mask) * SLRU_BANK_SIZE; int bankend = bankstart + SLRU_BANK_SIZE; + int banklockno = SLRU_SLOTNO_GET_BANKLOCKNO(bankstart); /* Try to find the page while holding only shared lock */ - LWLockAcquire(shared->ControlLock, LW_SHARED); + LWLockAcquire(&shared->bank_locks[banklockno].lock, LW_SHARED); /* * See if the page is already in a buffer pool. 
The buffer pool is @@ -529,8 +542,8 @@ SimpleLruReadPage_ReadOnly(SlruCtl ctl, int pageno, TransactionId xid) } /* No luck, so switch to normal exclusive lock and do regular read */ - LWLockRelease(shared->ControlLock); - LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE); + LWLockRelease(&shared->bank_locks[banklockno].lock); + LWLockAcquire(&shared->bank_locks[banklockno].lock, LW_EXCLUSIVE); return SimpleLruReadPage(ctl, pageno, true, xid); } @@ -552,6 +565,7 @@ SlruInternalWritePage(SlruCtl ctl, int slotno, SlruWriteAll fdata) SlruShared shared = ctl->shared; int pageno = shared->page_number[slotno]; bool ok; + int banklockno = SLRU_SLOTNO_GET_BANKLOCKNO(slotno); /* If a write is in progress, wait for it to finish */ while (shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS && @@ -580,7 +594,7 @@ SlruInternalWritePage(SlruCtl ctl, int slotno, SlruWriteAll fdata) LWLockAcquire(&shared->buffer_locks[slotno].lock, LW_EXCLUSIVE); /* Release control lock while doing I/O */ - LWLockRelease(shared->ControlLock); + LWLockRelease(&shared->bank_locks[banklockno].lock); /* Do the write */ ok = SlruPhysicalWritePage(ctl, pageno, slotno, fdata); @@ -595,7 +609,7 @@ SlruInternalWritePage(SlruCtl ctl, int slotno, SlruWriteAll fdata) } /* Re-acquire control lock and update page state */ - LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE); + LWLockAcquire(&shared->bank_locks[banklockno].lock, LW_EXCLUSIVE); Assert(shared->page_number[slotno] == pageno && shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS); @@ -1039,13 +1053,14 @@ SlruSelectLRUPage(SlruCtl ctl, int pageno) int bestinvalidslot = 0; /* keep compiler quiet */ int best_invalid_delta = -1; int best_invalid_page_number = 0; /* keep compiler quiet */ - int bankstart = (pageno & ctl->bank_mask) * SLRU_BANK_SIZE; + int bankno = pageno & ctl->bank_mask; + int bankstart = bankno * SLRU_BANK_SIZE; int bankend = bankstart + SLRU_BANK_SIZE; /* * See if the page is already in a buffer pool. 
The buffer pool is - * divided into banks of buffers and each pageno may reside only in one - * bank so limit the search within the bank. + * divided into banks of buffers and each pageno may reside only in + * one bank so limit the search within the bank. */ for (slotno = bankstart; slotno < bankend; slotno++) { @@ -1081,7 +1096,7 @@ SlruSelectLRUPage(SlruCtl ctl, int pageno) * That gets us back on the path to having good data when there are * multiple pages with the same lru_count. */ - cur_count = (shared->cur_lru_count)++; + cur_count = (shared->bank_cur_lru_count[bankno])++; for (slotno = bankstart; slotno < bankend; slotno++) { int this_delta; @@ -1103,7 +1118,7 @@ SlruSelectLRUPage(SlruCtl ctl, int pageno) this_delta = 0; } this_page_number = shared->page_number[slotno]; - if (this_page_number == shared->latest_page_number) + if (this_page_number == pg_atomic_read_u32(&shared->latest_page_number)) continue; if (shared->page_status[slotno] == SLRU_PAGE_VALID) { @@ -1177,6 +1192,7 @@ SimpleLruWriteAll(SlruCtl ctl, bool allow_redirtied) int slotno; int pageno = 0; int i; + int prevlockno = SLRU_SLOTNO_GET_BANKLOCKNO(0); bool ok; /* update the stats counter of flushes */ @@ -1187,10 +1203,23 @@ SimpleLruWriteAll(SlruCtl ctl, bool allow_redirtied) */ fdata.num_files = 0; - LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE); + LWLockAcquire(&shared->bank_locks[prevlockno].lock, LW_EXCLUSIVE); for (slotno = 0; slotno < shared->num_slots; slotno++) { + int curlockno = SLRU_SLOTNO_GET_BANKLOCKNO(slotno); + + /* + * If the current bank lock is not same as the previous bank lock then + * release the previous lock and acquire the new lock. 
+ */ + if (curlockno != prevlockno) + { + LWLockRelease(&shared->bank_locks[prevlockno].lock); + LWLockAcquire(&shared->bank_locks[curlockno].lock, LW_EXCLUSIVE); + prevlockno = curlockno; + } + SlruInternalWritePage(ctl, slotno, &fdata); /* @@ -1204,7 +1233,7 @@ SimpleLruWriteAll(SlruCtl ctl, bool allow_redirtied) !shared->page_dirty[slotno])); } - LWLockRelease(shared->ControlLock); + LWLockRelease(&shared->bank_locks[prevlockno].lock); /* * Now close any files that were open @@ -1244,6 +1273,7 @@ SimpleLruTruncate(SlruCtl ctl, int cutoffPage) { SlruShared shared = ctl->shared; int slotno; + int prevlockno; /* update the stats counter of truncates */ pgstat_count_slru_truncate(shared->slru_stats_idx); @@ -1254,25 +1284,38 @@ SimpleLruTruncate(SlruCtl ctl, int cutoffPage) * or just after a checkpoint, any dirty pages should have been flushed * already ... we're just being extra careful here.) */ - LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE); - restart: /* * While we are holding the lock, make an important safety check: the * current endpoint page must not be eligible for removal. */ - if (ctl->PagePrecedes(shared->latest_page_number, cutoffPage)) + if (ctl->PagePrecedes(pg_atomic_read_u32(&shared->latest_page_number), + cutoffPage)) { - LWLockRelease(shared->ControlLock); ereport(LOG, (errmsg("could not truncate directory \"%s\": apparent wraparound", ctl->Dir))); return; } + prevlockno = SLRU_SLOTNO_GET_BANKLOCKNO(0); + LWLockAcquire(&shared->bank_locks[prevlockno].lock, LW_EXCLUSIVE); for (slotno = 0; slotno < shared->num_slots; slotno++) { + int curlockno = SLRU_SLOTNO_GET_BANKLOCKNO(slotno); + + /* + * If the current bank lock is not same as the previous bank lock then + * release the previous lock and acquire the new lock. 
+ */ + if (curlockno != prevlockno) + { + LWLockRelease(&shared->bank_locks[prevlockno].lock); + LWLockAcquire(&shared->bank_locks[curlockno].lock, LW_EXCLUSIVE); + prevlockno = curlockno; + } + if (shared->page_status[slotno] == SLRU_PAGE_EMPTY) continue; if (!ctl->PagePrecedes(shared->page_number[slotno], cutoffPage)) @@ -1302,10 +1345,12 @@ restart: SlruInternalWritePage(ctl, slotno, NULL); else SimpleLruWaitIO(ctl, slotno); + + LWLockRelease(&shared->bank_locks[prevlockno].lock); goto restart; } - LWLockRelease(shared->ControlLock); + LWLockRelease(&shared->bank_locks[prevlockno].lock); /* Now we can remove the old segment(s) */ (void) SlruScanDirectory(ctl, SlruScanDirCbDeleteCutoff, &cutoffPage); @@ -1346,15 +1391,29 @@ SlruDeleteSegment(SlruCtl ctl, int segno) SlruShared shared = ctl->shared; int slotno; bool did_write; + int prevlockno = SLRU_SLOTNO_GET_BANKLOCKNO(0); /* Clean out any possibly existing references to the segment. */ - LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE); + LWLockAcquire(&shared->bank_locks[prevlockno].lock, LW_EXCLUSIVE); restart: did_write = false; for (slotno = 0; slotno < shared->num_slots; slotno++) { - int pagesegno = shared->page_number[slotno] / SLRU_PAGES_PER_SEGMENT; + int pagesegno; + int curlockno = SLRU_SLOTNO_GET_BANKLOCKNO(slotno); + /* + * If the current bank lock is not same as the previous bank lock then + * release the previous lock and acquire the new lock. 
+ */ + if (curlockno != prevlockno) + { + LWLockRelease(&shared->bank_locks[prevlockno].lock); + LWLockAcquire(&shared->bank_locks[curlockno].lock, LW_EXCLUSIVE); + prevlockno = curlockno; + } + + pagesegno = shared->page_number[slotno] / SLRU_PAGES_PER_SEGMENT; if (shared->page_status[slotno] == SLRU_PAGE_EMPTY) continue; @@ -1388,7 +1447,7 @@ restart: SlruInternalDeleteSegment(ctl, segno); - LWLockRelease(shared->ControlLock); + LWLockRelease(&shared->bank_locks[prevlockno].lock); } /* @@ -1630,6 +1689,37 @@ SlruSyncFileTag(SlruCtl ctl, const FileTag *ftag, char *path) return result; } +/* + * Function to mark a buffer slot "most recently used". + * + * The reason for the if-test is that there are often many consecutive + * accesses to the same page (particularly the latest page). By suppressing + * useless increments of bank_cur_lru_count, we reduce the probability that old + * pages' counts will "wrap around" and make them appear recently used. + * + * We allow this code to be executed concurrently by multiple processes within + * SimpleLruReadPage_ReadOnly(). As long as int reads and writes are atomic, + * this should not cause any completely-bogus values to enter the computation. + * However, it is possible for either bank_cur_lru_count or individual + * page_lru_count entries to be "reset" to lower values than they should have, + * in case a process is delayed while it executes this function. With care in + * SlruSelectLRUPage(), this does little harm, and in any case the absolute + * worst possible consequence is a nonoptimal choice of page to evict. The + * gain from allowing concurrent reads of SLRU pages seems worth it. 
+ */ +static inline void +SlruRecentlyUsed(SlruShared shared, int slotno) +{ + int bankno = slotno / SLRU_BANK_SIZE; + int new_lru_count = shared->bank_cur_lru_count[bankno]; + + if (new_lru_count != shared->page_lru_count[slotno]) + { + shared->bank_cur_lru_count[bankno] = ++new_lru_count; + shared->page_lru_count[slotno] = new_lru_count; + } +} + /* * Helper function for GUC check_hook to check whether slru buffers are in * multiples of SLRU_BANK_SIZE. @@ -1646,3 +1736,37 @@ check_slru_buffers(const char *name, int *newval) SLRU_BANK_SIZE); return false; } + +/* + * Function to acquire all bank's lock of the given SlruCtl + */ +void +SimpleLruAcquireAllBankLock(SlruCtl ctl, LWLockMode mode) +{ + SlruShared shared = ctl->shared; + int banklockno; + int nbanklocks; + + /* Compute number of bank locks. */ + nbanklocks = Min(shared->num_slots / SLRU_BANK_SIZE, SLRU_MAX_BANKLOCKS); + + for (banklockno = 0; banklockno < nbanklocks; banklockno++) + LWLockAcquire(&shared->bank_locks[banklockno].lock, mode); +} + +/* + * Function to release all bank's lock of the given SlruCtl + */ +void +SimpleLruReleaseAllBankLock(SlruCtl ctl) +{ + SlruShared shared = ctl->shared; + int banklockno; + int nbanklocks; + + /* Compute number of bank locks. 
*/ + nbanklocks = Min(shared->num_slots / SLRU_BANK_SIZE, SLRU_MAX_BANKLOCKS); + + for (banklockno = 0; banklockno < nbanklocks; banklockno++) + LWLockRelease(&shared->bank_locks[banklockno].lock); +} diff --git a/src/backend/access/transam/subtrans.c b/src/backend/access/transam/subtrans.c index 923e706535..44c3969650 100644 --- a/src/backend/access/transam/subtrans.c +++ b/src/backend/access/transam/subtrans.c @@ -78,12 +78,14 @@ SubTransSetParent(TransactionId xid, TransactionId parent) int pageno = TransactionIdToPage(xid); int entryno = TransactionIdToEntry(xid); int slotno; + LWLock *lock; TransactionId *ptr; Assert(TransactionIdIsValid(parent)); Assert(TransactionIdFollows(xid, parent)); - LWLockAcquire(SubtransSLRULock, LW_EXCLUSIVE); + lock = SimpleLruGetBankLock(SubTransCtl, pageno); + LWLockAcquire(lock, LW_EXCLUSIVE); slotno = SimpleLruReadPage(SubTransCtl, pageno, true, xid); ptr = (TransactionId *) SubTransCtl->shared->page_buffer[slotno]; @@ -101,7 +103,7 @@ SubTransSetParent(TransactionId xid, TransactionId parent) SubTransCtl->shared->page_dirty[slotno] = true; } - LWLockRelease(SubtransSLRULock); + LWLockRelease(lock); } /* @@ -131,7 +133,7 @@ SubTransGetParent(TransactionId xid) parent = *ptr; - LWLockRelease(SubtransSLRULock); + LWLockRelease(SimpleLruGetBankLock(SubTransCtl, pageno)); return parent; } @@ -194,8 +196,9 @@ SUBTRANSShmemInit(void) { SubTransCtl->PagePrecedes = SubTransPagePrecedes; SimpleLruInit(SubTransCtl, "Subtrans", subtrans_buffers, 0, - SubtransSLRULock, "pg_subtrans", - LWTRANCHE_SUBTRANS_BUFFER, SYNC_HANDLER_NONE); + "pg_subtrans", LWTRANCHE_SUBTRANS_BUFFER, + LWTRANCHE_SUBTRANS_SLRU, + SYNC_HANDLER_NONE); SlruPagePrecedesUnitTests(SubTransCtl, SUBTRANS_XACTS_PER_PAGE); } @@ -213,8 +216,9 @@ void BootStrapSUBTRANS(void) { int slotno; + LWLock *lock = SimpleLruGetBankLock(SubTransCtl, 0); - LWLockAcquire(SubtransSLRULock, LW_EXCLUSIVE); + LWLockAcquire(lock, LW_EXCLUSIVE); /* Create and zero the first page of the subtrans 
log */ slotno = ZeroSUBTRANSPage(0); @@ -223,7 +227,7 @@ BootStrapSUBTRANS(void) SimpleLruWritePage(SubTransCtl, slotno); Assert(!SubTransCtl->shared->page_dirty[slotno]); - LWLockRelease(SubtransSLRULock); + LWLockRelease(lock); } /* @@ -253,6 +257,8 @@ StartupSUBTRANS(TransactionId oldestActiveXID) FullTransactionId nextXid; int startPage; int endPage; + LWLock *prevlock; + LWLock *lock; /* * Since we don't expect pg_subtrans to be valid across crashes, we @@ -260,23 +266,47 @@ StartupSUBTRANS(TransactionId oldestActiveXID) * Whenever we advance into a new page, ExtendSUBTRANS will likewise zero * the new page without regard to whatever was previously on disk. */ - LWLockAcquire(SubtransSLRULock, LW_EXCLUSIVE); - startPage = TransactionIdToPage(oldestActiveXID); nextXid = ShmemVariableCache->nextXid; endPage = TransactionIdToPage(XidFromFullTransactionId(nextXid)); + prevlock = SimpleLruGetBankLock(SubTransCtl, startPage); + LWLockAcquire(prevlock, LW_EXCLUSIVE); while (startPage != endPage) { + lock = SimpleLruGetBankLock(SubTransCtl, startPage); + + /* + * Check if we need to acquire the lock on the new bank then release + * the lock on the old bank and acquire on the new bank. + */ + if (prevlock != lock) + { + LWLockRelease(prevlock); + LWLockAcquire(lock, LW_EXCLUSIVE); + prevlock = lock; + } + (void) ZeroSUBTRANSPage(startPage); startPage++; /* must account for wraparound */ if (startPage > TransactionIdToPage(MaxTransactionId)) startPage = 0; } - (void) ZeroSUBTRANSPage(startPage); - LWLockRelease(SubtransSLRULock); + lock = SimpleLruGetBankLock(SubTransCtl, startPage); + + /* + * Check if we need to acquire the lock on the new bank then release the + * lock on the old bank and acquire on the new bank. 
+ */ + if (prevlock != lock) + { + LWLockRelease(prevlock); + LWLockAcquire(lock, LW_EXCLUSIVE); + } + (void) ZeroSUBTRANSPage(startPage); + LWLockRelease(lock); } /* @@ -310,6 +340,7 @@ void ExtendSUBTRANS(TransactionId newestXact) { int pageno; + LWLock *lock; /* * No work except at first XID of a page. But beware: just after @@ -321,12 +352,13 @@ ExtendSUBTRANS(TransactionId newestXact) pageno = TransactionIdToPage(newestXact); - LWLockAcquire(SubtransSLRULock, LW_EXCLUSIVE); + lock = SimpleLruGetBankLock(SubTransCtl, pageno); + LWLockAcquire(lock, LW_EXCLUSIVE); /* Zero the page */ ZeroSUBTRANSPage(pageno); - LWLockRelease(SubtransSLRULock); + LWLockRelease(lock); } diff --git a/src/backend/commands/async.c b/src/backend/commands/async.c index 98449cbdde..4b5bb0ed16 100644 --- a/src/backend/commands/async.c +++ b/src/backend/commands/async.c @@ -268,9 +268,10 @@ typedef struct QueueBackendStatus * both NotifyQueueLock and NotifyQueueTailLock in EXCLUSIVE mode, backends * can change the tail pointers. * - * NotifySLRULock is used as the control lock for the pg_notify SLRU buffers. + * SLRU buffer pool is divided in banks and bank wise SLRU lock is used as + * the control lock for the pg_notify SLRU buffers. * In order to avoid deadlocks, whenever we need multiple locks, we first get - * NotifyQueueTailLock, then NotifyQueueLock, and lastly NotifySLRULock. + * NotifyQueueTailLock, then NotifyQueueLock, and lastly SLRU bank lock. * * Each backend uses the backend[] array entry with index equal to its * BackendId (which can range from 1 to MaxBackends). 
We rely on this to make @@ -571,7 +572,7 @@ AsyncShmemInit(void) */ NotifyCtl->PagePrecedes = asyncQueuePagePrecedes; SimpleLruInit(NotifyCtl, "Notify", notify_buffers, 0, - NotifySLRULock, "pg_notify", LWTRANCHE_NOTIFY_BUFFER, + "pg_notify", LWTRANCHE_NOTIFY_BUFFER, LWTRANCHE_NOTIFY_SLRU, SYNC_HANDLER_NONE); if (!found) @@ -1403,7 +1404,7 @@ asyncQueueNotificationToEntry(Notification *n, AsyncQueueEntry *qe) * Eventually we will return NULL indicating all is done. * * We are holding NotifyQueueLock already from the caller and grab - * NotifySLRULock locally in this function. + * page specific SLRU bank lock locally in this function. */ static ListCell * asyncQueueAddEntries(ListCell *nextNotify) @@ -1413,9 +1414,7 @@ asyncQueueAddEntries(ListCell *nextNotify) int pageno; int offset; int slotno; - - /* We hold both NotifyQueueLock and NotifySLRULock during this operation */ - LWLockAcquire(NotifySLRULock, LW_EXCLUSIVE); + LWLock *prevlock; /* * We work with a local copy of QUEUE_HEAD, which we write back to shared @@ -1439,6 +1438,11 @@ asyncQueueAddEntries(ListCell *nextNotify) * wrapped around, but re-zeroing the page is harmless in that case.) */ pageno = QUEUE_POS_PAGE(queue_head); + prevlock = SimpleLruGetBankLock(NotifyCtl, pageno); + + /* We hold both NotifyQueueLock and SLRU bank lock during this operation */ + LWLockAcquire(prevlock, LW_EXCLUSIVE); + if (QUEUE_POS_IS_ZERO(queue_head)) slotno = SimpleLruZeroPage(NotifyCtl, pageno); else @@ -1484,6 +1488,17 @@ asyncQueueAddEntries(ListCell *nextNotify) /* Advance queue_head appropriately, and detect if page is full */ if (asyncQueueAdvance(&(queue_head), qe.length)) { + LWLock *lock; + + pageno = QUEUE_POS_PAGE(queue_head); + lock = SimpleLruGetBankLock(NotifyCtl, pageno); + if (lock != prevlock) + { + LWLockRelease(prevlock); + LWLockAcquire(lock, LW_EXCLUSIVE); + prevlock = lock; + } + /* * Page is full, so we're done here, but first fill the next page * with zeroes. 
The reason to do this is to ensure that slru.c's @@ -1510,7 +1525,7 @@ asyncQueueAddEntries(ListCell *nextNotify) /* Success, so update the global QUEUE_HEAD */ QUEUE_HEAD = queue_head; - LWLockRelease(NotifySLRULock); + LWLockRelease(prevlock); return nextNotify; } @@ -1989,9 +2004,9 @@ asyncQueueReadAllNotifications(void) /* * We copy the data from SLRU into a local buffer, so as to avoid - * holding the NotifySLRULock while we are examining the entries - * and possibly transmitting them to our frontend. Copy only the - * part of the page we will actually inspect. + * holding the SLRU lock while we are examining the entries and + * possibly transmitting them to our frontend. Copy only the part + * of the page we will actually inspect. */ slotno = SimpleLruReadPage_ReadOnly(NotifyCtl, curpage, InvalidTransactionId); @@ -2011,7 +2026,7 @@ asyncQueueReadAllNotifications(void) NotifyCtl->shared->page_buffer[slotno] + curoffset, copysize); /* Release lock that we got from SimpleLruReadPage_ReadOnly() */ - LWLockRelease(NotifySLRULock); + LWLockRelease(SimpleLruGetBankLock(NotifyCtl, curpage)); /* * Process messages up to the stop position, end of page, or an @@ -2052,7 +2067,7 @@ asyncQueueReadAllNotifications(void) * * The current page must have been fetched into page_buffer from shared * memory. (We could access the page right in shared memory, but that - * would imply holding the NotifySLRULock throughout this routine.) + * would imply holding the SLRU bank lock throughout this routine.) * * We stop if we reach the "stop" position, or reach a notification from an * uncommitted transaction, or reach the end of the page. @@ -2205,7 +2220,7 @@ asyncQueueAdvanceTail(void) if (asyncQueuePagePrecedes(oldtailpage, boundary)) { /* - * SimpleLruTruncate() will ask for NotifySLRULock but will also + * SimpleLruTruncate() will ask for SLRU bank locks but will also * release the lock again. 
*/ SimpleLruTruncate(NotifyCtl, newtailpage); diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c index 315a78cda9..1261af0548 100644 --- a/src/backend/storage/lmgr/lwlock.c +++ b/src/backend/storage/lmgr/lwlock.c @@ -190,6 +190,20 @@ static const char *const BuiltinTrancheNames[] = { "LogicalRepLauncherDSA", /* LWTRANCHE_LAUNCHER_HASH: */ "LogicalRepLauncherHash", + /* LWTRANCHE_XACT_SLRU: */ + "XactSLRU", + /* LWTRANCHE_COMMITTS_SLRU: */ + "CommitTSSLRU", + /* LWTRANCHE_SUBTRANS_SLRU: */ + "SubtransSLRU", + /* LWTRANCHE_MULTIXACTOFFSET_SLRU: */ + "MultixactOffsetSLRU", + /* LWTRANCHE_MULTIXACTMEMBER_SLRU: */ + "MultixactMemberSLRU", + /* LWTRANCHE_NOTIFY_SLRU: */ + "NotifySLRU", + /* LWTRANCHE_SERIAL_SLRU: */ + "SerialSLRU" }; StaticAssertDecl(lengthof(BuiltinTrancheNames) == diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt index f72f2906ce..9e66ecd1ed 100644 --- a/src/backend/storage/lmgr/lwlocknames.txt +++ b/src/backend/storage/lmgr/lwlocknames.txt @@ -16,11 +16,11 @@ WALBufMappingLock 7 WALWriteLock 8 ControlFileLock 9 # 10 was CheckpointLock -XactSLRULock 11 -SubtransSLRULock 12 +# 11 was XactSLRULock +# 12 was SubtransSLRULock MultiXactGenLock 13 -MultiXactOffsetSLRULock 14 -MultiXactMemberSLRULock 15 +# 14 was MultiXactOffsetSLRULock +# 15 was MultiXactMemberSLRULock RelCacheInitLock 16 CheckpointerCommLock 17 TwoPhaseStateLock 18 @@ -31,19 +31,19 @@ AutovacuumLock 22 AutovacuumScheduleLock 23 SyncScanLock 24 RelationMappingLock 25 -NotifySLRULock 26 +#26 was NotifySLRULock NotifyQueueLock 27 SerializableXactHashLock 28 SerializableFinishedListLock 29 SerializablePredicateListLock 30 -SerialSLRULock 31 +SerialControlLock 31 SyncRepLock 32 BackgroundWorkerLock 33 DynamicSharedMemoryControlLock 34 AutoFileLock 35 ReplicationSlotAllocationLock 36 ReplicationSlotControlLock 37 -CommitTsSLRULock 38 +#38 was CommitTsSLRULock CommitTsLock 39 ReplicationOriginLock 40 
MultiXactTruncationLock 41 diff --git a/src/backend/storage/lmgr/predicate.c b/src/backend/storage/lmgr/predicate.c index e4903c67ec..452b918181 100644 --- a/src/backend/storage/lmgr/predicate.c +++ b/src/backend/storage/lmgr/predicate.c @@ -809,8 +809,9 @@ SerialInit(void) */ SerialSlruCtl->PagePrecedes = SerialPagePrecedesLogically; SimpleLruInit(SerialSlruCtl, "Serial", - serial_buffers, 0, SerialSLRULock, "pg_serial", - LWTRANCHE_SERIAL_BUFFER, SYNC_HANDLER_NONE); + serial_buffers, 0, "pg_serial", + LWTRANCHE_SERIAL_BUFFER, LWTRANCHE_SERIAL_SLRU, + SYNC_HANDLER_NONE); #ifdef USE_ASSERT_CHECKING SerialPagePrecedesLogicallyUnitTests(); #endif @@ -847,12 +848,14 @@ SerialAdd(TransactionId xid, SerCommitSeqNo minConflictCommitSeqNo) int slotno; int firstZeroPage; bool isNewPage; + LWLock *lock; Assert(TransactionIdIsValid(xid)); targetPage = SerialPage(xid); + lock = SimpleLruGetBankLock(SerialSlruCtl, targetPage); - LWLockAcquire(SerialSLRULock, LW_EXCLUSIVE); + LWLockAcquire(lock, LW_EXCLUSIVE); /* * If no serializable transactions are active, there shouldn't be anything @@ -902,7 +905,7 @@ SerialAdd(TransactionId xid, SerCommitSeqNo minConflictCommitSeqNo) SerialValue(slotno, xid) = minConflictCommitSeqNo; SerialSlruCtl->shared->page_dirty[slotno] = true; - LWLockRelease(SerialSLRULock); + LWLockRelease(lock); } /* @@ -920,10 +923,10 @@ SerialGetMinConflictCommitSeqNo(TransactionId xid) Assert(TransactionIdIsValid(xid)); - LWLockAcquire(SerialSLRULock, LW_SHARED); + LWLockAcquire(SerialControlLock, LW_SHARED); headXid = serialControl->headXid; tailXid = serialControl->tailXid; - LWLockRelease(SerialSLRULock); + LWLockRelease(SerialControlLock); if (!TransactionIdIsValid(headXid)) return 0; @@ -935,13 +938,13 @@ SerialGetMinConflictCommitSeqNo(TransactionId xid) return 0; /* - * The following function must be called without holding SerialSLRULock, + * The following function must be called without holding SLRU bank lock, * but will return with that lock held, 
which must then be released. */ slotno = SimpleLruReadPage_ReadOnly(SerialSlruCtl, SerialPage(xid), xid); val = SerialValue(slotno, xid); - LWLockRelease(SerialSLRULock); + LWLockRelease(SimpleLruGetBankLock(SerialSlruCtl, SerialPage(xid))); return val; } @@ -954,7 +957,7 @@ SerialGetMinConflictCommitSeqNo(TransactionId xid) static void SerialSetActiveSerXmin(TransactionId xid) { - LWLockAcquire(SerialSLRULock, LW_EXCLUSIVE); + LWLockAcquire(SerialControlLock, LW_EXCLUSIVE); /* * When no sxacts are active, nothing overlaps, set the xid values to @@ -966,7 +969,7 @@ SerialSetActiveSerXmin(TransactionId xid) { serialControl->tailXid = InvalidTransactionId; serialControl->headXid = InvalidTransactionId; - LWLockRelease(SerialSLRULock); + LWLockRelease(SerialControlLock); return; } @@ -984,7 +987,7 @@ SerialSetActiveSerXmin(TransactionId xid) { serialControl->tailXid = xid; } - LWLockRelease(SerialSLRULock); + LWLockRelease(SerialControlLock); return; } @@ -993,7 +996,7 @@ SerialSetActiveSerXmin(TransactionId xid) serialControl->tailXid = xid; - LWLockRelease(SerialSLRULock); + LWLockRelease(SerialControlLock); } /* @@ -1007,12 +1010,12 @@ CheckPointPredicate(void) { int truncateCutoffPage; - LWLockAcquire(SerialSLRULock, LW_EXCLUSIVE); + LWLockAcquire(SerialControlLock, LW_EXCLUSIVE); /* Exit quickly if the SLRU is currently not in use. 
*/ if (serialControl->headPage < 0) { - LWLockRelease(SerialSLRULock); + LWLockRelease(SerialControlLock); return; } @@ -1072,7 +1075,7 @@ CheckPointPredicate(void) serialControl->headPage = -1; } - LWLockRelease(SerialSLRULock); + LWLockRelease(SerialControlLock); /* Truncate away pages that are no longer required */ SimpleLruTruncate(SerialSlruCtl, truncateCutoffPage); diff --git a/src/include/access/slru.h b/src/include/access/slru.h index 8f20b66776..faa47698c4 100644 --- a/src/include/access/slru.h +++ b/src/include/access/slru.h @@ -21,6 +21,7 @@ * SLRU bank size for slotno hash banks */ #define SLRU_BANK_SIZE 16 +#define SLRU_MAX_BANKLOCKS 128 /* * To avoid overflowing internal arithmetic and the size_t data type, the @@ -62,8 +63,6 @@ typedef enum */ typedef struct SlruSharedData { - LWLock *ControlLock; - /* Number of buffers managed by this SLRU structure */ int num_slots; @@ -76,8 +75,35 @@ typedef struct SlruSharedData bool *page_dirty; int *page_number; int *page_lru_count; + + /* The buffer_locks protects the I/O on each buffer slots */ LWLockPadded *buffer_locks; + /* + * Locks to protect the in memory buffer slot access in SLRU bank. If the + * number of banks are <= SLRU_MAX_BANKLOCKS then there will be one lock + * per bank otherwise each lock will protect multiple banks depends upon + * the number of banks. + */ + LWLockPadded *bank_locks; + + /*---------- + * A bank-wise LRU counter is maintained because we do a victim buffer + * search within a bank. Furthermore, manipulating an individual bank + * counter avoids frequent cache invalidation since we update it every time + * we access the page. 
+ * + * We mark a page "most recently used" by setting + * page_lru_count[slotno] = ++bank_cur_lru_count[bankno]; + * The oldest page in the bank is therefore the one with the highest value + * of + * bank_cur_lru_count[bankno] - page_lru_count[slotno] + * The counts will eventually wrap around, but this calculation still + * works as long as no page's age exceeds INT_MAX counts. + *---------- + */ + int *bank_cur_lru_count; + /* * Optional array of WAL flush LSNs associated with entries in the SLRU * pages. If not zero/NULL, we must flush WAL before writing pages (true @@ -89,23 +115,12 @@ typedef struct SlruSharedData XLogRecPtr *group_lsn; int lsn_groups_per_page; - /*---------- - * We mark a page "most recently used" by setting - * page_lru_count[slotno] = ++cur_lru_count; - * The oldest page is therefore the one with the highest value of - * cur_lru_count - page_lru_count[slotno] - * The counts will eventually wrap around, but this calculation still - * works as long as no page's age exceeds INT_MAX counts. - *---------- - */ - int cur_lru_count; - /* * latest_page_number is the page number of the current end of the log; * this is not critical data, since we use it only to avoid swapping out * the latest page. */ - int latest_page_number; + pg_atomic_uint32 latest_page_number; /* SLRU's index for statistics purposes (might not be unique) */ int slru_stats_idx; @@ -154,11 +169,24 @@ typedef struct SlruCtlData typedef SlruCtlData *SlruCtl; +/* + * Get the SLRU bank lock for given SlruCtl and the pageno. + * + * This lock needs to be acquire in order to access the slru buffer slots in + * the respective bank. For more details refer comments in SlruSharedData. 
+ */ +static inline LWLock * +SimpleLruGetBankLock(SlruCtl ctl, int pageno) +{ + int banklockno = (pageno & ctl->bank_mask) % SLRU_MAX_BANKLOCKS; + + return &(ctl->shared->bank_locks[banklockno].lock); +} extern Size SimpleLruShmemSize(int nslots, int nlsns); extern void SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns, - LWLock *ctllock, const char *subdir, int tranche_id, - SyncRequestHandler sync_handler); + const char *subdir, int buffer_tranche_id, + int bank_tranche_id, SyncRequestHandler sync_handler); extern int SimpleLruZeroPage(SlruCtl ctl, int pageno); extern int SimpleLruReadPage(SlruCtl ctl, int pageno, bool write_ok, TransactionId xid); @@ -187,4 +215,6 @@ extern bool SlruScanDirCbReportPresence(SlruCtl ctl, char *filename, extern bool SlruScanDirCbDeleteAll(SlruCtl ctl, char *filename, int segpage, void *data); extern bool check_slru_buffers(const char *name, int *newval); +extern void SimpleLruAcquireAllBankLock(SlruCtl ctl, LWLockMode mode); +extern void SimpleLruReleaseAllBankLock(SlruCtl ctl); #endif /* SLRU_H */ diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h index b038e599c0..87cb812b84 100644 --- a/src/include/storage/lwlock.h +++ b/src/include/storage/lwlock.h @@ -207,6 +207,13 @@ typedef enum BuiltinTrancheIds LWTRANCHE_PGSTATS_DATA, LWTRANCHE_LAUNCHER_DSA, LWTRANCHE_LAUNCHER_HASH, + LWTRANCHE_XACT_SLRU, + LWTRANCHE_COMMITTS_SLRU, + LWTRANCHE_SUBTRANS_SLRU, + LWTRANCHE_MULTIXACTOFFSET_SLRU, + LWTRANCHE_MULTIXACTMEMBER_SLRU, + LWTRANCHE_NOTIFY_SLRU, + LWTRANCHE_SERIAL_SLRU, LWTRANCHE_FIRST_USER_DEFINED, } BuiltinTrancheIds; diff --git a/src/test/modules/test_slru/test_slru.c b/src/test/modules/test_slru/test_slru.c index ae21444c47..9b48eb07c8 100644 --- a/src/test/modules/test_slru/test_slru.c +++ b/src/test/modules/test_slru/test_slru.c @@ -40,10 +40,6 @@ PG_FUNCTION_INFO_V1(test_slru_delete_all); /* Number of SLRU page slots */ #define NUM_TEST_BUFFERS 16 -/* SLRU control lock */ -LWLock 
TestSLRULock; -#define TestSLRULock (&TestSLRULock) - static SlruCtlData TestSlruCtlData; #define TestSlruCtl (&TestSlruCtlData) @@ -63,9 +59,9 @@ test_slru_page_write(PG_FUNCTION_ARGS) int pageno = PG_GETARG_INT32(0); char *data = text_to_cstring(PG_GETARG_TEXT_PP(1)); int slotno; + LWLock *lock = SimpleLruGetBankLock(TestSlruCtl, pageno); - LWLockAcquire(TestSLRULock, LW_EXCLUSIVE); - + LWLockAcquire(lock, LW_EXCLUSIVE); slotno = SimpleLruZeroPage(TestSlruCtl, pageno); /* these should match */ @@ -80,7 +76,7 @@ test_slru_page_write(PG_FUNCTION_ARGS) BLCKSZ - 1); SimpleLruWritePage(TestSlruCtl, slotno); - LWLockRelease(TestSLRULock); + LWLockRelease(lock); PG_RETURN_VOID(); } @@ -99,13 +95,14 @@ test_slru_page_read(PG_FUNCTION_ARGS) bool write_ok = PG_GETARG_BOOL(1); char *data = NULL; int slotno; + LWLock *lock = SimpleLruGetBankLock(TestSlruCtl, pageno); /* find page in buffers, reading it if necessary */ - LWLockAcquire(TestSLRULock, LW_EXCLUSIVE); + LWLockAcquire(lock, LW_EXCLUSIVE); slotno = SimpleLruReadPage(TestSlruCtl, pageno, write_ok, InvalidTransactionId); data = (char *) TestSlruCtl->shared->page_buffer[slotno]; - LWLockRelease(TestSLRULock); + LWLockRelease(lock); PG_RETURN_TEXT_P(cstring_to_text(data)); } @@ -116,14 +113,15 @@ test_slru_page_readonly(PG_FUNCTION_ARGS) int pageno = PG_GETARG_INT32(0); char *data = NULL; int slotno; + LWLock *lock = SimpleLruGetBankLock(TestSlruCtl, pageno); /* find page in buffers, reading it if necessary */ slotno = SimpleLruReadPage_ReadOnly(TestSlruCtl, pageno, InvalidTransactionId); - Assert(LWLockHeldByMe(TestSLRULock)); + Assert(LWLockHeldByMe(lock)); data = (char *) TestSlruCtl->shared->page_buffer[slotno]; - LWLockRelease(TestSLRULock); + LWLockRelease(lock); PG_RETURN_TEXT_P(cstring_to_text(data)); } @@ -133,10 +131,11 @@ test_slru_page_exists(PG_FUNCTION_ARGS) { int pageno = PG_GETARG_INT32(0); bool found; + LWLock *lock = SimpleLruGetBankLock(TestSlruCtl, pageno); - LWLockAcquire(TestSLRULock, 
LW_EXCLUSIVE);
+	LWLockAcquire(lock, LW_EXCLUSIVE);
 	found = SimpleLruDoesPhysicalPageExist(TestSlruCtl, pageno);
-	LWLockRelease(TestSLRULock);
+	LWLockRelease(lock);
 
 	PG_RETURN_BOOL(found);
 }
@@ -215,6 +214,7 @@ test_slru_shmem_startup(void)
 {
 	const char	slru_dir_name[] = "pg_test_slru";
 	int			test_tranche_id;
+	int			test_buffer_tranche_id;
 
 	if (prev_shmem_startup_hook)
 		prev_shmem_startup_hook();
@@ -228,11 +228,13 @@ test_slru_shmem_startup(void)
 	/* initialize the SLRU facility */
 	test_tranche_id = LWLockNewTrancheId();
 	LWLockRegisterTranche(test_tranche_id, "test_slru_tranche");
-	LWLockInitialize(TestSLRULock, test_tranche_id);
+
+	test_buffer_tranche_id = LWLockNewTrancheId();
+	LWLockRegisterTranche(test_buffer_tranche_id, "test_buffer_tranche");
 
 	TestSlruCtl->PagePrecedes = test_slru_page_precedes_logically;
 	SimpleLruInit(TestSlruCtl, "TestSLRU",
-				  NUM_TEST_BUFFERS, 0, TestSLRULock, slru_dir_name,
+				  NUM_TEST_BUFFERS, 0, slru_dir_name, test_buffer_tranche_id,
+				  test_tranche_id, SYNC_HANDLER_NONE);
 }
-- 
2.39.2 (Apple Git-143)