From 5fa38ace34f0c460c9af8889ea922c2d5c4d0b38 Mon Sep 17 00:00:00 2001 From: Dilip kumar Date: Fri, 8 Sep 2023 15:08:32 +0530 Subject: [PATCH v2 1/3] Divide SLRU buffers into banks We want to eliminate linear search within SLRU buffers. To do so we divide SLRU buffers into banks. Each bank holds approximately 8 buffers. Each SLRU pageno may reside only in one bank. Adjacent pagenos reside in different banks. Also invent slru_buffers_size_scale to control SLRU buffers. Andrey M. Borodin, Yura Sokolov, Ivan Lazarev and minor refactoring by Dilip Kumar --- doc/src/sgml/config.sgml | 31 +++++++++++ src/backend/access/transam/clog.c | 29 ++-------- src/backend/access/transam/commit_ts.c | 20 ++----- src/backend/access/transam/slru.c | 54 +++++++++++++++++-- src/backend/access/transam/subtrans.c | 1 + src/backend/utils/init/globals.c | 2 + src/backend/utils/misc/guc_tables.c | 10 ++++ src/backend/utils/misc/postgresql.conf.sample | 3 ++ src/include/access/clog.h | 1 - src/include/access/commit_ts.h | 1 - src/include/access/multixact.h | 4 +- src/include/access/slru.h | 5 ++ src/include/access/subtrans.h | 2 +- src/include/commands/async.h | 2 +- src/include/miscadmin.h | 2 + src/include/storage/predicate.h | 2 +- 16 files changed, 119 insertions(+), 50 deletions(-) diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 924309af26..416d979b54 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -2006,6 +2006,37 @@ include_dir 'conf.d' + + slru_buffers_size_scale (integer) + + slru_buffers_size_scale configuration parameter + + + + + Specifies the power-of-2 scale for the sizes of all SLRU shared memory buffers. Buffer sizes depend on + both slru_buffers_size_scale and shared_buffers parameters. 
+ + + This affects the buffers in the list below (see also ): + + NUM_MULTIXACTOFFSET_BUFFERS = Min(16 << slru_buffers_size_scale, shared_buffers/256) + NUM_MULTIXACTMEMBER_BUFFERS = Min(32 << slru_buffers_size_scale, shared_buffers/256) + NUM_SUBTRANS_BUFFERS = Min(32 << slru_buffers_size_scale, shared_buffers/256) + NUM_NOTIFY_BUFFERS = Min(16 << slru_buffers_size_scale, shared_buffers/256) + NUM_SERIAL_BUFFERS = Min(16 << slru_buffers_size_scale, shared_buffers/256) + NUM_CLOG_BUFFERS = Min(128 << slru_buffers_size_scale, shared_buffers/256) + NUM_COMMIT_TS_BUFFERS = Min(128 << slru_buffers_size_scale, shared_buffers/256) + + + + The value must be in the range 0..7. + The default value is 2. + This parameter can only be set at server start. + + + + max_stack_depth (integer) diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c index 4a431d5876..d4ac85e052 100644 --- a/src/backend/access/transam/clog.c +++ b/src/backend/access/transam/clog.c @@ -74,6 +74,9 @@ #define GetLSNIndex(slotno, xid) ((slotno) * CLOG_LSNS_PER_PAGE + \ ((xid) % (TransactionId) CLOG_XACTS_PER_PAGE) / CLOG_XACTS_PER_LSN_GROUP) +/* Number of SLRU buffers to use for clog */ +#define NUM_CLOG_BUFFERS (128 << slru_buffers_size_scale) + /* * The number of subtransactions below which we consider to apply clog group * update optimization. Testing reveals that the number higher than this can @@ -660,42 +663,20 @@ TransactionIdGetStatus(TransactionId xid, XLogRecPtr *lsn) return status; } -/* - * Number of shared CLOG buffers. - * - * On larger multi-processor systems, it is possible to have many CLOG page - * requests in flight at one time which could lead to disk access for CLOG - * page if the required page is not found in memory. Testing revealed that we - * can get the best performance by having 128 CLOG buffers, more than that it - * doesn't improve performance. 
- * - * Unconditionally keeping the number of CLOG buffers to 128 did not seem like - * a good idea, because it would increase the minimum amount of shared memory - * required to start, which could be a problem for people running very small - * configurations. The following formula seems to represent a reasonable - * compromise: people with very low values for shared_buffers will get fewer - * CLOG buffers as well, and everyone else will get 128. - */ -Size -CLOGShmemBuffers(void) -{ - return Min(128, Max(4, NBuffers / 512)); -} - /* * Initialization of shared memory for CLOG */ Size CLOGShmemSize(void) { - return SimpleLruShmemSize(CLOGShmemBuffers(), CLOG_LSNS_PER_PAGE); + return SimpleLruShmemSize(NUM_CLOG_BUFFERS, CLOG_LSNS_PER_PAGE); } void CLOGShmemInit(void) { XactCtl->PagePrecedes = CLOGPagePrecedes; - SimpleLruInit(XactCtl, "Xact", CLOGShmemBuffers(), CLOG_LSNS_PER_PAGE, + SimpleLruInit(XactCtl, "Xact", NUM_CLOG_BUFFERS, CLOG_LSNS_PER_PAGE, XactSLRULock, "pg_xact", LWTRANCHE_XACT_BUFFER, SYNC_HANDLER_CLOG); SlruPagePrecedesUnitTests(XactCtl, CLOG_XACTS_PER_PAGE); diff --git a/src/backend/access/transam/commit_ts.c b/src/backend/access/transam/commit_ts.c index b897fabc70..26614d5ceb 100644 --- a/src/backend/access/transam/commit_ts.c +++ b/src/backend/access/transam/commit_ts.c @@ -70,6 +70,9 @@ typedef struct CommitTimestampEntry #define TransactionIdToCTsEntry(xid) \ ((xid) % (TransactionId) COMMIT_TS_XACTS_PER_PAGE) +/* Number of SLRU buffers to use for commit_ts */ +#define NUM_COMMIT_TS_BUFFERS (128 << slru_buffers_size_scale) + /* * Link to shared-memory data structures for CommitTs control */ @@ -487,26 +490,13 @@ pg_xact_commit_timestamp_origin(PG_FUNCTION_ARGS) PG_RETURN_DATUM(HeapTupleGetDatum(htup)); } -/* - * Number of shared CommitTS buffers. - * - * We use a very similar logic as for the number of CLOG buffers (except we - * scale up twice as fast with shared buffers, and the maximum is twice as - * high); see comments in CLOGShmemBuffers. 
- */ -Size -CommitTsShmemBuffers(void) -{ - return Min(256, Max(4, NBuffers / 256)); -} - /* * Shared memory sizing for CommitTs */ Size CommitTsShmemSize(void) { - return SimpleLruShmemSize(CommitTsShmemBuffers(), 0) + + return SimpleLruShmemSize(NUM_COMMIT_TS_BUFFERS, 0) + sizeof(CommitTimestampShared); } @@ -520,7 +510,7 @@ CommitTsShmemInit(void) bool found; CommitTsCtl->PagePrecedes = CommitTsPagePrecedes; - SimpleLruInit(CommitTsCtl, "CommitTs", CommitTsShmemBuffers(), 0, + SimpleLruInit(CommitTsCtl, "CommitTs", NUM_COMMIT_TS_BUFFERS, 0, CommitTsSLRULock, "pg_commit_ts", LWTRANCHE_COMMITTS_BUFFER, SYNC_HANDLER_COMMIT_TS); diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c index 71ac70fb40..57889b72bd 100644 --- a/src/backend/access/transam/slru.c +++ b/src/backend/access/transam/slru.c @@ -59,6 +59,7 @@ #include "pgstat.h" #include "storage/fd.h" #include "storage/shmem.h" +#include "port/pg_bitutils.h" #define SlruFileName(ctl, path, seg) \ snprintf(path, MAXPGPATH, "%s/%04X", (ctl)->Dir, seg) @@ -71,6 +72,17 @@ */ #define MAX_WRITEALL_BUFFERS 16 +/* + * To avoid overflowing internal arithmetic and the size_t data type, the + * number of buffers should not exceed this number. 
+ */ +#define SLRU_MAX_ALLOWED_BUFFERS ((1024 * 1024 * 1024) / BLCKSZ) + +/* + * SLRU bank size for slotno hash banks + */ +#define SLRU_BANK_SIZE 8 + typedef struct SlruWriteAllData { int num_files; /* # files actually open */ @@ -134,7 +146,7 @@ typedef enum static SlruErrorCause slru_errcause; static int slru_errno; - +static void SlruAdjustNSlots(int *nslots, int *bankmask); static void SimpleLruZeroLSNs(SlruCtl ctl, int slotno); static void SimpleLruWaitIO(SlruCtl ctl, int slotno); static void SlruInternalWritePage(SlruCtl ctl, int slotno, SlruWriteAll fdata); @@ -148,6 +160,25 @@ static bool SlruScanDirCbDeleteCutoff(SlruCtl ctl, char *filename, int segpage, void *data); static void SlruInternalDeleteSegment(SlruCtl ctl, int segno); +/* + * Pick number of slots and bank mask for hashed associative SLRU buffers. + * The number of slots is always rounded up to a power of 2. + * Slots are split into banks of SLRU_BANK_SIZE, chosen after benchmarking. + * A page belongs to bank (pageno & bankmask), i.e. the low bits of pageno. 
+ */ +static void +SlruAdjustNSlots(int *nslots, int *bankmask) +{ + Assert(*nslots > 0); + Assert(*nslots <= SLRU_MAX_ALLOWED_BUFFERS); + + *nslots = (int) pg_nextpower2_32(Max(SLRU_BANK_SIZE, Min(*nslots, NBuffers / 256))); + + *bankmask = *nslots / SLRU_BANK_SIZE - 1; + + elog(DEBUG5, "nslots %d banksize %d nbanks %d bankmask %x", *nslots, SLRU_BANK_SIZE, *nslots / SLRU_BANK_SIZE, *bankmask); +} + /* * Initialization of shared memory */ @@ -156,6 +187,9 @@ Size SimpleLruShmemSize(int nslots, int nlsns) { Size sz; + int bankmask_ignore; + + SlruAdjustNSlots(&nslots, &bankmask_ignore); /* we assume nslots isn't so large as to risk overflow */ sz = MAXALIGN(sizeof(SlruSharedData)); @@ -191,6 +225,9 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns, { SlruShared shared; bool found; + int bankmask; + + SlruAdjustNSlots(&nslots, &bankmask); shared = (SlruShared) ShmemInitStruct(name, SimpleLruShmemSize(nslots, nlsns), @@ -258,7 +295,10 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns, Assert(ptr - (char *) shared <= SimpleLruShmemSize(nslots, nlsns)); } else + { Assert(found); + Assert(shared->num_slots == nslots); + } /* * Initialize the unshared control struct, including directory path. 
We @@ -266,6 +306,7 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns, */ ctl->shared = shared; ctl->sync_handler = sync_handler; + ctl->bank_mask = bankmask; strlcpy(ctl->Dir, subdir, sizeof(ctl->Dir)); } @@ -497,12 +538,14 @@ SimpleLruReadPage_ReadOnly(SlruCtl ctl, int pageno, TransactionId xid) { SlruShared shared = ctl->shared; int slotno; + int bankstart = (pageno & ctl->bank_mask) * SLRU_BANK_SIZE; + int bankend = bankstart + SLRU_BANK_SIZE; /* Try to find the page while holding only shared lock */ LWLockAcquire(shared->ControlLock, LW_SHARED); /* See if page is already in a buffer */ - for (slotno = 0; slotno < shared->num_slots; slotno++) + for (slotno = bankstart; slotno < bankend; slotno++) { if (shared->page_number[slotno] == pageno && shared->page_status[slotno] != SLRU_PAGE_EMPTY && @@ -1031,7 +1074,10 @@ SlruSelectLRUPage(SlruCtl ctl, int pageno) int best_invalid_page_number = 0; /* keep compiler quiet */ /* See if page already has a buffer assigned */ - for (slotno = 0; slotno < shared->num_slots; slotno++) + int bankstart = (pageno & ctl->bank_mask) * SLRU_BANK_SIZE; + int bankend = bankstart + SLRU_BANK_SIZE; + + for (slotno = bankstart; slotno < bankend; slotno++) { if (shared->page_number[slotno] == pageno && shared->page_status[slotno] != SLRU_PAGE_EMPTY) @@ -1066,7 +1112,7 @@ SlruSelectLRUPage(SlruCtl ctl, int pageno) * multiple pages with the same lru_count. 
*/ cur_count = (shared->cur_lru_count)++; - for (slotno = 0; slotno < shared->num_slots; slotno++) + for (slotno = bankstart; slotno < bankend; slotno++) { int this_delta; int this_page_number; diff --git a/src/backend/access/transam/subtrans.c b/src/backend/access/transam/subtrans.c index 62bb610167..125273e235 100644 --- a/src/backend/access/transam/subtrans.c +++ b/src/backend/access/transam/subtrans.c @@ -31,6 +31,7 @@ #include "access/slru.h" #include "access/subtrans.h" #include "access/transam.h" +#include "miscadmin.h" #include "pg_trace.h" #include "utils/snapmgr.h" diff --git a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c index 011ec18015..61b12d1056 100644 --- a/src/backend/utils/init/globals.c +++ b/src/backend/utils/init/globals.c @@ -154,3 +154,5 @@ int64 VacuumPageDirty = 0; int VacuumCostBalance = 0; /* working state for vacuum */ bool VacuumCostActive = false; + +int slru_buffers_size_scale = 2; /* power 2 scale for SLRU buffers */ diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index 16ec6c5ef0..4a182225b7 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -2277,6 +2277,16 @@ struct config_int ConfigureNamesInt[] = NULL, NULL, NULL }, + { + {"slru_buffers_size_scale", PGC_POSTMASTER, RESOURCES_MEM, + gettext_noop("SLRU buffers size scale of power 2"), + NULL + }, + &slru_buffers_size_scale, + 2, 0, 7, + NULL, NULL, NULL + }, + { {"temp_buffers", PGC_USERSET, RESOURCES_MEM, gettext_noop("Sets the maximum number of temporary buffers used by each session."), diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index d08d55c3fe..136ea5f48c 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -157,6 +157,9 @@ # mmap # (change requires restart) #min_dynamic_shared_memory = 0MB # (change requires restart) +#slru_buffers_size_scale = 2 # 
SLRU buffers size scale of power 2, range 0..7 + # (change requires restart) + #vacuum_buffer_usage_limit = 256kB # size of vacuum and analyze buffer access strategy ring; # 0 to disable vacuum buffer access strategy; # range 128kB to 16GB diff --git a/src/include/access/clog.h b/src/include/access/clog.h index d99444f073..cee7e19b3f 100644 --- a/src/include/access/clog.h +++ b/src/include/access/clog.h @@ -40,7 +40,6 @@ extern void TransactionIdSetTreeStatus(TransactionId xid, int nsubxids, TransactionId *subxids, XidStatus status, XLogRecPtr lsn); extern XidStatus TransactionIdGetStatus(TransactionId xid, XLogRecPtr *lsn); -extern Size CLOGShmemBuffers(void); extern Size CLOGShmemSize(void); extern void CLOGShmemInit(void); extern void BootStrapCLOG(void); diff --git a/src/include/access/commit_ts.h b/src/include/access/commit_ts.h index 5087cdce51..155e82eb4f 100644 --- a/src/include/access/commit_ts.h +++ b/src/include/access/commit_ts.h @@ -27,7 +27,6 @@ extern bool TransactionIdGetCommitTsData(TransactionId xid, extern TransactionId GetLatestCommitTsData(TimestampTz *ts, RepOriginId *nodeid); -extern Size CommitTsShmemBuffers(void); extern Size CommitTsShmemSize(void); extern void CommitTsShmemInit(void); extern void BootStrapCommitTs(void); diff --git a/src/include/access/multixact.h b/src/include/access/multixact.h index 246f757f6a..6a2c914d48 100644 --- a/src/include/access/multixact.h +++ b/src/include/access/multixact.h @@ -30,8 +30,8 @@ #define MaxMultiXactOffset ((MultiXactOffset) 0xFFFFFFFF) /* Number of SLRU buffers to use for multixact */ -#define NUM_MULTIXACTOFFSET_BUFFERS 8 -#define NUM_MULTIXACTMEMBER_BUFFERS 16 +#define NUM_MULTIXACTOFFSET_BUFFERS (16 << slru_buffers_size_scale) +#define NUM_MULTIXACTMEMBER_BUFFERS (32 << slru_buffers_size_scale) /* * Possible multixact lock modes ("status"). 
The first four modes are for diff --git a/src/include/access/slru.h b/src/include/access/slru.h index a8a424d92d..f5f2b5b8b5 100644 --- a/src/include/access/slru.h +++ b/src/include/access/slru.h @@ -134,6 +134,11 @@ typedef struct SlruCtlData * it's always the same, it doesn't need to be in shared memory. */ char Dir[64]; + + /* + * mask for slotno hash bank + */ + Size bank_mask; } SlruCtlData; typedef SlruCtlData *SlruCtl; diff --git a/src/include/access/subtrans.h b/src/include/access/subtrans.h index 46a473c77f..0dad287550 100644 --- a/src/include/access/subtrans.h +++ b/src/include/access/subtrans.h @@ -12,7 +12,7 @@ #define SUBTRANS_H /* Number of SLRU buffers to use for subtrans */ -#define NUM_SUBTRANS_BUFFERS 32 +#define NUM_SUBTRANS_BUFFERS (32 << slru_buffers_size_scale) extern void SubTransSetParent(TransactionId xid, TransactionId parent); extern TransactionId SubTransGetParent(TransactionId xid); diff --git a/src/include/commands/async.h b/src/include/commands/async.h index 02da6ba7e1..b1d59472b1 100644 --- a/src/include/commands/async.h +++ b/src/include/commands/async.h @@ -18,7 +18,7 @@ /* * The number of SLRU page buffers we use for the notification queue. 
*/ -#define NUM_NOTIFY_BUFFERS 8 +#define NUM_NOTIFY_BUFFERS (16 << slru_buffers_size_scale) extern PGDLLIMPORT bool Trace_notify; extern PGDLLIMPORT volatile sig_atomic_t notifyInterruptPending; diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index 14bd574fc2..f2cec02a2f 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -177,6 +177,7 @@ extern PGDLLIMPORT int MaxBackends; extern PGDLLIMPORT int MaxConnections; extern PGDLLIMPORT int max_worker_processes; extern PGDLLIMPORT int max_parallel_workers; +extern PGDLLIMPORT int slru_buffers_size_scale; extern PGDLLIMPORT int MyProcPid; extern PGDLLIMPORT pg_time_t MyStartTime; @@ -262,6 +263,7 @@ extern PGDLLIMPORT int work_mem; extern PGDLLIMPORT double hash_mem_multiplier; extern PGDLLIMPORT int maintenance_work_mem; extern PGDLLIMPORT int max_parallel_maintenance_workers; +extern PGDLLIMPORT int slru_buffers_size_scale; /* * Upper and lower hard limits for the buffer access strategy ring size diff --git a/src/include/storage/predicate.h b/src/include/storage/predicate.h index cd48afa17b..794ecd8169 100644 --- a/src/include/storage/predicate.h +++ b/src/include/storage/predicate.h @@ -28,7 +28,7 @@ extern PGDLLIMPORT int max_predicate_locks_per_page; /* Number of SLRU buffers to use for Serial SLRU */ -#define NUM_SERIAL_BUFFERS 16 +#define NUM_SERIAL_BUFFERS (16 << slru_buffers_size_scale) /* * A handle used for sharing SERIALIZABLEXACT objects between the participants -- 2.39.2 (Apple Git-143)