diff --git a/contrib/pg_prewarm/Makefile b/contrib/pg_prewarm/Makefile index 7ad941e..88580d1 100644 --- a/contrib/pg_prewarm/Makefile +++ b/contrib/pg_prewarm/Makefile @@ -1,10 +1,10 @@ # contrib/pg_prewarm/Makefile MODULE_big = pg_prewarm -OBJS = pg_prewarm.o $(WIN32RES) +OBJS = pg_prewarm.o autoprewarm.o $(WIN32RES) EXTENSION = pg_prewarm -DATA = pg_prewarm--1.1.sql pg_prewarm--1.0--1.1.sql +DATA = pg_prewarm--1.1--1.2.sql pg_prewarm--1.1.sql pg_prewarm--1.0--1.1.sql PGFILEDESC = "pg_prewarm - preload relation data into system buffer cache" ifdef USE_PGXS diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c new file mode 100644 index 0000000..ac0f9e4 --- /dev/null +++ b/contrib/pg_prewarm/autoprewarm.c @@ -0,0 +1,1032 @@ +/*------------------------------------------------------------------------- + * + * autoprewarm.c + * Automatically prewarm the shared buffer pool when server restarts. + * + * DESCRIPTION + * + * It is a bgworker which automatically records information about blocks + * which were present in buffer pool before server shutdown and then + * prewarms the buffer pool upon server restart with those blocks. + * + * How does it work? When the shared library "pg_prewarm" is preloaded, a + * bgworker "autoprewarm" is launched immediately after the server has + * reached consistent state. The bgworker will start loading blocks + * recorded in the format BlockInfoRecord + * <database,tablespace,filenode,forknum,blocknum> in + * $PGDATA/AUTOPREWARM_FILE, until there is no free buffer left in the + * buffer pool. This way we do not replace any new blocks which were + * loaded either by the recovery process or the querying clients. + * + * Once the "autoprewarm" bgworker has completed its prewarm task, it will + * start a new task to periodically dump the BlockInfoRecords related to + * blocks which are currently in shared buffer pool. Upon next server + * restart, the bgworker will prewarm the buffer pool by loading those + * blocks. 
The GUC pg_prewarm.dump_interval will control the dumping + * activity of the bgworker. + * + * Copyright (c) 2016-2017, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/autoprewarm.c + *------------------------------------------------------------------------- + */ + +#include "autoprewarm.h" + +PG_FUNCTION_INFO_V1(launch_autoprewarm_dump); +PG_FUNCTION_INFO_V1(autoprewarm_dump_now); + +#define AT_PWARM_OFF -1 +#define AT_PWARM_DUMP_AT_SHUTDOWN_ONLY 0 +#define AT_PWARM_DEFAULT_DUMP_INTERVAL 300 + +#define AUTOPREWARM_FILE "autoprewarm.blocks" + +/* Primary functions */ +void _PG_init(void); +void autoprewarm_main(Datum main_arg); +static void dump_block_info_periodically(void); +static pid_t autoprewarm_dump_launcher(void); +static void setup_autoprewarm(BackgroundWorker *autoprewarm, + const char *worker_name, + const char *worker_function, + Datum main_arg, int restart_time, + int extra_flags); +void load_one_database(Datum main_arg); + +/* + * Signal Handlers. + */ + +static void apw_sigterm_handler(SIGNAL_ARGS); +static void apw_sighup_handler(SIGNAL_ARGS); +static void apw_sigusr1_handler(SIGNAL_ARGS); + +/* flags set by signal handlers */ +static volatile sig_atomic_t got_sigterm = false; +static volatile sig_atomic_t got_sighup = false; + +/* + * Signal handler for SIGTERM + * Set a flag to let the main loop to terminate, and set our latch to wake it + * up. + */ +static void +apw_sigterm_handler(SIGNAL_ARGS) +{ + int save_errno = errno; + + got_sigterm = true; + + if (MyProc) + SetLatch(&MyProc->procLatch); + + errno = save_errno; +} + +/* + * Signal handler for SIGHUP + * Set a flag to tell the process to reread the config file, and set our + * latch to wake it up. + */ +static void +apw_sighup_handler(SIGNAL_ARGS) +{ + int save_errno = errno; + + got_sighup = true; + + if (MyProc) + SetLatch(&MyProc->procLatch); + + errno = save_errno; +} + +/* + * Signal handler for SIGUSR1. 
+ * The prewarm sub-workers will notify with SIGUSR1 on their startup/shutdown. + */ +static void +apw_sigusr1_handler(SIGNAL_ARGS) +{ + int save_errno = errno; + + if (MyProc) + SetLatch(&MyProc->procLatch); + + errno = save_errno; +} + +/* ============================================================================ + * ============== types and variables used by autoprewarm ============= + * ============================================================================ + */ + +/* + * Metadata of each persistent block which is dumped and used to load. + */ +typedef struct BlockInfoRecord +{ + Oid database; /* database */ + Oid spcnode; /* tablespace */ + Oid filenode; /* relation's filenode. */ + ForkNumber forknum; /* fork number */ + BlockNumber blocknum; /* block number */ +} BlockInfoRecord; + +/* + * Tasks performed by autoprewarm workers. + */ +typedef enum +{ + TASK_PREWARM_BUFFERPOOL, /* prewarm the buffer pool. */ + TASK_DUMP_BUFFERPOOL_INFO, /* dump the buffer pool block info. */ + TASK_DUMP_IMMEDIATE_ONCE, /* dump the buffer pool block info immediately + * once. */ + TASK_END /* no more tasks to do. */ +} AutoPrewarmTask; + +/* + * Shared state information about the running autoprewarm bgworker. + */ +typedef struct AutoPrewarmSharedState +{ + LWLock lock; /* protects SharedState */ + AutoPrewarmTask current_task; /* current tasks performed by + * autoprewarm workers. */ + bool is_bgworker_running; /* if set can't start another worker. */ + bool can_do_prewarm; /* if set can't do prewarm task. */ +} AutoPrewarmSharedState; + +static AutoPrewarmSharedState *state = NULL; + +/* dsm used during TASK_PREWARM_BUFFERPOOL to store read BlockInfoRecord's. */ +static dsm_segment *seg = NULL; + +/* + * The block_infos allocated to each sub-worker to do prewarming. 
+ */ +typedef struct prewarm_elem +{ + dsm_handle block_info_handle; /* handle to dsm seg of block_infos */ + Oid database; /* database to connect and load */ + uint32 start_pos; /* start position within block_infos from + * which sub-worker start prewaring blocks. */ + uint32 end_of_blockinfos; /* End of block_infos in dsm */ +} prewarm_elem; + +/* GUC variable which control the dump activity of autoprewarm. */ +static int dump_interval = 0; + +/* + * GUC variable which say whether autoprewarm worker has to be started when + * preloaded. + */ +static bool autoprewarm = true; + +/* compare member elements to check if they are not equal. */ +#define cmp_member_elem(fld) \ +do { \ + if (a->fld < b->fld) \ + return -1; \ + else if (a->fld > b->fld) \ + return 1; \ +} while(0); + +/* + * blockinfo_cmp - compare function used for qsort(). + */ +static int +blockinfo_cmp(const void *p, const void *q) +{ + BlockInfoRecord *a = (BlockInfoRecord *) p; + BlockInfoRecord *b = (BlockInfoRecord *) q; + + cmp_member_elem(database); + cmp_member_elem(spcnode); + cmp_member_elem(filenode); + cmp_member_elem(forknum); + cmp_member_elem(blocknum); + return 0; +} + +/* ============================================================================ + * ===================== prewarm part of autoprewarm ======================= + * ============================================================================ + */ + +/* + * reset_shm_state - on_shm_exit reset the prewarm state. + */ + +static void +reset_shm_state(int code, Datum arg) +{ + state->is_bgworker_running = false; + state->current_task = TASK_END; +} + +/* + * detach_blkinfos - on_shm_exit detach the dsm allocated for blockinfos. + */ +static void +detach_blkinfos(int code, Datum arg) +{ + if (seg != NULL) + dsm_detach(seg); +} + +/* + * get_autoprewarm_task - get next task allowed and to be performed by the + * autoprewarm worker. 
+ */ +static AutoPrewarmTask +get_autoprewarm_task(AutoPrewarmTask todo_task) +{ + bool found = false; + + LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); + state = ShmemInitStruct("autoprewarm", + sizeof(AutoPrewarmSharedState), + &found); + if (!found) + { + /* First time through ... */ + LWLockInitialize(&state->lock, LWLockNewTrancheId()); + state->current_task = TASK_END; + state->is_bgworker_running = false; + state->can_do_prewarm = true; + } + + LWLockRelease(AddinShmemInitLock); + LWLockAcquire(&state->lock, LW_EXCLUSIVE); + + /* + * If already a bgworker is running we cannot run another. But if task is + * to just dump immediate and there is no prewarm happening we can go + * further. + */ + if (state->is_bgworker_running && + (todo_task != TASK_DUMP_IMMEDIATE_ONCE || + state->current_task == TASK_PREWARM_BUFFERPOOL)) + { + LWLockRelease(&state->lock); + return TASK_END; + } + + /* + * If asked to do prewarm, check whether we can do so. We avoid prewarm if + * its already done on startup. + */ + if (todo_task == TASK_PREWARM_BUFFERPOOL && !state->can_do_prewarm) + todo_task = TASK_DUMP_BUFFERPOOL_INFO; + + /* + * For now if there was a previous attempt to prewarm or dump any further + * request to prewarm will not be entertained. + */ + state->can_do_prewarm = false; + + if (todo_task != TASK_DUMP_IMMEDIATE_ONCE) + { + state->is_bgworker_running = true; + state->current_task = todo_task; + on_shmem_exit(reset_shm_state, 0); + } + + LWLockRelease(&state->lock); + return todo_task; +} + +/* + * load_one_database -- start of prewarm sub-worker, this will try to load + * blocks of one database starting from block info position passed by main + * prewarm worker. + */ +void +load_one_database(Datum main_arg) +{ + uint32 pos; + BlockInfoRecord *block_info; + Relation rel = NULL; + BlockNumber nblocks = 0; + prewarm_elem pelem; + BlockInfoRecord *old_blk; + + /* Establish signal handlers before unblocking signals. 
*/ + pqsignal(SIGTERM, apw_sigterm_handler); + pqsignal(SIGHUP, apw_sighup_handler); + + /* + * We're now ready to receive signals + */ + BackgroundWorkerUnblockSignals(); + + memcpy(&pelem, MyBgworkerEntry->bgw_extra, sizeof(prewarm_elem)); + + seg = dsm_attach(pelem.block_info_handle); + if (seg == NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("unable to map dynamic shared memory segment"))); + on_shmem_exit(detach_blkinfos, 0); + + block_info = (BlockInfoRecord *) dsm_segment_address(seg); + + BackgroundWorkerInitializeConnectionByOid(pelem.database, InvalidOid); + SetCurrentStatementStartTimestamp(); + StartTransactionCommand(); + + old_blk = NULL; + pos = pelem.start_pos; + + while (!got_sigterm && pos < pelem.end_of_blockinfos && have_free_buffer()) + { + BlockInfoRecord *blk = &block_info[pos]; + Buffer buf; + + /* + * Quit if we've reached records for another database. Unless the + * previous blocks were of global objects which were combined with + * next database's block infos. + */ + if (old_blk != NULL && old_blk->database != blk->database && + old_blk->database != 0) + break; + + /* + * When we reach a new relation, close the old one. Note, however, + * that the previous try_relation_open may have failed, in which case + * rel will be NULL. + */ + if (old_blk != NULL && old_blk->filenode != blk->filenode && rel != NULL) + { + relation_close(rel, AccessShareLock); + rel = NULL; + } + + /* + * Try to open each new relation, but only once, when we first + * encounter it. If it's been dropped, skip the associated blocks. + */ + if (old_blk == NULL || old_blk->filenode != blk->filenode) + { + Oid reloid; + + Assert(rel == NULL); + reloid = RelidByRelfilenode(blk->spcnode, blk->filenode); + if (OidIsValid(reloid)) + rel = try_relation_open(reloid, AccessShareLock); + } + if (!rel) + { + ++pos; + old_blk = blk; + continue; + } + + /* Once per fork, check for fork existence and size. 
*/ + if (old_blk == NULL || old_blk->forknum != blk->forknum) + { + RelationOpenSmgr(rel); + if (smgrexists(rel->rd_smgr, blk->forknum)) + nblocks = RelationGetNumberOfBlocksInFork(rel, blk->forknum); + else + nblocks = 0; + } + + /* check if blocknum is valid and with in fork file size. */ + if (blk->blocknum >= nblocks) + { + /* move to next forknum. */ + ++pos; + old_blk = blk; + continue; + } + + /* Prewarm buffer. */ + buf = ReadBufferExtended(rel, blk->forknum, blk->blocknum, RBM_NORMAL, + NULL); + if (BufferIsValid(buf)) + ReleaseBuffer(buf); + + old_blk = blk; + ++pos; + } + + dsm_detach(seg); + seg = NULL; + + /* release lock on previous relation. */ + if (rel) + { + relation_close(rel, AccessShareLock); + rel = NULL; + } + + CommitTransactionCommand(); + return; +} + +/* + * launch_prewarm_subworker -- register a dynamic worker to load the blocks + * starting from next_db_pos. We wait until the worker has stopped. + */ +static void +launch_prewarm_subworker(prewarm_elem *pelem) +{ + BackgroundWorker worker; + BackgroundWorkerHandle *handle = NULL; + BgwHandleStatus status; + + setup_autoprewarm(&worker, "autoprewarm", "load_one_database", + (Datum) NULL, BGW_NEVER_RESTART, + BGWORKER_BACKEND_DATABASE_CONNECTION); + + /* set bgw_notify_pid so that we can use WaitForBackgroundWorkerShutdown */ + worker.bgw_notify_pid = MyProcPid; + memcpy(worker.bgw_extra, pelem, sizeof(prewarm_elem)); + + if (!RegisterDynamicBackgroundWorker(&worker, &handle)) + { + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_RESOURCES), + errmsg("registering dynamic bgworker autoprewarm failed"), + errhint("Consider increasing configuration parameter " + "\"max_worker_processes\"."))); + } + + status = WaitForBackgroundWorkerShutdown(handle); + if (status == BGWH_STOPPED) + return; + + if (status == BGWH_POSTMASTER_DIED) + { + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_RESOURCES), + errmsg("cannot start bgworker autoprewarm without postmaster"), + errhint("Kill all remaining 
database processes and restart" + " the database."))); + } + + Assert(0); +} + +/* + * prewarm_buffer_pool - the main routine which prewarm the buffer pool. + * + * The prewarm bgworker will first load all of the BlockInfoRecord's in + * $PGDATA/AUTOPREWARM_FILE to a dsm. And those BlockInfoRecords are further + * separated based on their database. And for each group of BlockInfoRecords a + * sub-workers will be launched to load corresponding blocks. Each sub-worker + * will be launched in sequential order only after the previous sub-worker has + * finished its job. + */ +static void +prewarm_buffer_pool(void) +{ + FILE *file = NULL; + uint32 *next_db_pos; + size_t next_db_pos_size; + uint32 this_dbs_elements = 0, + num_elements, + num_db = 0, + i; + Oid prev_database; + BlockInfoRecord *blkinfo; + + file = fopen(AUTOPREWARM_FILE, PG_BINARY_R); + if (!file) + { + if (errno != ENOENT) + ereport(ERROR, (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", + AUTOPREWARM_FILE))); + return; /* No file to load. */ + } + + if (fscanf(file, "<<%u>>", &num_elements) != 1) + { + fclose(file); + ereport(ERROR, + (errcode_for_file_access(), + errmsg("Error reading num of elements in \"%s\" for" + " autoprewarm : %m", AUTOPREWARM_FILE))); + } + + seg = dsm_create(sizeof(BlockInfoRecord) * num_elements, 0); + on_shmem_exit(detach_blkinfos, 0); + + blkinfo = (BlockInfoRecord *) dsm_segment_address(seg); + + for (i = 0; i < num_elements; i++) + { + /* get next block. */ + if (5 != fscanf(file, "%u,%u,%u,%u,%u\n", &blkinfo[i].database, + &blkinfo[i].spcnode, &blkinfo[i].filenode, + (uint32 *) &blkinfo[i].forknum, &blkinfo[i].blocknum)) + break; + } + + num_elements = i; + + /* + * sort the block number to increase the chance of sequential reads during + * load. 
+ */ + pg_qsort(blkinfo, num_elements, sizeof(BlockInfoRecord), blockinfo_cmp); + next_db_pos_size = 64; + next_db_pos = (uint32 *) palloc(sizeof(uint32) * next_db_pos_size); + + /* read and fill block infos */ + for (i = 0; i < num_elements; i++) + { + if (i == 0) + { + prev_database = blkinfo[i].database; + next_db_pos[num_db++] = 0; + } + else if (prev_database != blkinfo[i].database) + { + if (num_db >= next_db_pos_size) + { + next_db_pos_size *= 2; + next_db_pos = (uint32 *) repalloc(next_db_pos, + sizeof(uint32) * next_db_pos_size); + } + + next_db_pos[num_db++] = this_dbs_elements; + this_dbs_elements = 0; + prev_database = blkinfo[i].database; + } + + this_dbs_elements++; + } + + fclose(file); + i = 0; + + /* get next database's first block info's position. */ + while (!got_sigterm && i < num_db) + { + prewarm_elem pelem; + + pelem.start_pos = next_db_pos[i]; + + if (blkinfo[next_db_pos[i]].database == 0) + { + /* + * For block info of a global object whose database will be 0 try + * to combine them with next non-zero database's block infos to + * load. If there are no other block infos than the global objects + * we silently ignore them. Should I throw error? + */ + if ((i + 1) < num_db) + { + pelem.database = blkinfo[next_db_pos[i + 1]].database; + i++; + } + else + break; + } + else + pelem.database = blkinfo[next_db_pos[i]].database; + pelem.block_info_handle = dsm_segment_handle(seg); + pelem.end_of_blockinfos = num_elements; + + /* + * Register a sub-worker to load new database's block. Wait until the + * sub-worker finish its job before launching next sub-worker. 
+ */ + launch_prewarm_subworker(&pelem); + i++; + } + + pfree(next_db_pos); + dsm_detach(seg); + seg = NULL; + ereport(LOG, (errmsg("autoprewarm load task ended"))); + return; +} + +/* ============================================================================ + * ============= buffer pool info dump part of autoprewarm =============== + * ============================================================================ + */ + +/* This sub-module is for periodically dumping buffer pool's block info into + * a dump file AUTOPREWARM_FILE. + * Each entry of block info looks like this: + * and we shall call it + * as BlockInfoRecord. Note we write in the text form so that the dump + * information is readable and if necessary can be carefully edited. + * + * The prewarm task will read these blockInfoRecord one by one in sequence and + * distribute it among its sub workers to load corresponding blocks. + */ + +/* + * dump_now - the main routine which goes through each buffer header of buffer + * pool and dumps their meta data. We Sort these data and then dump them. + * Sorting is necessary as it facilitates sequential read during load. + */ +static uint32 +dump_now(void) +{ + static char transient_dump_file_path[MAXPGPATH]; + uint32 i; + int ret, + buflen; + uint32 num_blocks; + BlockInfoRecord *block_info_array; + BufferDesc *bufHdr; + int fd; + char buf[1024]; + + block_info_array = + (BlockInfoRecord *) palloc(sizeof(BlockInfoRecord) * NBuffers); + + for (num_blocks = 0, i = 0; i < NBuffers; i++) + { + uint32 buf_state; + + /* + * In case of a SIGHUP, just reload the configuration. + */ + if (got_sighup) + { + got_sighup = false; + ProcessConfigFile(PGC_SIGHUP); + } + + /* Have we been asked to stop dump? */ + if (dump_interval == AT_PWARM_OFF) + { + free(block_info_array); + return 0; + } + + bufHdr = GetBufferDescriptor(i); + + /* lock each buffer header before inspecting. 
*/ + buf_state = LockBufHdr(bufHdr); + + if (buf_state & BM_TAG_VALID) + { + block_info_array[num_blocks].database = bufHdr->tag.rnode.dbNode; + block_info_array[num_blocks].spcnode = bufHdr->tag.rnode.spcNode; + block_info_array[num_blocks].filenode = bufHdr->tag.rnode.relNode; + block_info_array[num_blocks].forknum = bufHdr->tag.forkNum; + block_info_array[num_blocks].blocknum = bufHdr->tag.blockNum; + ++num_blocks; + } + + UnlockBufHdr(bufHdr, buf_state); + } + + snprintf(transient_dump_file_path, MAXPGPATH, "%s.%d", AUTOPREWARM_FILE, + MyProcPid); + + fd = OpenTransientFile(transient_dump_file_path, + O_CREAT | O_WRONLY | O_TRUNC, 0666); + if (fd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open \"%s\": %m", AUTOPREWARM_FILE))); + + buflen = sprintf(buf, "<<%u>>\n", num_blocks); + if (write(fd, buf, buflen) < buflen) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("autoprewarm: error writing to \"%s\" : %m", + AUTOPREWARM_FILE))); + + for (i = 0; i < num_blocks; i++) + { + /* + * In case of a SIGHUP, just reload the configuration. + */ + if (got_sighup) + { + got_sighup = false; + ProcessConfigFile(PGC_SIGHUP); + } + + /* Have we been asked to stop dump? */ + if (dump_interval == AT_PWARM_OFF) + { + pfree(block_info_array); + CloseTransientFile(fd); + unlink(transient_dump_file_path); + return 0; + } + + buflen = sprintf(buf, "%u,%u,%u,%u,%u\n", + block_info_array[i].database, + block_info_array[i].spcnode, + block_info_array[i].filenode, + (uint32) block_info_array[i].forknum, + block_info_array[i].blocknum); + + if (write(fd, buf, buflen) < buflen) + { + pfree(block_info_array); + CloseTransientFile(fd); + unlink(transient_dump_file_path); + ereport(ERROR, + (errcode_for_file_access(), + errmsg("error writing to \"%s\" : %m", + AUTOPREWARM_FILE))); + } + } + + pfree(block_info_array); + + /* + * rename transient_dump_file_path to AUTOPREWARM_FILE to make things + * permanent. 
+ */ + ret = CloseTransientFile(fd); + if (ret != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("error closing \"%s\" : %m", + transient_dump_file_path))); + (void) durable_rename(transient_dump_file_path, AUTOPREWARM_FILE, ERROR); + + ereport(LOG, (errmsg("saved metadata info of %d blocks", num_blocks))); + return num_blocks; +} + +/* + * dump_block_info_periodically - at regular intervals, which is defined by GUC + * dump_interval, dump the info of blocks which are present in buffer pool. + */ +void +dump_block_info_periodically(void) +{ + TimestampTz last_dump_time = GetCurrentTimestamp(); + + while (!got_sigterm) + { + int rc; + struct timeval nap; + + nap.tv_sec = AT_PWARM_DEFAULT_DUMP_INTERVAL; + nap.tv_usec = 0; + + /* Has been set not to dump. Nothing more to do. */ + if (dump_interval == AT_PWARM_OFF) + return; + + if (dump_interval > AT_PWARM_DUMP_AT_SHUTDOWN_ONLY) + { + TimestampTz current_time = GetCurrentTimestamp(); + + if (TimestampDifferenceExceeds(last_dump_time, + current_time, + (dump_interval * 1000))) + { + dump_now(); + if (got_sigterm) + return; /* got shutdown signal during or right after a + * dump. And, I think better to return now. */ + last_dump_time = GetCurrentTimestamp(); + nap.tv_sec = dump_interval; + nap.tv_usec = 0; + } + else + { + long secs; + int usecs; + + TimestampDifference(last_dump_time, current_time, + &secs, &usecs); + nap.tv_sec = dump_interval - secs; + nap.tv_usec = 0; + } + } + + ResetLatch(&MyProc->procLatch); + rc = WaitLatch(&MyProc->procLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH, + (nap.tv_sec * 1000L) + (nap.tv_usec / 1000L), + PG_WAIT_EXTENSION); + + if (rc & WL_POSTMASTER_DEATH) + proc_exit(1); + + /* + * In case of a SIGHUP, just reload the configuration. + */ + if (got_sighup) + { + got_sighup = false; + ProcessConfigFile(PGC_SIGHUP); + } + } + + /* One last block meta info dump while postmaster shutdown. 
*/ + if (dump_interval != AT_PWARM_OFF) + dump_now(); +} + +/* + * autoprewarm_main -- the main entry point of autoprewarm bgworker process. + */ +void +autoprewarm_main(Datum main_arg) +{ + AutoPrewarmTask next_task; + + /* Establish signal handlers before unblocking signals. */ + pqsignal(SIGTERM, apw_sigterm_handler); + pqsignal(SIGHUP, apw_sighup_handler); + pqsignal(SIGUSR1, apw_sigusr1_handler); + + /* We're now ready to receive signals */ + BackgroundWorkerUnblockSignals(); + + next_task = get_autoprewarm_task(DatumGetInt32(main_arg)); + + ereport(LOG, (errmsg("autoprewarm has started"))); + + /* + * **** perform autoprewarm's next task **** + */ + if (next_task == TASK_PREWARM_BUFFERPOOL) + { + prewarm_buffer_pool(); + + /* prewarm is done lets move to TASK_DUMP_BUFFERPOOL_INFO. */ + state->current_task = TASK_DUMP_BUFFERPOOL_INFO; + next_task = TASK_DUMP_BUFFERPOOL_INFO; + } + + if (next_task == TASK_DUMP_BUFFERPOOL_INFO) + { + dump_block_info_periodically(); + + /* + * down grade to TASK_DUMP_IMMEDIATE_ONCE so others can start + * TASK_DUMP_BUFFERPOOL_INFO + */ + state->current_task = TASK_DUMP_IMMEDIATE_ONCE; + } + + ereport(LOG, (errmsg("autoprewarm shutting down"))); +} + +/* ============================================================================ + * ============= extension's entry functions/utilities =================== + * ============================================================================ + */ + +/* Register autoprewarm load bgworker. 
*/ +static void +setup_autoprewarm(BackgroundWorker *autoprewarm, const char *worker_name, + const char *worker_function, Datum main_arg, int restart_time, + int extra_flags) +{ + MemSet(autoprewarm, 0, sizeof(BackgroundWorker)); + autoprewarm->bgw_flags = BGWORKER_SHMEM_ACCESS | extra_flags; + + /* Register the autoprewarm background worker */ + autoprewarm->bgw_start_time = BgWorkerStart_ConsistentState; + autoprewarm->bgw_restart_time = restart_time; + strcpy(autoprewarm->bgw_library_name, "pg_prewarm"); + strcpy(autoprewarm->bgw_function_name, worker_function); + strncpy(autoprewarm->bgw_name, worker_name, BGW_MAXLEN); + autoprewarm->bgw_main_arg = main_arg; +} + +/* Extension's entry point. */ +void +_PG_init(void) +{ + BackgroundWorker prewarm_worker; + + /* Define custom GUC variables. */ + if (process_shared_preload_libraries_in_progress) + DefineCustomBoolVariable("pg_prewarm.autoprewarm", + "Enable/Disable auto-prewarm feature.", + NULL, + &autoprewarm, + true, + PGC_POSTMASTER, + 0, + NULL, + NULL, + NULL); + + DefineCustomIntVariable("pg_prewarm.dump_interval", + "Sets the maximum time between two buffer pool dumps", + "If set to Zero, timer based dumping is disabled." + " If set to -1, stops the running autoprewarm.", + &dump_interval, + AT_PWARM_DEFAULT_DUMP_INTERVAL, + AT_PWARM_OFF, INT_MAX / 1000, + PGC_SIGHUP, + GUC_UNIT_S, + NULL, + NULL, + NULL); + + EmitWarningsOnPlaceholders("pg_prewarm"); + + /* if not run as a preloaded library, nothing more to do here! */ + if (!process_shared_preload_libraries_in_progress) + return; + + /* Request additional shared resources */ + RequestAddinShmemSpace(MAXALIGN(sizeof(AutoPrewarmSharedState))); + RequestNamedLWLockTranche("pg_autoprewarm", 1); + + /* Has been set not to start autoprewarm bgworker. Nothing more to do. */ + if (!autoprewarm) + return; + + /* Register autoprewarm load. 
*/ + setup_autoprewarm(&prewarm_worker, "autoprewarm", "autoprewarm_main", + Int32GetDatum(TASK_PREWARM_BUFFERPOOL), 0, 0); + RegisterBackgroundWorker(&prewarm_worker); +} + +/* + * Dynamically launch an autoprewarm dump worker. + */ +static pid_t +autoprewarm_dump_launcher(void) +{ + BackgroundWorker worker; + BackgroundWorkerHandle *handle; + BgwHandleStatus status; + pid_t pid; + + setup_autoprewarm(&worker, "autoprewarm", "autoprewarm_main", + Int32GetDatum(TASK_DUMP_BUFFERPOOL_INFO), 0, 0); + + /* set bgw_notify_pid so that we can use WaitForBackgroundWorkerStartup */ + worker.bgw_notify_pid = MyProcPid; + + if (!RegisterDynamicBackgroundWorker(&worker, &handle)) + { + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_RESOURCES), + errmsg("registering dynamic bgworker \"autoprewarm\" failed"), + errhint("Consider increasing configuration parameter " + "\"max_worker_processes\"."))); + } + + status = WaitForBackgroundWorkerStartup(handle, &pid); + if (status == BGWH_STOPPED) + { + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_RESOURCES), + errmsg("could not start autoprewarm dump bgworker"), + errhint("More details may be available in the server log."))); + } + + if (status == BGWH_POSTMASTER_DIED) + { + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_RESOURCES), + errmsg("cannot start bgworker autoprewarm without postmaster"), + errhint("Kill all remaining database processes and restart the database."))); + } + + Assert(status == BGWH_STARTED); + return pid; +} + +/* + * The C-Language entry function to launch autoprewarm dump bgworker. + */ +Datum +launch_autoprewarm_dump(PG_FUNCTION_ARGS) +{ + pid_t pid; + + /* Has been set not to dump. Nothing more to do. */ + if (dump_interval == AT_PWARM_OFF) + PG_RETURN_NULL(); + + pid = autoprewarm_dump_launcher(); + PG_RETURN_INT32(pid); +} + +/* + * The C-Language entry function to dump immediately. 
+ */ +Datum +autoprewarm_dump_now(PG_FUNCTION_ARGS) +{ + AutoPrewarmTask next_task; + + /* dump only if prewarm is not in progress. */ + next_task = get_autoprewarm_task(TASK_DUMP_IMMEDIATE_ONCE); + if (next_task == TASK_DUMP_IMMEDIATE_ONCE) + PG_RETURN_INT64(dump_now()); + PG_RETURN_NULL(); +} diff --git a/contrib/pg_prewarm/autoprewarm.h b/contrib/pg_prewarm/autoprewarm.h new file mode 100644 index 0000000..4220fc2 --- /dev/null +++ b/contrib/pg_prewarm/autoprewarm.h @@ -0,0 +1,35 @@ +/* + * contrib/pg_prewarm/autoprewarm.h + */ +#ifndef __AUTOPREWARM_H__ +#define __AUTOPREWARM_H__ + +#include "postgres.h" +#include + +/* These are always necessary for a bgworker. */ +#include "miscadmin.h" +#include "postmaster/bgworker.h" +#include "storage/ipc.h" +#include "storage/latch.h" +#include "storage/lwlock.h" +#include "storage/proc.h" +#include "storage/shmem.h" + +/* These are necessary for prewarm utilities. */ +#include "access/heapam.h" +#include "access/xact.h" +#include "catalog/pg_class.h" +#include "catalog/pg_type.h" +#include "pgstat.h" +#include "storage/buf_internals.h" +#include "storage/dsm.h" +#include "storage/smgr.h" +#include "utils/acl.h" +#include "utils/guc.h" +#include "utils/memutils.h" +#include "utils/rel.h" +#include "utils/relfilenodemap.h" +#include "utils/resowner.h" + +#endif /* __AUTOPREWARM_H__ */ diff --git a/contrib/pg_prewarm/pg_prewarm--1.1--1.2.sql b/contrib/pg_prewarm/pg_prewarm--1.1--1.2.sql new file mode 100644 index 0000000..6c35fb7 --- /dev/null +++ b/contrib/pg_prewarm/pg_prewarm--1.1--1.2.sql @@ -0,0 +1,14 @@ +/* contrib/pg_prewarm/pg_prewarm--1.0--1.1.sql */ + +-- complain if script is sourced in psql, rather than via ALTER EXTENSION +\echo Use "ALTER EXTENSION pg_prewarm UPDATE TO '1.2'" to load this file. 
\quit + +CREATE FUNCTION launch_autoprewarm_dump() +RETURNS pg_catalog.int4 STRICT +AS 'MODULE_PATHNAME', 'launch_autoprewarm_dump' +LANGUAGE C; + +CREATE FUNCTION autoprewarm_dump_now() +RETURNS pg_catalog.int8 STRICT +AS 'MODULE_PATHNAME', 'autoprewarm_dump_now' +LANGUAGE C; diff --git a/contrib/pg_prewarm/pg_prewarm.control b/contrib/pg_prewarm/pg_prewarm.control index cf2fb92..40e3add 100644 --- a/contrib/pg_prewarm/pg_prewarm.control +++ b/contrib/pg_prewarm/pg_prewarm.control @@ -1,5 +1,5 @@ # pg_prewarm extension comment = 'prewarm relation data' -default_version = '1.1' +default_version = '1.2' module_pathname = '$libdir/pg_prewarm' relocatable = true diff --git a/doc/src/sgml/pgprewarm.sgml b/doc/src/sgml/pgprewarm.sgml index c090401..ab5bf42 100644 --- a/doc/src/sgml/pgprewarm.sgml +++ b/doc/src/sgml/pgprewarm.sgml @@ -10,7 +10,9 @@ The pg_prewarm module provides a convenient way to load relation data into either the operating system buffer cache - or the PostgreSQL buffer cache. + or the PostgreSQL buffer cache. Additionally, an + automatic prewarming of the server buffers is supported whenever the server + restarts. @@ -55,6 +57,102 @@ pg_prewarm(regclass, mode text default 'buffer', fork text default 'main', cache. For these reasons, prewarming is typically most useful at startup, when caches are largely empty. + + +launch_autoprewarm_dump() RETURNS int4 + + + + This is a SQL callable function to launch the autoprewarm + worker to dump the buffer pool information at regular interval. In a server, + we can only run one autoprewarm worker so if worker sees + another existing worker it will exit immediately. The return value is pid of + the worker which has been launched. + + + +autoprewarm_dump_now() RETURNS int8 + + + + This is a SQL callable function to dump buffer pool information immediately + once by a backend. This can work in parallel + with the autoprewarm worker while it is dumping. + The return value is the number of blocks info dumped. 
+ + + + + autoprewarm + + + A bgworker which automatically records information about blocks which were + present in buffer pool before server shutdown and then prewarms the buffer + pool upon server restart with those blocks. + + + + When the shared library pg_prewarm is preloaded via shared_preload_libraries + in postgresql.conf, + a bgworker autoprewarm is launched immediately after the + server has reached a consistent state. The bgworker will start loading blocks + recorded in $PGDATA/autoprewarm.blocks until there is no + free buffer left in the buffer pool. This way we do not replace any new + blocks which were loaded either by the recovery process or the querying + clients. + + + + Once the autoprewarm bgworker has completed its prewarm + task, it will start a new task to periodically dump the information about + blocks which are currently in shared buffer pool. Upon next server restart, + the bgworker will prewarm the buffer pool by loading those blocks. The GUC + pg_prewarm.dump_interval will control the dumping activity + of the bgworker. + + + + + Configuration Parameters + + + + + pg_prewarm.autoprewarm (boolean) + + pg_prewarm.autoprewarm configuration parameter + + + + + This is valid only for autoprewarm. An autoprewarm + worker will only be started if this variable is set to on. + The default value is on. + + + + + + + + + pg_prewarm.dump_interval (int) + + pg_prewarm.dump_interval configuration parameter + + + + + This is valid only for autoprewarm. The interval, in + seconds, between two dumps of the buffer pool's block information. The default + is 300 seconds. It also takes special values. If set to 0 then timer + based dump is disabled, it dumps only while the server is shutting down. + If set to -1, the running autoprewarm will be stopped. 
+ + + + + diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c index 5d0a636..06a34a7 100644 --- a/src/backend/storage/buffer/freelist.c +++ b/src/backend/storage/buffer/freelist.c @@ -169,6 +169,23 @@ ClockSweepTick(void) } /* + * have_free_buffer -- a lockless check to see if there is a free buffer in + * buffer pool. + * + * If the result is true that will become stale once free buffers are moved out + * by other operations, so the caller who strictly want to use a free buffer + * should not call this. + */ +bool +have_free_buffer() +{ + if (StrategyControl->firstFreeBuffer >= 0) + return true; + else + return false; +} + +/* * StrategyGetBuffer * * Called by the bufmgr to get the next candidate buffer to use in diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h index ff99f6b..ab04bd9 100644 --- a/src/include/storage/buf_internals.h +++ b/src/include/storage/buf_internals.h @@ -317,6 +317,7 @@ extern void StrategyNotifyBgWriter(int bgwprocno); extern Size StrategyShmemSize(void); extern void StrategyInitialize(bool init); +extern bool have_free_buffer(void); /* buf_table.c */ extern Size BufTableShmemSize(int size); diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index eaa6d32..c6fa86a 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -138,6 +138,8 @@ AttrDefault AttrNumber AttributeOpts AuthRequest +AutoPrewarmSharedState +AutoPrewarmTask AutoVacOpts AutoVacuumShmemStruct AutoVacuumWorkItem @@ -214,10 +216,12 @@ BitmapOr BitmapOrPath BitmapOrState Bitmapset +BlkType BlobInfo Block BlockId BlockIdData +BlockInfoRecord BlockNumber BlockSampler BlockSamplerData @@ -2869,6 +2873,7 @@ pos_trgm post_parse_analyze_hook_type pqbool pqsigfunc +prewarm_elem printQueryOpt printTableContent printTableFooter