*** a/src/backend/postmaster/autovacuum.c
--- b/src/backend/postmaster/autovacuum.c
***************
*** 223,231 **** typedef struct WorkerInfoData
  	Oid			wi_tableoid;
  	PGPROC	   *wi_proc;
  	TimestampTz wi_launchtime;
! 	int			wi_cost_delay;
! 	int			wi_cost_limit;
  	int			wi_cost_limit_base;
  } WorkerInfoData;
  
  typedef struct WorkerInfoData *WorkerInfo;
--- 223,234 ----
  	Oid			wi_tableoid;
  	PGPROC	   *wi_proc;
  	TimestampTz wi_launchtime;
! 	/* the "base" values are the configured values */
  	int			wi_cost_limit_base;
+ 	int			wi_cost_delay_base;
+ 	/* these are the ones actually in effect, considering balancing */
+ 	int			wi_cost_limit;
+ 	int			wi_cost_delay;
  } WorkerInfoData;
  
  typedef struct WorkerInfoData *WorkerInfo;
***************
*** 248,253 **** typedef enum
--- 251,257 ----
   *
   * av_signal		set by other processes to indicate various conditions
   * av_launcherpid	the PID of the autovacuum launcher
+  * av_vacuum_cost_*	globally configured values for the cost_delay feature
   * av_freeWorkers	the WorkerInfo freelist
   * av_runningWorkers the WorkerInfo non-free queue
   * av_startingWorker pointer to WorkerInfo currently being started (cleared by
***************
*** 261,266 **** typedef struct
--- 265,272 ----
  {
  	sig_atomic_t av_signal[AutoVacNumSignals];
  	pid_t		av_launcherpid;
+ 	int			av_vacuum_cost_delay;
+ 	int			av_vacuum_cost_limit;
  	dlist_head	av_freeWorkers;
  	dlist_head	av_runningWorkers;
  	WorkerInfo	av_startingWorker;
***************
*** 296,301 **** static List *get_database_list(void);
--- 302,309 ----
  static void rebuild_database_list(Oid newdb);
  static int	db_comparator(const void *a, const void *b);
  static void autovac_balance_cost(void);
+ static int choose_vacuum_cost_delay(AutoVacOpts *avopts);
+ static int choose_vacuum_cost_limit(AutoVacOpts *avopts);
  
  static void do_autovacuum(void);
  static void FreeWorkerInfo(int code, Datum arg);
***************
*** 1717,1722 **** FreeWorkerInfo(int code, Datum arg)
--- 1725,1731 ----
  		MyWorkerInfo->wi_proc = NULL;
  		MyWorkerInfo->wi_launchtime = 0;
  		MyWorkerInfo->wi_cost_delay = 0;
+ 		MyWorkerInfo->wi_cost_delay_base = 0;
  		MyWorkerInfo->wi_cost_limit = 0;
  		MyWorkerInfo->wi_cost_limit_base = 0;
  		dlist_push_head(&AutoVacuumShmem->av_freeWorkers,
***************
*** 1742,1749 **** AutoVacuumUpdateDelay(void)
--- 1751,1760 ----
  {
  	if (MyWorkerInfo)
  	{
+ 		LWLockAcquire(AutovacuumLock, LW_SHARED);
  		VacuumCostDelay = MyWorkerInfo->wi_cost_delay;
  		VacuumCostLimit = MyWorkerInfo->wi_cost_limit;
+ 		LWLockRelease(AutovacuumLock);
  	}
  }
  
***************
*** 1756,1830 **** AutoVacuumUpdateDelay(void)
  static void
  autovac_balance_cost(void)
  {
- 	/*
- 	 * The idea here is that we ration out I/O equally.  The amount of I/O
- 	 * that a worker can consume is determined by cost_limit/cost_delay, so we
- 	 * try to equalize those ratios rather than the raw limit settings.
- 	 *
- 	 * note: in cost_limit, zero also means use value from elsewhere, because
- 	 * zero is not a valid value.
- 	 */
- 	int			vac_cost_limit = (autovacuum_vac_cost_limit > 0 ?
- 								  autovacuum_vac_cost_limit : VacuumCostLimit);
- 	int			vac_cost_delay = (autovacuum_vac_cost_delay >= 0 ?
- 								  autovacuum_vac_cost_delay : VacuumCostDelay);
- 	double		cost_total;
- 	double		cost_avail;
  	dlist_iter	iter;
! 
! 	/* not set? nothing to do */
! 	if (vac_cost_limit <= 0 || vac_cost_delay <= 0)
  		return;
  
! 	/* caculate the total base cost limit of active workers */
! 	cost_total = 0.0;
  	dlist_foreach(iter, &AutoVacuumShmem->av_runningWorkers)
  	{
  		WorkerInfo	worker = dlist_container(WorkerInfoData, wi_links, iter.cur);
  
! 		if (worker->wi_proc != NULL &&
! 			worker->wi_cost_limit_base > 0 && worker->wi_cost_delay > 0)
! 			cost_total +=
! 				(double) worker->wi_cost_limit_base / worker->wi_cost_delay;
  	}
- 	/* there are no cost limits -- nothing to do */
- 	if (cost_total <= 0)
- 		return;
  
  	/*
! 	 * Adjust cost limit of each active worker to balance the total of cost
! 	 * limit to autovacuum_vacuum_cost_limit.
  	 */
- 	cost_avail = (double) vac_cost_limit / vac_cost_delay;
  	dlist_foreach(iter, &AutoVacuumShmem->av_runningWorkers)
  	{
  		WorkerInfo	worker = dlist_container(WorkerInfoData, wi_links, iter.cur);
  
! 		if (worker->wi_proc != NULL &&
! 			worker->wi_cost_limit_base > 0 && worker->wi_cost_delay > 0)
  		{
- 			int			limit = (int)
- 			(cost_avail * worker->wi_cost_limit_base / cost_total);
  
! 			/*
! 			 * We put a lower bound of 1 on the cost_limit, to avoid division-
! 			 * by-zero in the vacuum code.  Also, in case of roundoff trouble
! 			 * in these calculations, let's be sure we don't ever set
! 			 * cost_limit to more than the base value.
! 			 */
! 			worker->wi_cost_limit = Max(Min(limit,
! 											worker->wi_cost_limit_base),
! 										1);
! 
! 			elog(DEBUG2, "autovac_balance_cost(pid=%u db=%u, rel=%u, cost_limit=%d, cost_limit_base=%d, cost_delay=%d)",
! 				 worker->wi_proc->pid, worker->wi_dboid, worker->wi_tableoid,
! 				 worker->wi_cost_limit, worker->wi_cost_limit_base,
! 				 worker->wi_cost_delay);
  		}
  	}
  }
  
  /*
   * get_database_list
   *		Return a list of all databases found in pg_database.
   *
--- 1767,1972 ----
  static void
  autovac_balance_cost(void)
  {
  	dlist_iter	iter;
! 	int			num_regular_workers = 0;
! 	int			num_fast_workers = 0;
! 	float4		global_equiv_delay;
! 	float4		regular_threshold;
! 	float4		fast_equiv_delay = 0;
! 	float4		regular_total_equiv_delay = 0;
! 	float4		fast_total_equiv_delay = 0;
! 
! 	/* not set in this worker? nothing to do */
! 	if (MyWorkerInfo && (MyWorkerInfo->wi_cost_limit_base <= 0 ||
! 						 MyWorkerInfo->wi_cost_delay_base <= 0))
  		return;
  
! 	AutoVacuumShmem->av_vacuum_cost_limit = choose_vacuum_cost_limit(NULL);
! 	AutoVacuumShmem->av_vacuum_cost_delay = choose_vacuum_cost_delay(NULL);
! 
! 	/* cost delay disabled globally? nothing to balance, no zero divisor */
! 	if (AutoVacuumShmem->av_vacuum_cost_limit <= 0 ||
! 		AutoVacuumShmem->av_vacuum_cost_delay <= 0)
! 		return;
! 
! 	/*
! 	 * We use a metric we call "equivalent delay", equal to cost_limit divided
! 	 * by cost_delay, to enable a reasonably simple algorithm to distribute
! 	 * vacuum I/O bandwidth among all active workers.  The "global" equiv delay
! 	 * is the value computed from the GUC parameters in effect for this
! 	 * database.  This value lets us split workers in two classes: regular workers
! 	 * are those that have an equiv delay less than or equal to global equiv delay,
! 	 * while fast workers are those that have a value greater than global equiv
! 	 * delay.
! 	 */
! 	global_equiv_delay = (float4) AutoVacuumShmem->av_vacuum_cost_limit /
! 		AutoVacuumShmem->av_vacuum_cost_delay;
! 
! 	/* remember the class boundary; global_equiv_delay is adjusted below */
! 	regular_threshold = global_equiv_delay;
! 
! 	/*
! 	 * Find the sum of equiv delay values in each class; also find the equiv
! 	 * delay of the fastest among all the fast workers.
! 	 */
  	dlist_foreach(iter, &AutoVacuumShmem->av_runningWorkers)
  	{
  		WorkerInfo	worker = dlist_container(WorkerInfoData, wi_links, iter.cur);
+ 		float4		this_equiv_delay;
+ 
+ 		/* ignore inactive workers, and workers not using cost_delay */
+ 		if (worker->wi_proc == NULL ||
+ 			worker->wi_cost_limit_base <= 0 || worker->wi_cost_delay_base <= 0)
+ 			continue;
+ 
+ 		this_equiv_delay = (float4) worker->wi_cost_limit_base /
+ 			worker->wi_cost_delay_base;
  
! 		if (this_equiv_delay <= regular_threshold)
! 		{
! 			/* A regular worker -- count it and add its equiv delay as such */
! 			num_regular_workers++;
! 			regular_total_equiv_delay += this_equiv_delay;
! 		}
! 		else
! 		{
! 			/*
! 			 * A fast worker.  Count and add its equiv delay in a separate
! 			 * total; we also use these to determine the fast_equiv_delay which
! 			 * is the value which we will distribute to all of them, separately
! 			 * from global_equiv_delay.
! 			 */
! 			if (num_fast_workers == 0)
! 				fast_equiv_delay = this_equiv_delay;
! 			else if (this_equiv_delay > fast_equiv_delay)
! 				fast_equiv_delay = this_equiv_delay;
! 			num_fast_workers++;
! 			fast_total_equiv_delay += this_equiv_delay;
! 		}
! 	}
! 
! 	/*
! 	 * We shouldn't actually distribute all of global equiv delay to
! 	 * regular workers and all of (fast equiv delay - global equiv delay) to
! 	 * fast workers; that makes fast workers much slower if the fast equiv
! 	 * delay is only slightly faster than regular workers.  For example,
! 	 * consider the scenario with one regular worker with equiv_delay=10 and
! 	 * one fast worker with equiv_delay=11; after subtraction the fast worker
! 	 * will have equiv_delay=1.  This part needs more fiddling to avoid this
! 	 * problem.
! 	 *
! 	 * Perhaps the way to solve this is to consider that all workers (including
! 	 * fast ones) get their fraction of global delay, and then fast workers get
! 	 * their pro-rated share of fast equiv delay *added* to that.  This would
! 	 * make regular workers slower.
! 	 *
! 	 * Idea is to subtract only a fraction of global_equiv_delay from
! 	 * fast_equiv_delay here, rather than all of it (and of course decrease
! 	 * global_equiv_delay accordingly).  The fraction would be variable: if
! 	 * fast_equiv_delay is much higher than global_equiv_delay, then subtract
! 	 * all of it; if both values are close enough, subtract a pro-rated fraction
! 	 * according to the number of workers in each class.
! 	 *
! 	 * Nothing can be shifted between the classes unless both of them are
! 	 * populated, which also keeps the per-class divisors below nonzero.
! 	 */
! 	if (num_regular_workers > 0 && num_fast_workers > 0)
! 	{
! 		float4		diff_equiv_delay = fast_equiv_delay - global_equiv_delay;
! 
! 		if (diff_equiv_delay < global_equiv_delay)
! 		{
! 			fast_equiv_delay -= diff_equiv_delay / num_fast_workers;
! 			global_equiv_delay -= diff_equiv_delay / num_regular_workers;
! 		}
! 		else
! 			fast_equiv_delay -= global_equiv_delay;
  	}
  
  	/*
! 	 * Now we have all parameters we need; compute the values for individual
! 	 * workers.
  	 */
  	dlist_foreach(iter, &AutoVacuumShmem->av_runningWorkers)
  	{
  		WorkerInfo	worker = dlist_container(WorkerInfoData, wi_links, iter.cur);
+ 		float4		this_equiv_delay_base;
+ 		float4		this_equiv_delay_frac;
+ 
+ 		/* again, ignore inactive workers, and workers not using cost_delay */
+ 		if (worker->wi_proc == NULL ||
+ 			worker->wi_cost_limit_base <= 0 || worker->wi_cost_delay_base <= 0)
+ 			continue;
  
! 		this_equiv_delay_base = (float4) worker->wi_cost_limit_base /
! 			worker->wi_cost_delay_base;
! 
! 		/*
! 		 * If it's a regular worker, use pro-rated fraction of global_equiv_delay;
! 		 * otherwise, use pro-rated fraction of fast_equiv_delay
! 		 */
! 		if (this_equiv_delay_base <= regular_threshold)
  		{
  
! 			/* A regular worker; use a pro-rated fraction of global_equiv_delay */
! 			this_equiv_delay_frac =
! 				this_equiv_delay_base * global_equiv_delay / regular_total_equiv_delay;
  		}
+ 		else
+ 		{
+ 			/* a fast worker: use a pro-rated fraction of fast_equiv_delay */
+ 			this_equiv_delay_frac =
+ 				this_equiv_delay_base * fast_equiv_delay / fast_total_equiv_delay;
+ 		}
+ 
+ 		/*
+ 		 * Convert back into cost_limit and cost_delay values, and set them
+ 		 * into the worker's shmem struct.  We put a lower bound of 1 to the
+ 		 * cost_limit, to avoid a division-by-zero in the vacuum code.
+ 		 */
+ 		worker->wi_cost_limit = Max((int) (this_equiv_delay_frac *
+ 										   worker->wi_cost_delay_base),
+ 									1);
+ 		worker->wi_cost_delay = worker->wi_cost_delay_base;
+ 
+ 		elog(DEBUG2, "autovac_balance_cost(pid=%u db=%u, rel=%u, cost_limit=%d, cost_delay=%d)",
+ 			 worker->wi_proc->pid, worker->wi_dboid, worker->wi_tableoid,
+ 			 worker->wi_cost_limit, worker->wi_cost_delay);
  	}
  }
  
  /*
+  * Determine the vacuum_cost_delay value to use: if this is a table and it has
+  * reloptions, use the value from there; otherwise the autovacuum parameter,
+  * unless it's -1 in which case we use plain vacuum_cost_delay.
+  */
+ static int
+ choose_vacuum_cost_delay(AutoVacOpts *avopts)
+ {
+ 	if (avopts && avopts->vacuum_cost_delay >= 0)
+ 		return avopts->vacuum_cost_delay;
+ 	if (autovacuum_vac_cost_delay >= 0)
+ 		return autovacuum_vac_cost_delay;
+ 	return VacuumCostDelay;
+ }
+ 
+ /*
+  * Determine the vacuum_cost_limit value to use: if this is a table and it has
+  * reloptions, use the value from there; otherwise the autovacuum parameter,
+  * unless it's -1 or 0 in which case we use plain vacuum_cost_limit.
+  */
+ static int
+ choose_vacuum_cost_limit(AutoVacOpts *avopts)
+ {
+ 	/* 0 is an invalid value here, so avoid that */
+ 	if (avopts && avopts->vacuum_cost_limit > 0)
+ 		return avopts->vacuum_cost_limit;
+ 	if (autovacuum_vac_cost_limit > 0)
+ 		return autovacuum_vac_cost_limit;
+ 	return VacuumCostLimit;
+ }
+ 
+ /*
   * get_database_list
   *		Return a list of all databases found in pg_database.
   *
***************
*** 2202,2209 **** do_autovacuum(void)
  		Oid			relid = lfirst_oid(cell);
  		autovac_table *tab;
  		bool		skipit;
- 		int			stdVacuumCostDelay;
- 		int			stdVacuumCostLimit;
  		dlist_iter	iter;
  
  		CHECK_FOR_INTERRUPTS();
--- 2344,2349 ----
***************
*** 2272,2302 **** do_autovacuum(void)
  		MyWorkerInfo->wi_tableoid = relid;
  		LWLockRelease(AutovacuumScheduleLock);
  
- 		/*
- 		 * Remember the prevailing values of the vacuum cost GUCs.  We have to
- 		 * restore these at the bottom of the loop, else we'll compute wrong
- 		 * values in the next iteration of autovac_balance_cost().
- 		 */
- 		stdVacuumCostDelay = VacuumCostDelay;
- 		stdVacuumCostLimit = VacuumCostLimit;
- 
  		/* Must hold AutovacuumLock while mucking with cost balance info */
  		LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
  
  		/* advertise my cost delay parameters for the balancing algorithm */
  		MyWorkerInfo->wi_cost_delay = tab->at_vacuum_cost_delay;
  		MyWorkerInfo->wi_cost_limit = tab->at_vacuum_cost_limit;
  		MyWorkerInfo->wi_cost_limit_base = tab->at_vacuum_cost_limit;
  
  		/* do a balance */
  		autovac_balance_cost();
  
- 		/* set the active cost parameters from the result of that */
- 		AutoVacuumUpdateDelay();
- 
  		/* done */
  		LWLockRelease(AutovacuumLock);
  
  		/* clean up memory before each iteration */
  		MemoryContextResetAndDeleteChildren(PortalContext);
  
--- 2412,2435 ----
  		MyWorkerInfo->wi_tableoid = relid;
  		LWLockRelease(AutovacuumScheduleLock);
  
  		/* Must hold AutovacuumLock while mucking with cost balance info */
  		LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
  
  		/* advertise my cost delay parameters for the balancing algorithm */
  		MyWorkerInfo->wi_cost_delay = tab->at_vacuum_cost_delay;
+ 		MyWorkerInfo->wi_cost_delay_base = tab->at_vacuum_cost_delay;
  		MyWorkerInfo->wi_cost_limit = tab->at_vacuum_cost_limit;
  		MyWorkerInfo->wi_cost_limit_base = tab->at_vacuum_cost_limit;
  
  		/* do a balance */
  		autovac_balance_cost();
  
  		/* done */
  		LWLockRelease(AutovacuumLock);
  
+ 		/* set the active cost parameters from the result of that */
+ 		AutoVacuumUpdateDelay();
+ 
  		/* clean up memory before each iteration */
  		MemoryContextResetAndDeleteChildren(PortalContext);
  
***************
*** 2381,2390 **** deleted:
  		LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
  		MyWorkerInfo->wi_tableoid = InvalidOid;
  		LWLockRelease(AutovacuumLock);
- 
- 		/* restore vacuum cost GUCs for the next iteration */
- 		VacuumCostDelay = stdVacuumCostDelay;
- 		VacuumCostLimit = stdVacuumCostLimit;
  	}
  
  	/*
--- 2514,2519 ----
***************
*** 2532,2550 **** table_recheck_autovac(Oid relid, HTAB *table_toast_map,
  		 * defaults, autovacuum's own first and plain vacuum second.
  		 */
  
! 		/* -1 in autovac setting means use plain vacuum_cost_delay */
! 		vac_cost_delay = (avopts && avopts->vacuum_cost_delay >= 0)
! 			? avopts->vacuum_cost_delay
! 			: (autovacuum_vac_cost_delay >= 0)
! 			? autovacuum_vac_cost_delay
! 			: VacuumCostDelay;
! 
! 		/* 0 or -1 in autovac setting means use plain vacuum_cost_limit */
! 		vac_cost_limit = (avopts && avopts->vacuum_cost_limit > 0)
! 			? avopts->vacuum_cost_limit
! 			: (autovacuum_vac_cost_limit > 0)
! 			? autovacuum_vac_cost_limit
! 			: VacuumCostLimit;
  
  		/* these do not have autovacuum-specific settings */
  		freeze_min_age = (avopts && avopts->freeze_min_age >= 0)
--- 2661,2668 ----
  		 * defaults, autovacuum's own first and plain vacuum second.
  		 */
  
! 		vac_cost_delay = choose_vacuum_cost_delay(avopts);
! 		vac_cost_limit = choose_vacuum_cost_limit(avopts);
  
  		/* these do not have autovacuum-specific settings */
  		freeze_min_age = (avopts && avopts->freeze_min_age >= 0)
***************
*** 2935,2940 **** AutoVacuumShmemInit(void)
--- 3053,3060 ----
  		dlist_init(&AutoVacuumShmem->av_freeWorkers);
  		dlist_init(&AutoVacuumShmem->av_runningWorkers);
  		AutoVacuumShmem->av_startingWorker = NULL;
+ 		AutoVacuumShmem->av_vacuum_cost_limit = 0;
+ 		AutoVacuumShmem->av_vacuum_cost_delay = 0;
  
  		worker = (WorkerInfo) ((char *) AutoVacuumShmem +
  							   MAXALIGN(sizeof(AutoVacuumShmemStruct)));