Thread: PostgreSQL DB checkpoint error!

PostgreSQL DB checkpoint error!

From

Ashok kumar Mani

Date:

28 April 2020, 06:13:31

Classification: External

Hi Admin,

Database is getting hung every day at 5AM. I found the below message from log file , Could you please help me , should I need to increase the max_wal_size value ?

Why it say “incomplete startup packet” in the log.

Please someone help on this ?

020-04-28 01:45:12.494 GMT [2389] HINT: Consider increasing the configuration parameter "max_wal_size".

2020-04-28 01:45:26.860 GMT [2389] LOG: checkpoints are occurring too frequently (14 seconds apart)

2020-04-28 01:45:26.860 GMT [2389] HINT: Consider increasing the configuration parameter "max_wal_size".

2020-04-28 01:45:53.230 GMT [2389] LOG: checkpoints are occurring too frequently (27 seconds apart)

2020-04-28 01:45:53.230 GMT [2389] HINT: Consider increasing the configuration parameter "max_wal_size".

2020-04-28 01:46:01.630 GMT [16213] LOG: incomplete startup packet

2020-04-28 01:46:08.006 GMT [2389] LOG: checkpoints are occurring too frequently (15 seconds apart)

2020-04-28 01:46:08.006 GMT [2389] HINT: Consider increasing the configuration parameter "max_wal_size".

2020-04-28 01:46:33.860 GMT [2389] LOG: checkpoints are occurring too frequently (25 seconds apart)

2020-04-28 01:46:33.860 GMT [2389] HINT: Consider increasing the configuration parameter "max_wal_size".

2020-04-28 01:46:48.472 GMT [2389] LOG: checkpoints are occurring too frequently (15 seconds apart)

2020-04-28 01:46:48.472 GMT [2389] HINT: Consider increasing the configuration parameter "max_wal_size".

2020-04-28 01:47:01.661 GMT [16374] LOG: incomplete startup packet

2020-04-28 01:47:14.653 GMT [2389] LOG: checkpoints are occurring too frequently (26 seconds apart)

2020-04-28 01:47:14.653 GMT [2389] HINT: Consider increasing the configuration parameter "max_wal_size".

2020-04-28 01:47:28.988 GMT [2389] LOG: checkpoints are occurring too frequently (14 seconds apart)

2020-04-28 01:47:28.988 GMT [2389] HINT: Consider increasing the configuration parameter "max_wal_size".

2020-04-28 01:47:54.815 GMT [2389] LOG: checkpoints are occurring too frequently (26 seconds apart)

2020-04-28 01:47:54.815 GMT [2389] HINT: Consider increasing the configuration parameter "max_wal_size".

2020-04-28 01:48:01.692 GMT [16544] LOG: incomplete startup packet

2020-04-28 01:48:09.459 GMT [2389] LOG: checkpoints are occurring too frequently (15 seconds apart)

2020-04-28 01:48:09.459 GMT [2389] HINT: Consider increasing the configuration parameter "max_wal_size".

2020-04-28 01:48:35.780 GMT [2389] LOG: checkpoints are occurring too frequently (26 seconds apart)

2020-04-28 01:48:35.780 GMT [2389] HINT: Consider increasing the configuration parameter "max_wal_size".

2020-04-28 01:48:50.154 GMT [2389] LOG: checkpoints are occurring too frequently (15 seconds apart)

2020-04-28 01:48:50.154 GMT [2389] HINT: Consider increasing the configuration parameter "max_wal_size".

2020-04-28 01:49:01.726 GMT [16715] LOG: incomplete startup packet

2020-04-28 01:49:18.163 GMT [2389] LOG: checkpoints are occurring too frequently (28 seconds apart)

2020-04-28 01:49:18.163 GMT [2389] HINT: Consider increasing the configuration parameter "max_wal_size".

2020-04-28 01:49:32.931 GMT [2389] LOG: checkpoints are occurring too frequently (14 seconds apart)

2020-04-28 01:49:32.931 GMT [2389] HINT: Consider increasing the configuration parameter "max_wal_size".

2020-04-28 01:49:58.932 GMT [2389] LOG: checkpoints are occurring too frequently (26 seconds apart)

2020-04-28 01:49:58.932 GMT [2389] HINT: Consider increasing the configuration parameter "max_wal_size".

2020-04-28 01:50:01.770 GMT [16882] LOG: incomplete startup packet

2020-04-28 01:50:13.198 GMT [2389] LOG: checkpoints are occurring too frequently (15 seconds apart)

2020-04-28 01:50:13.198 GMT [2389] HINT: Consider increasing the configuration parameter "max_wal_size".

2020-04-28 01:50:38.503 GMT [2389] LOG: checkpoints are occurring too frequently (25 seconds apart)

2020-04-28 01:50:38.503 GMT [2389] HINT: Consider increasing the configuration parameter "max_wal_size".

2020-04-28 01:50:53.422 GMT [2389] LOG: checkpoints are occurring too frequently (15 seconds apart)

2020-04-28 01:50:53.422 GMT [2389] HINT: Consider increasing the configuration parameter "max_wal_size".

2020-04-28 01:51:01.800 GMT [17049] LOG: incomplete startup packet

2020-04-28 01:51:18.791 GMT [2389] LOG: checkpoints are occurring too frequently (25 seconds apart)

2020-04-28 01:51:18.791 GMT [2389] HINT: Consider increasing the configuration parameter "max_wal_size".

2020-04-28 01:51:32.977 GMT [2389] LOG: checkpoints are occurring too frequently (14 seconds apart)

2020-04-28 01:51:32.977 GMT [2389] HINT: Consider increasing the configuration parameter "max_wal_size".

2020-04-28 01:51:50.522 GMT [2389] LOG: checkpoints are occurring too frequently (18 seconds apart)

2020-04-28 01:51:50.522 GMT [2389] HINT: Consider increasing the configuration parameter "max_wal_size".

2020-04-28 01:52:01.829 GMT [17213] LOG: incomplete startup packet

2020-04-28 01:53:01.868 GMT [17380] LOG: incomplete startup packet

2020-04-28 01:54:01.905 GMT [17532] LOG: incomplete startup packet

2020-04-28 01:55:01.948 GMT [17698] LOG: incomplete startup packet

2020-04-28 01:56:01.979 GMT [17853] LOG: incomplete startup packet

2020-04-28 01:57:02.011 GMT [18013] LOG: incomplete startup packet

2020-04-28 01:58:01.040 GMT [18174] LOG: incomplete startup packet

2020-04-28 01:59:01.070 GMT [18331] LOG: incomplete startup packet

2020-04-28 02:00:01.102 GMT [18492] LOG: incomplete startup packet

2020-04-28 02:01:01.149 GMT [18662] LOG: incomplete startup packet

2020-04-28 02:02:01.179 GMT [18841] LOG: incomplete startup packet

2020-04-28 02:03:01.207 GMT [18996] LOG: incomplete startup packet

2020-04-28 02:04:01.234 GMT [19158] LOG: incomplete startup packet

2020-04-28 02:05:01.266 GMT [19318] LOG: incomplete startup packet

2020-04-28 02:06:01.294 GMT [19472] LOG: incomplete startup packet

2020-04-28 02:07:01.323 GMT [19630] LOG: incomplete startup packet

2020-04-28 02:08:01.353 GMT [19787] LOG: incomplete startup packet

2020-04-28 02:08:19.310 GMT [2389] LOG: checkpoints are occurring too frequently (27 seconds apart)

2020-04-28 02:08:19.310 GMT [2389] HINT: Consider increasing the configuration parameter "max_wal_size".

2020-04-28 02:08:34.374 GMT [2389] LOG: checkpoints are occurring too frequently (15 seconds apart)

2020-04-28 02:08:34.374 GMT [2389] HINT: Consider increasing the configuration parameter "max_wal_size".

2020-04-28 02:09:00.597 GMT [2389] LOG: checkpoints are occurring too frequently (26 seconds apart)

2020-04-28 02:09:00.597 GMT [2389] HINT: Consider increasing the configuration parameter "max_wal_size".

2020-04-28 02:09:01.394 GMT [19950] LOG: incomplete startup packet

2020-04-28 02:09:15.417 GMT [2389] LOG: checkpoints are occurring too frequently (15 seconds apart)

2020-04-28 02:09:15.417 GMT [2389] HINT: Consider increasing the configuration parameter "max_wal_size".

2020-04-28 02:09:40.970 GMT [2389] LOG: checkpoints are occurring too frequently (25 seconds apart)

2020-04-28 02:09:40.970 GMT [2389] HINT: Consider increasing the configuration parameter "max_wal_size".

2020-04-28 02:09:55.793 GMT [2389] LOG: checkpoints are occurring too frequently (15 seconds apart)

2020-04-28 02:09:55.793 GMT [2389] HINT: Consider increasing the configuration parameter "max_wal_size".

2020-04-28 02:10:01.584 GMT [20189] LOG: incomplete startup packet

2020-04-28 02:10:21.860 GMT [2389] LOG: checkpoints are occurring too frequently (26 seconds apart)

2020-04-28 02:10:21.860 GMT [2389] HINT: Consider increasing the configuration parameter "max_wal_size".

2020-04-28 02:10:36.599 GMT [2389] LOG: checkpoints are occurring too frequently (15 seconds apart)

2020-04-28 02:10:36.599 GMT [2389] HINT: Consider increasing the configuration parameter "max_wal_size".

2020-04-28 02:11:01.299 GMT [20353] LOG: incomplete startup packet

2020-04-28 02:11:03.194 GMT [2389] LOG: checkpoints are occurring too frequently (27 seconds apart)

2020-04-28 02:11:03.194 GMT [2389] HINT: Consider increasing the configuration parameter "max_wal_size".

2020-04-28 02:11:17.932 GMT [2389] LOG: checkpoints are occurring too frequently (14 seconds apart)

2020-04-28 02:11:17.932 GMT [2389] HINT: Consider increasing the configuration parameter "max_wal_size".

2020-04-28 02:11:44.487 GMT [2389] LOG: checkpoints are occurring too frequently (27 seconds apart)

2020-04-28 02:11:44.487 GMT [2389] HINT: Consider increasing the configuration parameter "max_wal_size".

2020-04-28 02:11:58.902 GMT [2389] LOG: checkpoints are occurring too frequently (14 seconds apart)

2020-04-28 02:11:58.902 GMT [2389] HINT: Consider increasing the configuration parameter "max_wal_size".

2020-04-28 02:12:01.328 GMT [20507] LOG: incomplete startup packet

2020-04-28 02:12:25.409 GMT [2389] LOG: checkpoints are occurring too frequently (27 seconds apart)

2020-04-28 02:12:25.409 GMT [2389] HINT: Consider increasing the configuration parameter "max_wal_size".

2020-04-28 02:12:39.800 GMT [2389] LOG: checkpoints are occurring too frequently (14 seconds apart)

2020-04-28 02:12:39.800 GMT [2389] HINT: Consider increasing the configuration parameter "max_wal_size".

2020-04-28 02:13:01.358 GMT [20668] LOG: incomplete startup packet

2020-04-28 02:13:05.808 GMT [2389] LOG: checkpoints are occurring too frequently (26 seconds apart)

2020-04-28 02:13:05.808 GMT [2389] HINT: Consider increasing the configuration parameter "max_wal_size".

2020-04-28 02:13:20.788 GMT [2389] LOG: checkpoints are occurring too frequently (15 seconds apart)

2020-04-28 02:13:20.788 GMT [2389] HINT: Consider increasing the configuration parameter "max_wal_size".

2020-04-28 02:13:47.007 GMT [2389] LOG: checkpoints are occurring too frequently (27 seconds apart)

2020-04-28 02:13:47.007 GMT [2389] HINT: Consider increasing the configuration parameter "max_wal_size".

2020-04-28 02:14:01.387 GMT [20827] LOG: incomplete startup packet

2020-04-28 02:14:01.686 GMT [2389] LOG: checkpoints are occurring too frequently (14 seconds apart)

2020-04-28 02:14:01.686 GMT [2389] HINT: Consider increasing the configuration parameter "max_wal_size".

2020-04-28 02:14:27.337 GMT [2389] LOG: checkpoints are occurring too frequently (26 seconds apart)

2020-04-28 02:14:27.337 GMT [2389] HINT: Consider increasing the configuration parameter "max_wal_size".

2020-04-28 02:14:41.546 GMT [2389] LOG: checkpoints are occurring too frequently (14 seconds apart)

2020-04-28 02:14:41.546 GMT [2389] HINT: Consider increasing the configuration parameter "max_wal_size".

2020-04-28 02:15:01.415 GMT [20985] LOG: incomplete startup packet

2020-04-28 02:15:07.280 GMT [2389] LOG: checkpoints are occurring too frequently (26 seconds apart)

2020-04-28 02:15:07.280 GMT [2389] HINT: Consider increasing the configuration parameter "max_wal_size".

2020-04-28 02:15:21.654 GMT [2389] LOG: checkpoints are occurring too frequently (14 seconds apart)

2020-04-28 02:15:21.654 GMT [2389] HINT: Consider increasing the configuration parameter "max_wal_size".

2020-04-28 02:15:40.239 GMT [2389] LOG: checkpoints are occurring too frequently (19 seconds apart)

2020-04-28 02:15:40.239 GMT [2389] HINT: Consider increasing the configuration parameter "max_wal_size".

Best Wishes,

Ashokkumar Mani

Re: PostgreSQL DB checkpoint error!

From

"David G. Johnston"

Date:

28 April 2020, 06:26:57

There are no errors in the logs you provided, only “weak warnings”

On Monday, April 27, 2020, Ashok kumar Mani <amani@accelaero.com> wrote:

Database is getting hung every day at 5AM. I found the below message from log file , Could you please help me , should I need to increase the max_wal_size value ?

Probably not, you should probably look for insertion code that does high volume via the insert sql command without transactions and/or batching.

Why it say “incomplete startup packet” in the log.

Because you have a software client beginning but not completing a connection to the database. Usually its some form of monitoring.

David J.

RE: PostgreSQL DB checkpoint error!

From

Ashok kumar Mani

Date:

28 April 2020, 06:31:52

Classification: External

Hi David,

I am running cronjob at the same time which will do data pruning for zabbix database(psql). Please let me know if that warning is related to reindex and vacuum ?

^C-bash-4.2$ cat /Data/zabbix_hkp/scripts/data_Pruning.sh

psql -d zabbix -f /Data/zabbix_hkp/scripts/three_months_datapurge.sql 1>/Data/zabbix_hkp/scripts_log/data_purging_`date +%m%d%y`.log 2>data_cleaning_err_`date +%m%d%y`.log

-bash-4.2$ cat /Data/zabbix_hkp/scripts/three_months_datapurge.sql

-- Delete alerts which are older than 1 day -> tested on 8th april 2020

delete FROM alerts where age(to_timestamp(alerts.clock)) > interval '90 days';

VACUUM FULL alerts ;

REINDEX TABLE ALERTS;

delete FROM acknowledges where age(to_timestamp(acknowledges.clock)) > interval '90 days';

VACUUM FULL acknowledges;

REINDEX TABLE acknowledges;

-- Delete events which are older than 1 day -> tested on 8th april 2020

delete FROM events where age(to_timestamp(events.clock)) > interval '90 days';

VACUUM FULL events;

REINDEX TABLE events;

delete FROM history where age(to_timestamp(history.clock)) > interval '90 days';

VACUUM FULL history;

REINDEX TABLE history;

delete FROM history_uint where age(to_timestamp(history_uint.clock)) > interval '90 days' ;

VACUUM FULL history_uint;

REINDEX TABLE history_uint;

delete FROM history_str where age(to_timestamp(history_str.clock)) > interval '90 days' ;

VACUUM TABLE history_str;

REINDEX TABLE history_str;

delete FROM history_text where age(to_timestamp(history_text.clock)) > interval '90 days' ;

VACUUM FULL history_text;

REINDEX TABLE history_text;

delete FROM history_log where age(to_timestamp(history_log.clock)) > interval '90 days' ;

VACUUM FULL history_log;

REINDEX TABLE history_log;

delete FROM trends where age(to_timestamp(trends.clock)) > interval '90 days';

VACUUM FULL trends;

REINDEX TABLE trends;

delete FROM trends_uint where age(to_timestamp(trends_uint.clock)) > interval '90 days' ;

VACUUM FULL trends_uint;

REINDEX TABLE trends_uint;

-- Delete orphaned alerts entries

DELETE FROM alerts WHERE NOT EXISTS (SELECT 1 FROM actions WHERE alerts.actionid = actions.actionid);

DELETE FROM alerts WHERE NOT EXISTS (SELECT 1 FROM events WHERE alerts.eventid = events.eventid);

DELETE FROM alerts WHERE NOT userid IN (SELECT userid FROM users);

DELETE FROM alerts WHERE NOT mediatypeid IN (SELECT mediatypeid FROM media_type);

-- Delete orphaned application entries that no longer map back to a host

DELETE FROM applications WHERE NOT hostid IN (SELECT hostid FROM hosts);

-- Delete orphaned auditlog details (such as logins)

DELETE FROM auditlog_details WHERE NOT auditid IN (SELECT auditid FROM auditlog);

DELETE FROM auditlog WHERE NOT userid IN (SELECT userid FROM users);

-- Delete orphaned conditions

DELETE FROM conditions WHERE NOT actionid IN (SELECT actionid FROM actions);

-- Delete orphaned functions

DELETE FROM functions WHERE NOT itemid IN (SELECT itemid FROM items);

DELETE FROM functions WHERE NOT triggerid IN (SELECT triggerid FROM triggers);

-- Delete orphaned graph items

DELETE FROM graphs_items WHERE NOT graphid IN (SELECT graphid FROM graphs);

-- Delete orphaned host macro's

DELETE FROM hostmacro WHERE NOT hostid IN (SELECT hostid FROM hosts);

-- Delete orphaned item data

DELETE FROM items WHERE hostid NOT IN (SELECT hostid FROM hosts);

DELETE FROM items_applications WHERE applicationid NOT IN (SELECT applicationid FROM applications);

DELETE FROM items_applications WHERE itemid NOT IN (SELECT itemid FROM items);

-- Delete orphaned HTTP check data

DELETE FROM httpstep WHERE NOT httptestid IN (SELECT httptestid FROM httptest);

DELETE FROM httpstepitem WHERE NOT httpstepid IN (SELECT httpstepid FROM httpstep);

DELETE FROM httpstepitem WHERE NOT itemid IN (SELECT itemid FROM items);

DELETE FROM httptest WHERE applicationid NOT IN (SELECT applicationid FROM applications);

-- Delete orphaned maintenance data

DELETE FROM maintenances_groups WHERE maintenanceid NOT IN (SELECT maintenanceid FROM maintenances);

DELETE FROM maintenances_groups WHERE groupid NOT IN (SELECT groupid FROM groups);

DELETE FROM maintenances_hosts WHERE maintenanceid NOT IN (SELECT maintenanceid FROM maintenances);

DELETE FROM maintenances_hosts WHERE hostid NOT IN (SELECT hostid FROM hosts);

DELETE FROM maintenances_windows WHERE maintenanceid NOT IN (SELECT maintenanceid FROM maintenances);

DELETE FROM maintenances_windows WHERE timeperiodid NOT IN (SELECT timeperiodid FROM timeperiods);

-- Delete orphaned mappings

DELETE FROM mappings WHERE NOT valuemapid IN (SELECT valuemapid FROM valuemaps);

-- Delete orphaned media items

DELETE FROM media WHERE NOT userid IN (SELECT userid FROM users);

DELETE FROM media WHERE NOT mediatypeid IN (SELECT mediatypeid FROM media_type);

DELETE FROM rights WHERE NOT groupid IN (SELECT usrgrpid FROM usrgrp);

DELETE FROM rights WHERE NOT id IN (SELECT groupid FROM groups);

DELETE FROM sessions WHERE NOT userid IN (SELECT userid FROM users);

-- Delete orphaned screens

DELETE FROM screens_items WHERE screenid NOT IN (SELECT screenid FROM screens);

-- Delete orphaned events & triggers

DELETE FROM trigger_depends WHERE triggerid_down NOT IN (SELECT triggerid FROM triggers);

DELETE FROM trigger_depends WHERE triggerid_up NOT IN (SELECT triggerid FROM triggers);

-- Delete records in the history/trends table for items that no longer exist

DELETE FROM history WHERE NOT EXISTS (SELECT 1 FROM items WHERE history.itemid = items.itemid);

DELETE FROM history_uint WHERE NOT EXISTS (SELECT 1 FROM items WHERE history_uint.itemid = items.itemid);

DELETE FROM history_log WHERE NOT EXISTS (SELECT 1 FROM items WHERE history_log.itemid = items.itemid);

DELETE FROM history_str WHERE NOT EXISTS (SELECT 1 FROM items WHERE history_str.itemid = items.itemid);

DELETE FROM history_text WHERE NOT EXISTS (SELECT 1 FROM items WHERE history_text.itemid = items.itemid);

DELETE FROM trends WHERE NOT EXISTS (SELECT 1 FROM items WHERE trends.itemid = items.itemid);

DELETE FROM trends_uint WHERE NOT EXISTS (SELECT 1 FROM items WHERE trends_uint.itemid = items.itemid);

-- Delete records in the events table for triggers/items that no longer exist

DELETE FROM events WHERE source = 0 AND object = 0 AND objectid NOT IN (SELECT triggerid FROM triggers);

DELETE FROM events WHERE source = 3 AND object = 0 AND objectid NOT IN (SELECT triggerid FROM triggers);

DELETE FROM events WHERE source = 3 AND object = 4 AND objectid NOT IN (SELECT itemid FROM items);

-- Delete all orphaned acknowledge entries

DELETE FROM acknowledges WHERE NOT eventid IN (SELECT eventid FROM events);

DELETE FROM acknowledges WHERE NOT userid IN (SELECT userid FROM users);

DELETE FROM acknowledges WHERE eventid IN (SELECT eventid FROM events WHERE (source = 0 OR source=3) AND object = 0 AND objectid NOT IN (SELECT triggerid FROM triggers));

DELETE FROM acknowledges WHERE eventid IN (SELECT eventid FROM events WHERE source=3 AND object = 4 AND objectid NOT IN (SELECT itemid FROM items));

-bash-4.2$

Best Wishes,

Ashokkumar Mani

Database Architect\DBA

OCP | AWSCSA | M103

From: David G. Johnston <david.g.johnston@gmail.com>
Sent: Tuesday, April 28, 2020 10:27 AM
To: Ashok kumar Mani <amani@accelaero.com>
Cc: pgsql-admin@postgresql.org; pgsql-admin@lists.postgresql.org
Subject: Re: PostgreSQL DB checkpoint error!

Information Security Email Alert: This email is from an EXTERNAL source. Please use caution when clicking on links or opening attachments from an unknown or suspicious sender. To report a suspected phishing email, Send us an Email on Servicedesk@accelaero.com

There are no errors in the logs you provided, only “weak warnings”

On Monday, April 27, 2020, Ashok kumar Mani <amani@accelaero.com> wrote:

Database is getting hung every day at 5AM. I found the below message from log file , Could you please help me , should I need to increase the max_wal_size value ?

Probably not, you should probably look for insertion code that does high volume via the insert sql command without transactions and/or batching.

Why it say “incomplete startup packet” in the log.

Because you have a software client beginning but not completing a connection to the database. Usually its some form of monitoring.

David J.

Re: PostgreSQL DB checkpoint error!

From

Rui DeSousa

Date:

28 April 2020, 06:41:11

On Apr 28, 2020, at 2:13 AM, Ashok kumar Mani <amani@accelaero.com> wrote:

Database is getting hung every day at 5AM. I found the below message from log file , Could you please help me , should I need to increase the max_wal_size value ?

Most likely. I would recommend enabling checkpoint logging to get a better understanding how often checkpoint are occurring, duration, and size in relation to buffer pool. Check for 'checkpoint starting’ and 'checkpoint complete’ in the PostgreSQL log files after enabling it.

log_checkpoints = on

What is shared_buffers, max_wal_size, and wal_buffers set to?

Why it say “incomplete startup packet” in the log.

This should be benign.

RE: PostgreSQL DB checkpoint error!

From

Ashok kumar Mani

Date:

28 April 2020, 06:49:11

Classification: External

Hi Rui DeSousa

Do I need to restart the postgresql service for log_checkpoints to make it active . I would like to enable this parameter temporarily and check it?

postgres=# show log_checkpoints;

log_checkpoints

-----------------

off

(1 row)

postgres=# show shared_buffers;

shared_buffers

----------------

60GB

(1 row)

postgres=# show max_wal_size;

max_wal_size

--------------

2GB

(1 row)

postgres=# show wal_buffers;

wal_buffers

-------------

16MB

(1 row)

postgres=#

Best Wishes,

Ashokkumar Mani

Database Architect\DBA

OCP | AWSCSA | M103

From: Rui DeSousa <rui@crazybean.net>
Sent: Tuesday, April 28, 2020 10:41 AM
To: Ashok kumar Mani <amani@accelaero.com>
Cc: pgsql-admin@postgresql.org; pgsql-admin@lists.postgresql.org
Subject: Re: PostgreSQL DB checkpoint error!

On Apr 28, 2020, at 2:13 AM, Ashok kumar Mani <amani@accelaero.com> wrote:

Database is getting hung every day at 5AM. I found the below message from log file , Could you please help me , should I need to increase the max_wal_size value ?

log_checkpoints = on

What is shared_buffers, max_wal_size, and wal_buffers set to?

Why it say “incomplete startup packet” in the log.

This should be benign.

Re: PostgreSQL DB checkpoint error!

From

Rui DeSousa

Date:

28 April 2020, 06:50:23

On Apr 28, 2020, at 2:31 AM, Ashok kumar Mani <amani@accelaero.com> wrote:

I am running cronjob at the same time which will do data pruning for zabbix database(psql). Please let me know if that warning is related to reindex and vacuum ?

^C-bash-4.2$ cat /Data/zabbix_hkp/scripts/data_Pruning.sh
psql -d zabbix -f /Data/zabbix_hkp/scripts/three_months_datapurge.sql 1>/Data/zabbix_hkp/scripts_log/data_purging_`date +%m%d%y`.log 2>data_cleaning_err_`date +%m%d%y`.log
-bash-4.2$ cat /Data/zabbix_hkp/scripts/three_months_datapurge.sql
-- Delete alerts which are older than 1 day -> tested on 8th april 2020
delete FROM alerts where age(to_timestamp(alerts.clock)) > interval '90 days';
VACUUM FULL alerts ;
REINDEX TABLE ALERTS;
delete FROM acknowledges where age(to_timestamp(acknowledges.clock)) > interval '90 days';
VACUUM FULL acknowledges;
REINDEX TABLE acknowledges;

I would say so; the vacuum full and reindex is going create blocking situations that will hang Zabbix. I wouldn’t run vacuum full or reindex.

Re: PostgreSQL DB checkpoint error!

From

Rui DeSousa

Date:

28 April 2020, 07:01:47

On Apr 28, 2020, at 2:49 AM, Ashok kumar Mani <amani@accelaero.com> wrote:

Do I need to restart the postgresql service for log_checkpoints to make it active . I would like to enable this parameter temporarily and check it?

Nope, just get it in postgresql.conf and reload it. i.e. pg_ctl reload

postgres=# show max_wal_size;
max_wal_size
--------------
2GB
(1 row)

With the deletes, vacuum full, and reindex it is generating WALs at a rapid rate thus trigging another checkpoint given the 2GB size. The vacuum full and reindex are most likely causing Zabbix to be blocked.

I would rethink the vacuum full and reindex; before changing max_wal_size.

RE: PostgreSQL DB checkpoint error!

From

Ashok kumar Mani

Date:

28 April 2020, 07:06:31

Classification: External

Dear Rui DeSousa,

I would like to share the row counts of the big tables in zabbix as below.

What would you suggest if vacuum full and reindex is not possible then, is there any way to avoid blocking issue?

zabbix=# select count(*) from alerts;

count

-------

15354

(1 row)

zabbix=# select count(*) from history;

count

-----------

897550571

(1 row)

zabbix=# select count(*) from history_uint;

count

-----------

945414161

(1 row)

Best Wishes,

Ashokkumar Mani

Database Architect\DBA

OCP | AWSCSA | M103

From: Rui DeSousa <rui@crazybean.net>
Sent: Tuesday, April 28, 2020 10:50 AM
To: Ashok kumar Mani <amani@accelaero.com>
Cc: David G. Johnston <david.g.johnston@gmail.com>; pgsql-admin@postgresql.org; pgsql-admin@lists.postgresql.org
Subject: Re: PostgreSQL DB checkpoint error!

On Apr 28, 2020, at 2:31 AM, Ashok kumar Mani <amani@accelaero.com> wrote:

I am running cronjob at the same time which will do data pruning for zabbix database(psql). Please let me know if that warning is related to reindex and vacuum ?

^C-bash-4.2$ cat /Data/zabbix_hkp/scripts/data_Pruning.sh
psql -d zabbix -f /Data/zabbix_hkp/scripts/three_months_datapurge.sql 1>/Data/zabbix_hkp/scripts_log/data_purging_`date +%m%d%y`.log 2>data_cleaning_err_`date +%m%d%y`.log
-bash-4.2$ cat /Data/zabbix_hkp/scripts/three_months_datapurge.sql
-- Delete alerts which are older than 1 day -> tested on 8th april 2020
delete FROM alerts where age(to_timestamp(alerts.clock)) > interval '90 days';
VACUUM FULL alerts ;
REINDEX TABLE ALERTS;
delete FROM acknowledges where age(to_timestamp(acknowledges.clock)) > interval '90 days';
VACUUM FULL acknowledges;
REINDEX TABLE acknowledges;

I would say so; the vacuum full and reindex is going create blocking situations that will hang Zabbix. I wouldn’t run vacuum full or reindex.

Re: PostgreSQL DB checkpoint error!

From

Rui DeSousa

Date:

28 April 2020, 07:32:19

On Apr 28, 2020, at 3:06 AM, Ashok kumar Mani <amani@accelaero.com> wrote:

Classification: External

Dear Rui DeSousa,

I would like to share the row counts of the big tables in zabbix as below.
What would you suggest if vacuum full and reindex is not possible then, is there any way to avoid blocking issue?

I would not use ‘vacuum full’ for regular maintenance; only to address a specific issue. Use just vacuum and auto-vacuum.

reindex — that I do concurrently. If you are running PostgreSQL 12 ; great news! it adds ‘reindex currently’. If you are running an early version; then you have to build a new index currently and drop the old.

I only reindex tables that need to be reindexed; not just for the sake of it. I use the following function that I wrote to create a script that will reindex the table. The script creates a new index concurrently and then drops the old. It also does the same for the primary key by dropping the foreign key constraints as well and rebuilds them. It does this to minimize blocking contention for applications that normally run 24/7.

To use just execute the function with the table that you would like reindexed and the result we be a script that you can feed to psql to actually do the reindex.

i.e. select reindex_script(‘public’, ‘mytable’);

I would test it with your tables and application. I haven’t use this with a partitioned table and I don’t it would work for those tables.

p.s. looking forward to retiring this script when upgrading to version 12.

/*===========================================================================

Function: reindex_script()

Description: Returns a script to rebuild indexes concurrently

============================================================================*/

create or replace function reindex_script(_schema name, _table name, _tablespace name default null, _rename boolean default true)

returns table (sql text)

as $body$

declare

_obj_name text;

_old_iname text;

_new_iname text;

_is_pkey boolean;

_is_ukey boolean;

_sql text;

_cnt int;

_idx_oid oid;

_spath text;

begin

create temp table rebuild_indexes (pos serial not null primary key, sql text not null)

on commit drop;

-- sanitize schema input

select current_setting('search_path')

, nspname

into _spath, _schema

from pg_namespace

where nspname = _schema

;

if _schema is null then

raise exception 'Unable to identify schema';

end if

;

perform set_config('search_path', _schema, true);

insert into rebuild_indexes(sql)

values ('\set ON_ERROR_STOP on')

, (concat('set search_path=', quote_ident(_schema), ';'))

, ('\timing on')

, ('')

, (concat('\echo Vacuuming table ', quote_ident(_table), '.'))

, (concat('vacuum verbose ', quote_ident(_table), ';'))

;

for _idx_oid, _obj_name, _old_iname, _new_iname, _is_pkey, _is_ukey, _sql in

select i.indexrelid

, quote_ident(ns.nspname)

|| '.' || quote_ident(c.relname) as objname

, quote_ident((i.indexrelid::regclass)::text) as old_iname

, quote_ident(

c.relname

|| case

when substring((i.indexrelid::regclass)::text from '(_idx|_pkey|_ukey)\d{0,3}$') in ('_idx', '_pkey', '_ukey')

then

case

when i.indisprimary then '_pk'

when con.conindid is not null then '_uk'

else '_ix'

end

else

case

when i.indisprimary then '_pkey'

when con.conindid is not null then '_ukey'

else '_idx'

end

|| case i.indisprimary

when true then ''

else (row_number() over (partition by con.conindid is not null order by substring((i.indexrelid::regclass)::text from '(\d+)$')))::text

end

) as new_iname

, i.indisprimary as is_pkey

, con.conindid is not null as is_ukey

, regexp_replace(

lower(pg_get_indexdef(i.indexrelid))

, '( where |$)'

, coalesce(' tablespace ' || _tablespace, ' tablespace ' || its.spcname, '')

|| case

when i.indpred is null then ''

else ' where '

end

) as csql

from pg_namespace ns

join pg_class c on c.relnamespace = ns.oid

and c.relkind = 'r'

and c.relname = _table

join pg_index i on i.indrelid = c.oid

join pg_class ic on ic.oid = i.indexrelid

left join pg_tablespace its on its.oid = ic.reltablespace

left join pg_constraint con on con.conindid = i.indexrelid

and con.conrelid = c.oid

and con.contype = 'u'

where ns.nspname = _schema

order by i.indisprimary

, substring((i.indexrelid::regclass)::text from '(\d+)$')

loop

insert into rebuild_indexes(sql)

values ('')

, ('\echo Creating ' || _new_iname || ' on ' || _obj_name || '.' )

, (regexp_replace(_sql, 'index (.+) on (.+) using', 'index concurrently ' || _new_iname || ' on ' || _obj_name || ' using') || ';')

;

if _is_pkey then

insert into rebuild_indexes(sql)

values ('')

, ('\echo Swapping primary key ' || _old_iname || ' with '|| _new_iname || ' on ' || _obj_name || '.' )

, ('begin;')

, ('--Drop foreign keys to allow primary key swap')

;

insert into rebuild_indexes(sql)

select concat('alter table ', quote_ident((fk.conrelid::regclass)::text), ' drop constraint ', quote_ident(fk.conname), ';')

from pg_constraint fk

where fk.conindid = _idx_oid

and fk.contype = 'f'

;

insert into rebuild_indexes(sql)

values ('')

, ('alter table ' || _obj_name || ' drop constraint ' || _old_iname || ';')

, ('alter table ' || _obj_name || ' add primary key using index ' || _new_iname || ';')

, ('')

, ('--Rresintate foreign keys with "not valid" option; contraint is enforced just not validated')

;

insert into rebuild_indexes(sql)

select concat('alter table ', quote_ident((fk.conrelid::regclass)::text), ' add constraint ', quote_ident(fk.conname), ' ', lower(pg_get_constraintdef(fk.oid)), ' not valid;')

from pg_constraint fk

where fk.conindid = _idx_oid

and fk.contype = 'f'

;

insert into rebuild_indexes(sql)

values ('commit;')

, ('')

, ('\echo Revalidating foreign key constraints with existing records')

;

insert into rebuild_indexes(sql)

select concat('alter table ', quote_ident((fk.conrelid::regclass)::text), ' validate constraint ', quote_ident(fk.conname), ';')

from pg_constraint fk

where fk.conindid = _idx_oid

and fk.contype = 'f'

;

elsif _is_ukey then

insert into rebuild_indexes(sql)

values ('')

, ('\echo Swapping constraint ' || _old_iname || ' with '|| _new_iname || ' on ' || _obj_name || '.' )

, ('begin;')

, ('alter table ' || _obj_name || ' drop constraint ' || _old_iname || ';')

, ('alter table ' || _obj_name || ' add unique using index ' || _new_iname || ';')

, ('commit;')

;

else

insert into rebuild_indexes(sql)

values ('')

, ('\echo Dropping index ' || _old_iname || '.' )

, ('drop index concurrently ' || _old_iname || ';')

;

end if;

if _rename then

insert into rebuild_indexes(sql)

values ('')

, ('\echo Renaming index ' || _new_iname || ' to ' || _old_iname || '.' )

, ('alter index ' || _new_iname || ' rename to ' || _old_iname || ';')

;

end if

;

end loop

;

-- Reset search path incase of extended tranacation

perform set_config('search_path', _spath, true);

return query

select x.sql

from rebuild_indexes x

order by x.pos

;

end;

$body$ language plpgsql

;