From f7fd8640ebac242f21719574b0e92dc7cfba4041 Mon Sep 17 00:00:00 2001 From: Thomas Munro Date: Mon, 1 Mar 2021 18:08:23 +1300 Subject: [PATCH v7] Detect dropped connections while running queries. Provide a new optional GUC that can be used to check whether the client connection has gone away periodically while running very long queries. Author: Sergey Cherkashin Author: Thomas Munro Reviewed-by: Thomas Munro Reviewed-by: Tatsuo Ishii Reviewed-by: Konstantin Knizhnik Reviewed-by: Zhihong Yu Reviewed-by: Tom Lane (much earlier version) Discussion: https://postgr.es/m/77def86b27e41f0efcba411460e929ae%40postgrespro.ru --- doc/src/sgml/config.sgml | 36 +++++++++++ src/backend/libpq/pqcomm.c | 60 +++++++++++++++++-- src/backend/tcop/postgres.c | 34 +++++++++++ src/backend/utils/init/globals.c | 1 + src/backend/utils/init/postinit.c | 11 ++++ src/backend/utils/misc/guc.c | 10 ++++ src/backend/utils/misc/postgresql.conf.sample | 3 + src/include/libpq/libpq.h | 2 + src/include/miscadmin.h | 1 + src/include/utils/timeout.h | 1 + 10 files changed, 155 insertions(+), 4 deletions(-) diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 5679b40dd5..e522be460c 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -998,6 +998,42 @@ include_dir 'conf.d' + + client_connection_check_interval (integer) + + client_connection_check_interval configuration parameter + + + + + Sets the time interval between checks that the client is still + connected, while running queries. The check is performed by testing + whether a part of the next message can be read from the client. + If the kernel reports that the connection + has been closed or lost, a long running query can abort immediately, + rather than discovering the problem when it eventually tries to send + the response. + + + If this value is specified without units, it is taken as milliseconds. + The default value is 0, which disables connection + checks. Without connection checks, the server will detect the loss of + the connection only when it is waiting for a new request, receiving a + request or sending a response. + + + For the kernel itself to detect lost TCP connections reliably and + within a known timeframe in all scenarios including network failure, it + may also be necessary to adjust the default TCP keepalive settings of + the operating system, or the + , + and + settings of + PostgreSQL. + + + + diff --git a/src/backend/libpq/pqcomm.c b/src/backend/libpq/pqcomm.c index 4c7b1e7bfd..a12ed3f851 100644 --- a/src/backend/libpq/pqcomm.c +++ b/src/backend/libpq/pqcomm.c @@ -79,6 +79,7 @@ #include "storage/ipc.h" #include "utils/guc.h" #include "utils/memutils.h" +#include "utils/timeout.h" /* * Cope with the various platform-specific ways to spell TCP keepalive socket @@ -104,6 +105,7 @@ */ int Unix_socket_permissions; char *Unix_socket_group; +int client_connection_check_interval; /* Where the Unix socket files are (list of palloc'd strings) */ static List *sock_paths = NIL; @@ -919,13 +921,13 @@ socket_set_nonblocking(bool nonblocking) } /* -------------------------------- - * pq_recvbuf - load some bytes into the input buffer + * pq_recvbuf_ext - load some bytes into the input buffer * * returns 0 if OK, EOF if trouble * -------------------------------- */ static int -pq_recvbuf(void) +pq_recvbuf_ext(bool nonblocking) { if (PqRecvPointer > 0) { @@ -941,8 +943,7 @@ pq_recvbuf(void) PqRecvLength = PqRecvPointer = 0; } - /* Ensure that we're in blocking mode */ - socket_set_nonblocking(false); + socket_set_nonblocking(nonblocking); /* Can fill buffer from PqRecvLength and upwards */ for (;;) @@ -954,6 +955,9 @@ pq_recvbuf(void) if (r < 0) { + if (nonblocking && (errno == EAGAIN || errno == EWOULDBLOCK)) + return 0; + if (errno == EINTR) continue; /* Ok if interrupted */ @@ -981,6 +985,13 @@ pq_recvbuf(void) } } +static int +pq_recvbuf(void) +{ + return pq_recvbuf_ext(false); +} + + /* -------------------------------- * pq_getbyte - get a single byte from connection, or return EOF * -------------------------------- @@ -1921,3 +1932,44 @@ pq_settcpusertimeout(int timeout, Port *port) return STATUS_OK; } + +/* + * Peek at the first byte of the next message from the client, without + * consuming it. + * + * Return 0 if there isn't at least one byte of a new message in our receive + * buffer or the socket yet, or if we're in the middle of reading a message + * already so we can't see the next message yet. + * + * Return EOF if the connection is closed. + * + * Return 1 if there is at least one byte of data available from the start of + * the next messag, and write that byte into *c. + */ +int +pq_peekmessage(unsigned char *c) +{ + /* We're already in the middle of a message. */ + if (PqCommReadingMsg) + return 0; + + /* Do we have a byte already in our receive buffer? */ + if (PqRecvPointer < PqRecvLength) + { + *c = PqRecvBuffer[PqRecvPointer]; + return 1; + } + + /* Try to read at least one byte from secure_read() without blocking. */ + if (pq_recvbuf_ext(true)) + return EOF; + + /* Now do we have a byte in our receive buffer? */ + if (PqRecvPointer < PqRecvLength) + { + *c = PqRecvBuffer[PqRecvPointer]; + return 1; + } + + return 0; +} diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 2b1b68109f..aab90a3a78 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -2671,6 +2671,14 @@ start_xact_command(void) * not desired, the timeout has to be disabled explicitly. */ enable_statement_timeout(); + + /* Start timeout for checking if the client has gone away if necessary. */ + if (client_connection_check_interval > 0 && + IsUnderPostmaster && + MyProcPort && + !get_timeout_active(CLIENT_CONNECTION_CHECK_TIMEOUT)) + enable_timeout_after(CLIENT_CONNECTION_CHECK_TIMEOUT, + client_connection_check_interval); } static void @@ -3149,6 +3157,32 @@ ProcessInterrupts(void) (errcode(ERRCODE_ADMIN_SHUTDOWN), errmsg("terminating connection due to administrator command"))); } + if (CheckClientConnectionPending) + { + CheckClientConnectionPending = false; + + /* + * Check for lost connection and re-arm, if still configured, but not + * if we've arrived back at DoingCommandRead state. We don't want to + * wake up idle sessions, and they already know how to detect lost + * connections. + */ + if (!DoingCommandRead && client_connection_check_interval > 0) + { + unsigned char next_message; + int r; + + /* + * Does the kernel think we have been disconnected, or is there a + * pipelined terminate message from the client? + */ + r = pq_peekmessage(&next_message); + if (r == EOF || (r == 1 && next_message == 'X')) + ClientConnectionLost = true; + enable_timeout_after(CLIENT_CONNECTION_CHECK_TIMEOUT, + client_connection_check_interval); + } + } if (ClientConnectionLost) { QueryCancelPending = false; /* lost connection trumps QueryCancel */ diff --git a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c index 73e0a672ae..a9f0fc3017 100644 --- a/src/backend/utils/init/globals.c +++ b/src/backend/utils/init/globals.c @@ -30,6 +30,7 @@ ProtocolVersion FrontendProtocol; volatile sig_atomic_t InterruptPending = false; volatile sig_atomic_t QueryCancelPending = false; volatile sig_atomic_t ProcDiePending = false; +volatile sig_atomic_t CheckClientConnectionPending = false; volatile sig_atomic_t ClientConnectionLost = false; volatile sig_atomic_t IdleInTransactionSessionTimeoutPending = false; volatile sig_atomic_t IdleSessionTimeoutPending = false; diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index 7abeccb536..0bac23e75d 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -34,6 +34,7 @@ #include "catalog/pg_db_role_setting.h" #include "catalog/pg_tablespace.h" #include "libpq/auth.h" +#include "libpq/libpq.h" #include "libpq/libpq-be.h" #include "mb/pg_wchar.h" #include "miscadmin.h" @@ -73,6 +74,7 @@ static void StatementTimeoutHandler(void); static void LockTimeoutHandler(void); static void IdleInTransactionSessionTimeoutHandler(void); static void IdleSessionTimeoutHandler(void); +static void ClientCheckTimeoutHandler(void); static bool ThereIsAtLeastOneRole(void); static void process_startup_options(Port *port, bool am_superuser); static void process_settings(Oid databaseid, Oid roleid); @@ -620,6 +622,7 @@ InitPostgres(const char *in_dbname, Oid dboid, const char *username, RegisterTimeout(IDLE_IN_TRANSACTION_SESSION_TIMEOUT, IdleInTransactionSessionTimeoutHandler); RegisterTimeout(IDLE_SESSION_TIMEOUT, IdleSessionTimeoutHandler); + RegisterTimeout(CLIENT_CONNECTION_CHECK_TIMEOUT, ClientCheckTimeoutHandler); } /* @@ -1242,6 +1245,14 @@ IdleSessionTimeoutHandler(void) SetLatch(MyLatch); } +static void +ClientCheckTimeoutHandler(void) +{ + CheckClientConnectionPending = true; + InterruptPending = true; + SetLatch(MyLatch); +} + /* * Returns true if at least one role is defined in this database cluster. */ diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 3b36a31a47..391d9983e0 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -3483,6 +3483,16 @@ static struct config_int ConfigureNamesInt[] = NULL, NULL, NULL }, + { + {"client_connection_check_interval", PGC_USERSET, CLIENT_CONN_OTHER, + gettext_noop("Sets the time interval for checking connection with the client."), + gettext_noop("A value of 0 disables this feature."), + GUC_UNIT_MS + }, + &client_connection_check_interval, + 0, 0, INT_MAX, + NULL, NULL, NULL + }, /* End-of-list marker */ { {NULL, 0, 0, NULL, NULL}, NULL, 0, 0, 0, NULL, NULL, NULL diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 86425965d0..dd850bf272 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -718,6 +718,9 @@ #dynamic_library_path = '$libdir' +#client_connection_check_interval = 0 # set time interval between + # checks for client disconnection while + # running long queries; 0 for never #------------------------------------------------------------------------------ # LOCK MANAGEMENT diff --git a/src/include/libpq/libpq.h b/src/include/libpq/libpq.h index b20deeb555..a2255cc0da 100644 --- a/src/include/libpq/libpq.h +++ b/src/include/libpq/libpq.h @@ -71,6 +71,7 @@ extern int pq_getbyte(void); extern int pq_peekbyte(void); extern int pq_getbyte_if_available(unsigned char *c); extern int pq_putmessage_v2(char msgtype, const char *s, size_t len); +extern int pq_peekmessage(unsigned char *c); /* * prototypes for functions in be-secure.c @@ -109,6 +110,7 @@ extern ssize_t secure_open_gssapi(Port *port); extern char *SSLCipherSuites; extern char *SSLECDHCurve; extern bool SSLPreferServerCiphers; +extern int client_connection_check_interval; extern int ssl_min_protocol_version; extern int ssl_max_protocol_version; diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index 013850ac28..6f8251e0b0 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -85,6 +85,7 @@ extern PGDLLIMPORT volatile sig_atomic_t IdleInTransactionSessionTimeoutPending; extern PGDLLIMPORT volatile sig_atomic_t IdleSessionTimeoutPending; extern PGDLLIMPORT volatile sig_atomic_t ProcSignalBarrierPending; +extern PGDLLIMPORT volatile sig_atomic_t CheckClientConnectionPending; extern PGDLLIMPORT volatile sig_atomic_t ClientConnectionLost; /* these are marked volatile because they are examined by signal handlers: */ diff --git a/src/include/utils/timeout.h b/src/include/utils/timeout.h index ecb2a366a5..93e6a691b3 100644 --- a/src/include/utils/timeout.h +++ b/src/include/utils/timeout.h @@ -32,6 +32,7 @@ typedef enum TimeoutId STANDBY_LOCK_TIMEOUT, IDLE_IN_TRANSACTION_SESSION_TIMEOUT, IDLE_SESSION_TIMEOUT, + CLIENT_CONNECTION_CHECK_TIMEOUT, /* First user-definable timeout reason */ USER_TIMEOUT, /* Maximum number of timeout reasons */ -- 2.30.1