From 74e6c1bf39c2fa8805dee59f8116ee640d53e31c Mon Sep 17 00:00:00 2001 From: Hubert Zhang Date: Wed, 28 Oct 2020 09:41:35 +0000 Subject: [PATCH] Enhance libpq to support multiple hosts for non-hot standby. Libpq connection strings support multiple hosts, but if the first host is a non-hot standby, libpq will not fail over to the remaining hosts in the connection string and instead reports the error 'database system is starting up'. The root cause is that a non-hot standby raises a fatal error after accepting a connection. Client libpq just forwards the fatal error without trying to connect to the other hosts. There are some other cases where libpq will not fail over. For example, the primary/standby is down or encounters a network problem during the libpq handshake. Co-authored-by: Hao Wu Co-authored-by: Hubert Zhang --- src/interfaces/libpq/fe-connect.c | 57 +++++++++++++++++++++++++++++++++++---- 1 file changed, 52 insertions(+), 5 deletions(-) diff --git a/src/interfaces/libpq/fe-connect.c b/src/interfaces/libpq/fe-connect.c index b0ca37c2ed..7f6e7476ff 100644 --- a/src/interfaces/libpq/fe-connect.c +++ b/src/interfaces/libpq/fe-connect.c @@ -2288,8 +2288,21 @@ PQconnectPoll(PGconn *conn) /* Load waiting data */ int n = pqReadData(conn); + /* + * Should try another address/host when pqReadData() failed + * instead of return error to client directly. + * + * For example a primary or a standby is down or encounter n/w + * problem after the libpq establish the connection and before + * reading any data. libpq should be able to failover to another + * host. + */ if (n < 0) - goto error_return; + { + conn->try_next_addr = true; + goto keep_going; + } + if (n == 0) return PGRES_POLLING_READING; @@ -3003,11 +3016,22 @@ keep_going: /* We will come back to here until there is int rdresult; rdresult = pqReadData(conn); + + /* + * Should try another address/host when pqReadData() failed + * instead of return error to client directly. 
+ * + * For example a primary or a standby is down or encounter n/w + * problem after the libpq establish the connection and before + * reading any data. libpq should be able to failover to another + * host. + */ if (rdresult < 0) { - /* errorMessage is already filled in */ - goto error_return; + conn->try_next_addr = true; + goto keep_going; } + if (rdresult == 0) { /* caller failed to wait for data */ @@ -3119,9 +3143,20 @@ keep_going: /* We will come back to here until there is char gss_ok; int rdresult = pqReadData(conn); + /* + * Should try another address/host when pqReadData() failed + * instead of return error to client directly. + * + * For example a primary or a standby is down or encounter n/w + * problem after the libpq establish the connection and before + * reading any data. libpq should be able to failover to another + * host. + */ if (rdresult < 0) - /* pqReadData fills in error message */ - goto error_return; + { + conn->try_next_addr = true; + goto keep_going; + } else if (rdresult == 0) /* caller failed to wait for data */ return PGRES_POLLING_READING; @@ -3395,6 +3430,18 @@ keep_going: /* We will come back to here until there is } #endif + /* + * When try to connect to a non hot standby, ERRCODE_CANNOT_CONNECT_NOW + * will be reported with error message: 'database system is starting up'. + * We should try the next host in connection string instead of return + * error to client directly. + */ + if (strcmp(conn->last_sqlstate, ERRCODE_CANNOT_CONNECT_NOW) == 0) + { + conn->try_next_host = true; + goto keep_going; + } + goto error_return; } -- 2.16.6