From b2cfa13627bd844b28fe050ffb35e8b9696fdf2e Mon Sep 17 00:00:00 2001 From: Euler Taveira Date: Mon, 25 Mar 2024 22:01:52 -0300 Subject: [PATCH v1 1/2] Improve the code that checks if the recovery is finishing The recovery process has a window between the walreceiver shutdown and the moment the pg_is_in_recovery function returns false. It means that the pg_stat_wal_receiver checks can cause the tool to fail prematurely (even if the server has already reached the recovery target). Since the pg_stat_wal_receiver check exists to verify that the primary is available, PQping the primary server when the check does not return a row. If the primary is up and running, the missing row can indicate that the target server is finishing the recovery process; hence, we shouldn't count it as an attempt. It avoids premature failures on slow hosts. While at it, increase the number of attempts (10 to 60). The wait time is the same as the one the pg_promote function uses by default. --- src/bin/pg_basebackup/pg_createsubscriber.c | 30 +++++++++++++-------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/src/bin/pg_basebackup/pg_createsubscriber.c b/src/bin/pg_basebackup/pg_createsubscriber.c index b8f8269340..cca93d8c25 100644 --- a/src/bin/pg_basebackup/pg_createsubscriber.c +++ b/src/bin/pg_basebackup/pg_createsubscriber.c @@ -30,6 +30,8 @@ #define DEFAULT_SUB_PORT "50432" +#define NUM_ATTEMPTS 60 + /* Command-line options */ struct CreateSubscriberOptions { @@ -93,7 +95,7 @@ static void pg_ctl_status(const char *pg_ctl_cmd, int rc); static void start_standby_server(const struct CreateSubscriberOptions *opt, bool restricted_access); static void stop_standby_server(const char *datadir); -static void wait_for_end_recovery(const char *conninfo, +static void wait_for_end_recovery(const struct LogicalRepInfo *dbinfo, const struct CreateSubscriberOptions *opt); static void create_publication(PGconn *conn, struct LogicalRepInfo *dbinfo); static void drop_publication(PGconn *conn, struct LogicalRepInfo *dbinfo); @@ -1354,18 +1356,16 @@ 
stop_standby_server(const char *datadir) * the recovery process. By default, it waits forever. */ static void -wait_for_end_recovery(const char *conninfo, const struct CreateSubscriberOptions *opt) +wait_for_end_recovery(const struct LogicalRepInfo *dbinfo, const struct CreateSubscriberOptions *opt) { PGconn *conn; int status = POSTMASTER_STILL_STARTING; int timer = 0; int count = 0; /* number of consecutive connection attempts */ -#define NUM_CONN_ATTEMPTS 10 - pg_log_info("waiting for the target server to reach the consistent state"); - conn = connect_database(conninfo, true); + conn = connect_database(dbinfo->subconninfo, true); for (;;) { @@ -1384,16 +1384,24 @@ wait_for_end_recovery(const char *conninfo, const struct CreateSubscriberOptions } /* - * If it is still in recovery, make sure the target server is - * connected to the primary so it can receive the required WAL to - * finish the recovery process. If it is disconnected try - * NUM_CONN_ATTEMPTS in a row and bail out if not succeed. + * If it is still in recovery, make sure the target server is connected + * to the primary so it can receive the required WAL to finish the + * recovery process. If the walreceiver process is not running it + * should indicate that (i) the recovery is almost finished or (ii) the + * primary is not running or is not accepting connections. It should + * count as attempts iff (ii) is true. In this case, try NUM_ATTEMPTS + * in a row and bail out if it does not succeed. 
*/ res = PQexec(conn, "SELECT 1 FROM pg_catalog.pg_stat_wal_receiver"); if (PQntuples(res) == 0) { - if (++count > NUM_CONN_ATTEMPTS) + if (PQping(dbinfo->pubconninfo) != PQPING_OK) + count++; + else + count = 0; /* reset counter if it connects again */ + + if (count > NUM_ATTEMPTS) { stop_standby_server(subscriber_dir); pg_log_error("standby server disconnected from the primary"); @@ -2113,7 +2121,7 @@ main(int argc, char **argv) start_standby_server(&opt, true); /* Waiting the subscriber to be promoted */ - wait_for_end_recovery(dbinfo[0].subconninfo, &opt); + wait_for_end_recovery(&dbinfo[0], &opt); /* * Create the subscription for each database on subscriber. It does not -- 2.30.2