Re: libpq copy error handling busted - Mailing list pgsql-hackers

From Tom Lane
Subject Re: libpq copy error handling busted
Date
Msg-id 1250623.1591288146@sss.pgh.pa.us
Whole thread Raw
In response to Re: libpq copy error handling busted  (Tom Lane <tgl@sss.pgh.pa.us>)
List pgsql-hackers
I wrote:
> * As for control-C not getting out of it: there is
>         if (CancelRequested)
>             break;
> in pgbench's loop, but this does nothing in this scenario because
> fe-utils/cancel.c only sets that flag when it successfully sends a
> Cancel ... which it certainly cannot if the postmaster is gone.
> I suspect that this may be relatively recent breakage.  It doesn't look
> too well thought out, in any case.  The places that are testing this
> flag look like they'd rather not be bothered with the fine point of
> whether the cancel request actually went anywhere.

On closer inspection, it seems that scripts_parallel.c does have a
dependency on the cancel request having been sent, because it insists
on collecting a query result off the active connection after detecting
CancelRequested.  This seems dangerously overoptimistic to me; it will
lock up if for any reason the server doesn't honor the cancel request.
It's also pointless, because all the calling apps are just going to close
their connections and exit(1) afterwards, so there's no use in trying to
resynchronize the connection state.  (Plus, disconnectDatabase will
issue cancels on any busy connections; which would be necessary anyway
in a parallel operation, since cancel.c could only have signaled one of
them.)  So the attached patch just removes the useless consumeQueryResult
call, and then simplifies select_loop's API a bit.

With that change, I don't see any place that wants the existing definition
of CancelRequested rather than the simpler meaning of "SIGINT was
received", so I just changed it to mean that.  We could certainly also
have a variable tracking whether a cancel request was sent, but I see
no point in one right now.

It's easiest to test this *without* the other patch -- just run the
pgbench scenario Andres demonstrated, and see whether control-C gets
pgbench to quit cleanly.

            regards, tom lane

diff --git a/src/bin/scripts/scripts_parallel.c b/src/bin/scripts/scripts_parallel.c
index 45c69b8d19..c3384d452a 100644
--- a/src/bin/scripts/scripts_parallel.c
+++ b/src/bin/scripts/scripts_parallel.c
@@ -28,7 +28,7 @@
 #include "scripts_parallel.h"

 static void init_slot(ParallelSlot *slot, PGconn *conn);
-static int    select_loop(int maxFd, fd_set *workerset, bool *aborting);
+static int    select_loop(int maxFd, fd_set *workerset);

 static void
 init_slot(ParallelSlot *slot, PGconn *conn)
@@ -41,23 +41,16 @@ init_slot(ParallelSlot *slot, PGconn *conn)
 /*
  * Loop on select() until a descriptor from the given set becomes readable.
  *
- * If we get a cancel request while we're waiting, we forego all further
- * processing and set the *aborting flag to true.  The return value must be
- * ignored in this case.  Otherwise, *aborting is set to false.
+ * Returns -1 on failure (including getting a cancel request).
  */
 static int
-select_loop(int maxFd, fd_set *workerset, bool *aborting)
+select_loop(int maxFd, fd_set *workerset)
 {
     int            i;
     fd_set        saveSet = *workerset;

     if (CancelRequested)
-    {
-        *aborting = true;
         return -1;
-    }
-    else
-        *aborting = false;

     for (;;)
     {
@@ -90,7 +83,7 @@ select_loop(int maxFd, fd_set *workerset, bool *aborting)
         if (i < 0 && errno == EINTR)
             continue;            /* ignore this */
         if (i < 0 || CancelRequested)
-            *aborting = true;    /* but not this */
+            return -1;            /* but not this */
         if (i == 0)
             continue;            /* timeout (Win32 only) */
         break;
@@ -135,7 +128,6 @@ ParallelSlotsGetIdle(ParallelSlot *slots, int numslots)
     {
         fd_set        slotset;
         int            maxFd = 0;
-        bool        aborting;

         /* We must reconstruct the fd_set for each call to select_loop */
         FD_ZERO(&slotset);
@@ -157,19 +149,12 @@ ParallelSlotsGetIdle(ParallelSlot *slots, int numslots)
         }

         SetCancelConn(slots->connection);
-        i = select_loop(maxFd, &slotset, &aborting);
+        i = select_loop(maxFd, &slotset);
         ResetCancelConn();

-        if (aborting)
-        {
-            /*
-             * We set the cancel-receiving connection to the one in the zeroth
-             * slot above, so fetch the error from there.
-             */
-            consumeQueryResult(slots->connection);
+        /* failure? */
+        if (i < 0)
             return NULL;
-        }
-        Assert(i != 0);

         for (i = 0; i < numslots; i++)
         {
diff --git a/src/fe_utils/cancel.c b/src/fe_utils/cancel.c
index eb4056a9a6..51fb67d384 100644
--- a/src/fe_utils/cancel.c
+++ b/src/fe_utils/cancel.c
@@ -43,11 +43,11 @@
 static PGcancel *volatile cancelConn = NULL;

 /*
- * CancelRequested tracks if a cancellation request has completed after
- * a signal interruption.  Note that if cancelConn is not set, in short
- * if SetCancelConn() was never called or if ResetCancelConn() freed
- * the cancellation object, then CancelRequested is switched to true after
- * all cancellation attempts.
+ * CancelRequested is set when we receive SIGINT (or local equivalent).
+ * There is no provision in this module for resetting it; but applications
+ * might choose to clear it after successfully recovering from a cancel.
+ * Note that there is no guarantee that we successfully sent a Cancel request,
+ * or that the request will have any effect if we did send it.
  */
 volatile sig_atomic_t CancelRequested = false;

@@ -148,6 +148,8 @@ handle_sigint(SIGNAL_ARGS)
     int            save_errno = errno;
     char        errbuf[256];

+    CancelRequested = true;
+
     if (cancel_callback != NULL)
         cancel_callback();

@@ -156,7 +158,6 @@ handle_sigint(SIGNAL_ARGS)
     {
         if (PQcancel(cancelConn, errbuf, sizeof(errbuf)))
         {
-            CancelRequested = true;
             write_stderr(_("Cancel request sent\n"));
         }
         else
@@ -165,8 +166,6 @@ handle_sigint(SIGNAL_ARGS)
             write_stderr(errbuf);
         }
     }
-    else
-        CancelRequested = true;

     errno = save_errno;            /* just in case the write changed it */
 }
@@ -193,6 +192,8 @@ consoleHandler(DWORD dwCtrlType)
     if (dwCtrlType == CTRL_C_EVENT ||
         dwCtrlType == CTRL_BREAK_EVENT)
     {
+        CancelRequested = true;
+
         if (cancel_callback != NULL)
             cancel_callback();

@@ -203,7 +204,6 @@ consoleHandler(DWORD dwCtrlType)
             if (PQcancel(cancelConn, errbuf, sizeof(errbuf)))
             {
                 write_stderr(_("Cancel request sent\n"));
-                CancelRequested = true;
             }
             else
             {
@@ -211,8 +211,6 @@ consoleHandler(DWORD dwCtrlType)
                 write_stderr(errbuf);
             }
         }
-        else
-            CancelRequested = true;

         LeaveCriticalSection(&cancelConnLock);


pgsql-hackers by date:

Previous
From: Avinash Kumar
Date:
Subject: Re: Just for fun: Postgres 20?
Next
From: Peter Eisentraut
Date:
Subject: Re: Expand the use of check_canonical_path() for more GUCs