Re: pgsql: Add kqueue(2) support to the WaitEventSet API. - Mailing list pgsql-hackers

From Tom Lane
Subject Re: pgsql: Add kqueue(2) support to the WaitEventSet API.
Date
Msg-id 27649.1582500247@sss.pgh.pa.us
Whole thread Raw
In response to Re: pgsql: Add kqueue(2) support to the WaitEventSet API.  (Tom Lane <tgl@sss.pgh.pa.us>)
Responses Re: pgsql: Add kqueue(2) support to the WaitEventSet API.
List pgsql-hackers
I wrote:
> Here's a draft patch that does it like that.

On reflection, trying to make ReserveExternalFD serve two different
use-cases was pretty messy.  Here's a version that splits it into two
functions.  I also took the trouble to fix dblink.

            regards, tom lane

diff --git a/contrib/dblink/dblink.c b/contrib/dblink/dblink.c
index c1155e3..a5f69fc 100644
--- a/contrib/dblink/dblink.c
+++ b/contrib/dblink/dblink.c
@@ -200,12 +200,27 @@ dblink_get_conn(char *conname_or_str,
         if (connstr == NULL)
             connstr = conname_or_str;
         dblink_connstr_check(connstr);
+
+        /*
+         * We must obey fd.c's limit on non-virtual file descriptors.  Assume
+         * that a PGconn represents one long-lived FD.  (Doing this here also
+         * ensures that VFDs are closed if needed to make room.)
+         */
+        if (!AcquireExternalFD())
+            ereport(ERROR,
+                    (errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
+                     errmsg("could not establish connection"),
+                     errdetail("There are too many open files.")));
+
+        /* OK to make connection */
         conn = PQconnectdb(connstr);
+
         if (PQstatus(conn) == CONNECTION_BAD)
         {
             char       *msg = pchomp(PQerrorMessage(conn));

             PQfinish(conn);
+            ReleaseExternalFD();
             ereport(ERROR,
                     (errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
                      errmsg("could not establish connection"),
@@ -282,12 +297,26 @@ dblink_connect(PG_FUNCTION_ARGS)

     /* check password in connection string if not superuser */
     dblink_connstr_check(connstr);
+
+    /*
+     * We must obey fd.c's limit on non-virtual file descriptors.  Assume that
+     * a PGconn represents one long-lived FD.  (Doing this here also ensures
+     * that VFDs are closed if needed to make room.)
+     */
+    if (!AcquireExternalFD())
+        ereport(ERROR,
+                (errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
+                 errmsg("could not establish connection"),
+                 errdetail("There are too many open files.")));
+
+    /* OK to make connection */
     conn = PQconnectdb(connstr);

     if (PQstatus(conn) == CONNECTION_BAD)
     {
         msg = pchomp(PQerrorMessage(conn));
         PQfinish(conn);
+        ReleaseExternalFD();
         if (rconn)
             pfree(rconn);

@@ -312,7 +341,10 @@ dblink_connect(PG_FUNCTION_ARGS)
     else
     {
         if (pconn->conn)
+        {
             PQfinish(pconn->conn);
+            ReleaseExternalFD();
+        }
         pconn->conn = conn;
     }

@@ -346,6 +378,7 @@ dblink_disconnect(PG_FUNCTION_ARGS)
         dblink_conn_not_avail(conname);

     PQfinish(conn);
+    ReleaseExternalFD();
     if (rconn)
     {
         deleteConnection(conname);
@@ -780,7 +813,10 @@ dblink_record_internal(FunctionCallInfo fcinfo, bool is_async)
     {
         /* if needed, close the connection to the database */
         if (freeconn)
+        {
             PQfinish(conn);
+            ReleaseExternalFD();
+        }
     }
     PG_END_TRY();

@@ -1458,7 +1494,10 @@ dblink_exec(PG_FUNCTION_ARGS)
     {
         /* if needed, close the connection to the database */
         if (freeconn)
+        {
             PQfinish(conn);
+            ReleaseExternalFD();
+        }
     }
     PG_END_TRY();

@@ -2563,6 +2602,7 @@ createNewConnection(const char *name, remoteConn *rconn)
     if (found)
     {
         PQfinish(rconn->conn);
+        ReleaseExternalFD();
         pfree(rconn);

         ereport(ERROR,
@@ -2604,6 +2644,7 @@ dblink_security_check(PGconn *conn, remoteConn *rconn)
         if (!PQconnectionUsedPassword(conn))
         {
             PQfinish(conn);
+            ReleaseExternalFD();
             if (rconn)
                 pfree(rconn);

diff --git a/contrib/postgres_fdw/connection.c b/contrib/postgres_fdw/connection.c
index 29c811a..74576d7 100644
--- a/contrib/postgres_fdw/connection.c
+++ b/contrib/postgres_fdw/connection.c
@@ -20,6 +20,7 @@
 #include "miscadmin.h"
 #include "pgstat.h"
 #include "postgres_fdw.h"
+#include "storage/fd.h"
 #include "storage/latch.h"
 #include "utils/hsearch.h"
 #include "utils/inval.h"
@@ -259,10 +260,27 @@ connect_pg_server(ForeignServer *server, UserMapping *user)

         keywords[n] = values[n] = NULL;

-        /* verify connection parameters and make connection */
+        /* verify the set of connection parameters */
         check_conn_params(keywords, values, user);

+        /*
+         * We must obey fd.c's limit on non-virtual file descriptors.  Assume
+         * that a PGconn represents one long-lived FD.  (Doing this here also
+         * ensures that VFDs are closed if needed to make room.)
+         */
+        if (!AcquireExternalFD())
+            ereport(ERROR,
+                    (errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
+                     errmsg("could not connect to server \"%s\"",
+                            server->servername),
+                     errdetail("There are too many open files.")));
+
+        /* OK to make connection */
         conn = PQconnectdbParams(keywords, values, false);
+
+        if (!conn)
+            ReleaseExternalFD();    /* because the PG_CATCH block won't */
+
         if (!conn || PQstatus(conn) != CONNECTION_OK)
             ereport(ERROR,
                     (errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
@@ -294,7 +312,10 @@ connect_pg_server(ForeignServer *server, UserMapping *user)
     {
         /* Release PGconn data structure if we managed to create one */
         if (conn)
+        {
             PQfinish(conn);
+            ReleaseExternalFD();
+        }
         PG_RE_THROW();
     }
     PG_END_TRY();
@@ -312,6 +333,7 @@ disconnect_pg_server(ConnCacheEntry *entry)
     {
         PQfinish(entry->conn);
         entry->conn = NULL;
+        ReleaseExternalFD();
     }
 }

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index fd527f2..d19408b 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -774,6 +774,7 @@ static const char *const xlogSourceNames[] = {"any", "archive", "pg_wal", "strea
  * openLogFile is -1 or a kernel FD for an open log file segment.
  * openLogSegNo identifies the segment.  These variables are only used to
  * write the XLOG, and so will normally refer to the active segment.
+ * Note: call Reserve/ReleaseExternalFD to track consumption of this FD.
  */
 static int    openLogFile = -1;
 static XLogSegNo openLogSegNo = 0;
@@ -785,6 +786,9 @@ static XLogSegNo openLogSegNo = 0;
  * will be just past that page. readLen indicates how much of the current
  * page has been read into readBuf, and readSource indicates where we got
  * the currently open file from.
+ * Note: we could use Reserve/ReleaseExternalFD to track consumption of
+ * this FD too; but it doesn't currently seem worthwhile, since the XLOG is
+ * not read by general-purpose sessions.
  */
 static int    readFile = -1;
 static XLogSegNo readSegNo = 0;
@@ -2447,6 +2451,7 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
             /* create/use new log file */
             use_existent = true;
             openLogFile = XLogFileInit(openLogSegNo, &use_existent, true);
+            ReserveExternalFD();
         }

         /* Make sure we have the current logfile open */
@@ -2455,6 +2460,7 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
             XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
                             wal_segment_size);
             openLogFile = XLogFileOpen(openLogSegNo);
+            ReserveExternalFD();
         }

         /* Add current page to the set of pending pages-to-dump */
@@ -2605,6 +2611,7 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
                 XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
                                 wal_segment_size);
                 openLogFile = XLogFileOpen(openLogSegNo);
+                ReserveExternalFD();
             }

             issue_xlog_fsync(openLogFile, openLogSegNo);
@@ -3811,6 +3818,7 @@ XLogFileClose(void)
     }

     openLogFile = -1;
+    ReleaseExternalFD();
 }

 /*
@@ -5224,6 +5232,11 @@ BootStrapXLOG(void)
     use_existent = false;
     openLogFile = XLogFileInit(1, &use_existent, false);

+    /*
+     * We needn't bother with Reserve/ReleaseExternalFD here, since we'll
+     * close the file again in a moment.
+     */
+
     /* Write the first page with the initial record */
     errno = 0;
     pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_WRITE);
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index 59dc4f3..462b4d7 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -602,6 +602,9 @@ retry2:

     pg_freeaddrinfo_all(hints.ai_family, addrs);

+    /* Now that we have a long-lived socket, tell fd.c about it. */
+    ReserveExternalFD();
+
     return;

 startup_failed:
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index b3986be..cd61665 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -2554,9 +2554,14 @@ ClosePostmasterPorts(bool am_syslogger)
                 (errcode_for_file_access(),
                  errmsg_internal("could not close postmaster death monitoring pipe in child process: %m")));
     postmaster_alive_fds[POSTMASTER_FD_OWN] = -1;
+    /* Notify fd.c that we released one pipe FD. */
+    ReleaseExternalFD();
 #endif

-    /* Close the listen sockets */
+    /*
+     * Close the postmaster's listen sockets.  These aren't tracked by fd.c,
+     * so we don't call ReleaseExternalFD() here.
+     */
     for (i = 0; i < MAXLISTEN; i++)
     {
         if (ListenSocket[i] != PGINVALID_SOCKET)
@@ -2566,7 +2571,10 @@ ClosePostmasterPorts(bool am_syslogger)
         }
     }

-    /* If using syslogger, close the read side of the pipe */
+    /*
+     * If using syslogger, close the read side of the pipe.  We don't bother
+     * tracking this in fd.c, either.
+     */
     if (!am_syslogger)
     {
 #ifndef WIN32
@@ -4279,6 +4287,9 @@ BackendInitialize(Port *port)
     /* Save port etc. for ps status */
     MyProcPort = port;

+    /* Tell fd.c about the long-lived FD associated with the port */
+    ReserveExternalFD();
+
     /*
      * PreAuthDelay is a debugging aid for investigating problems in the
      * authentication cycle: it can be set in postgresql.conf to allow time to
@@ -6442,6 +6453,20 @@ restore_backend_variables(BackendParameters *param, Port *port)
     strlcpy(pkglib_path, param->pkglib_path, MAXPGPATH);

     strlcpy(ExtraOptions, param->ExtraOptions, MAXPGPATH);
+
+    /*
+     * We need to restore fd.c's counts of externally-opened FDs; to avoid
+     * confusion, be sure to do this after restoring max_safe_fds.  (Note:
+     * BackendInitialize will handle this for port->sock.)
+     */
+#ifndef WIN32
+    if (postmaster_alive_fds[0] >= 0)
+        ReserveExternalFD();
+    if (postmaster_alive_fds[1] >= 0)
+        ReserveExternalFD();
+#endif
+    if (pgStatSock != PGINVALID_SOCKET)
+        ReserveExternalFD();
 }


@@ -6584,6 +6609,10 @@ InitPostmasterDeathWatchHandle(void)
                 (errcode_for_file_access(),
                  errmsg_internal("could not create pipe to monitor postmaster death: %m")));

+    /* Notify fd.c that we've eaten two FDs for the pipe. */
+    ReserveExternalFD();
+    ReserveExternalFD();
+
     /*
      * Set O_NONBLOCK to allow testing for the fd's presence with a read()
      * call.
diff --git a/src/backend/postmaster/syslogger.c b/src/backend/postmaster/syslogger.c
index b2b69a7..cf7b535 100644
--- a/src/backend/postmaster/syslogger.c
+++ b/src/backend/postmaster/syslogger.c
@@ -562,6 +562,11 @@ SysLogger_Start(void)
      * This means the postmaster must continue to hold the read end of the
      * pipe open, so we can pass it down to the reincarnated syslogger. This
      * is a bit klugy but we have little choice.
+     *
+     * Also note that we don't bother counting the pipe FDs by calling
+     * Reserve/ReleaseExternalFD.  There's no real need to account for them
+     * accurately in the postmaster or syslogger process, and both ends of the
+     * pipe will wind up closed in all other postmaster children.
      */
 #ifndef WIN32
     if (syslogPipe[0] < 0)
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index b5f4df6..34f7443 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -61,6 +61,12 @@
  * BasicOpenFile, it is solely the caller's responsibility to close the file
  * descriptor by calling close(2).
  *
+ * If a non-virtual file descriptor needs to be held open for any length of
+ * time, report it to fd.c by calling AcquireExternalFD or ReserveExternalFD
+ * (and eventually ReleaseExternalFD), so that we can take it into account
+ * while deciding how many VFDs can be open.  This applies to FDs obtained
+ * with BasicOpenFile as well as those obtained without use of any fd.c API.
+ *
  *-------------------------------------------------------------------------
  */

@@ -103,8 +109,8 @@
 /*
  * We must leave some file descriptors free for system(), the dynamic loader,
  * and other code that tries to open files without consulting fd.c.  This
- * is the number left free.  (While we can be pretty sure we won't get
- * EMFILE, there's never any guarantee that we won't get ENFILE due to
+ * is the number left free.  (While we try fairly hard to prevent EMFILE
+ * errors, there's never any guarantee that we won't get ENFILE due to
  * other processes chewing up FDs.  So it's a bad idea to try to open files
  * without consulting fd.c.  Nonetheless we cannot control all code.)
  *
@@ -119,9 +125,12 @@

 /*
  * If we have fewer than this many usable FDs after allowing for the reserved
- * ones, choke.
+ * ones, choke.  (This value is chosen to work with "ulimit -n 64", but not
+ * much less than that.  Note that this value ensures numExternalFDs can be
+ * at least 16; as of this writing, the contrib/postgres_fdw regression tests
+ * will not pass unless that can grow to at least 14.)
  */
-#define FD_MINFREE                10
+#define FD_MINFREE                48

 /*
  * A number of platforms allow individual processes to open many more files
@@ -132,8 +141,8 @@
 int            max_files_per_process = 1000;

 /*
- * Maximum number of file descriptors to open for either VFD entries or
- * AllocateFile/AllocateDir/OpenTransientFile operations.  This is initialized
+ * Maximum number of file descriptors to open for operations that fd.c knows
+ * about (VFDs, AllocateFile etc, or "external" FDs).  This is initialized
  * to a conservative value, and remains that way indefinitely in bootstrap or
  * standalone-backend cases.  In normal postmaster operation, the postmaster
  * calls set_max_safe_fds() late in initialization to update the value, and
@@ -142,7 +151,7 @@ int            max_files_per_process = 1000;
  * Note: the value of max_files_per_process is taken into account while
  * setting this variable, and so need not be tested separately.
  */
-int            max_safe_fds = 32;    /* default if not changed */
+int            max_safe_fds = FD_MINFREE;    /* default if not changed */

 /* Whether it is safe to continue running after fsync() fails. */
 bool        data_sync_retry = false;
@@ -244,6 +253,11 @@ static int    maxAllocatedDescs = 0;
 static AllocateDesc *allocatedDescs = NULL;

 /*
+ * Number of open "external" FDs reported to Reserve/ReleaseExternalFD.
+ */
+static int    numExternalFDs = 0;
+
+/*
  * Number of temporary files opened during the current session;
  * this is used in generation of tempfile names.
  */
@@ -1025,6 +1039,80 @@ tryAgain:
     return -1;                    /* failure */
 }

+/*
+ * AcquireExternalFD - attempt to reserve an external file descriptor
+ *
+ * This should be used by callers that need to hold a file descriptor open
+ * over more than a short interval, but cannot use any of the other facilities
+ * provided by this module.
+ *
+ * The difference between this and the underlying ReserveExternalFD function
+ * is that this will report failure (by setting errno and returning false)
+ * if "too many" external FDs are already reserved.  This should be used in
+ * any code where the total number of FDs to be reserved is not predictable
+ * and small.
+ */
+bool
+AcquireExternalFD(void)
+{
+    /*
+     * We don't want more than max_safe_fds / 3 FDs to be consumed for
+     * "external" FDs.
+     */
+    if (numExternalFDs < max_safe_fds / 3)
+    {
+        ReserveExternalFD();
+        return true;
+    }
+    errno = EMFILE;
+    return false;
+}
+
+/*
+ * ReserveExternalFD - report external consumption of a file descriptor
+ *
+ * This should be used by callers that need to hold a file descriptor open
+ * over more than a short interval, but cannot use any of the other facilities
+ * provided by this module.  This just tracks the use of the FD and closes
+ * VFDs if needed to ensure we keep NUM_RESERVED_FDS FDs available.
+ *
+ * Call this directly only in code where failure to reserve the FD would be
+ * fatal; for example, the WAL-writing code does so, since the alternative is
+ * session failure.  Also, it's very unwise to do so in code that could
+ * consume more than one FD per process.
+ *
+ * Note: as long as everybody plays nice so that NUM_RESERVED_FDS FDs remain
+ * available, it doesn't matter too much whether this is called before or
+ * after actually opening the FD; but doing so beforehand reduces the risk of
+ * an EMFILE failure if not everybody played nice.  In any case, it's solely
+ * caller's responsibility to keep the external-FD count in sync with reality.
+ */
+void
+ReserveExternalFD(void)
+{
+    /*
+     * Release VFDs if needed to stay safe.  Because we do this before
+     * incrementing numExternalFDs, the final state will be as desired, i.e.,
+     * nfile + numAllocatedDescs + numExternalFDs <= max_safe_fds.
+     */
+    ReleaseLruFiles();
+
+    numExternalFDs++;
+}
+
+/*
+ * ReleaseExternalFD - report release of an external file descriptor
+ *
+ * This is guaranteed not to change errno, so it can be used in failure paths.
+ */
+void
+ReleaseExternalFD(void)
+{
+    Assert(numExternalFDs > 0);
+    numExternalFDs--;
+}
+
+
 #if defined(FDDEBUG)

 static void
@@ -1185,7 +1273,7 @@ ReleaseLruFile(void)
 static void
 ReleaseLruFiles(void)
 {
-    while (nfile + numAllocatedDescs >= max_safe_fds)
+    while (nfile + numAllocatedDescs + numExternalFDs >= max_safe_fds)
     {
         if (!ReleaseLruFile())
             break;
@@ -2176,13 +2264,13 @@ reserveAllocatedDesc(void)

     /*
      * If the array hasn't yet been created in the current process, initialize
-     * it with FD_MINFREE / 2 elements.  In many scenarios this is as many as
+     * it with FD_MINFREE / 3 elements.  In many scenarios this is as many as
      * we will ever need, anyway.  We don't want to look at max_safe_fds
      * immediately because set_max_safe_fds() may not have run yet.
      */
     if (allocatedDescs == NULL)
     {
-        newMax = FD_MINFREE / 2;
+        newMax = FD_MINFREE / 3;
         newDescs = (AllocateDesc *) malloc(newMax * sizeof(AllocateDesc));
         /* Out of memory already?  Treat as fatal error. */
         if (newDescs == NULL)
@@ -2200,10 +2288,12 @@ reserveAllocatedDesc(void)
      *
      * We mustn't let allocated descriptors hog all the available FDs, and in
      * practice we'd better leave a reasonable number of FDs for VFD use.  So
-     * set the maximum to max_safe_fds / 2.  (This should certainly be at
-     * least as large as the initial size, FD_MINFREE / 2.)
+     * set the maximum to max_safe_fds / 3.  (This should certainly be at
+     * least as large as the initial size, FD_MINFREE / 3, so we aren't
+     * tightening the restriction here.)  Recall that "external" FDs are
+     * allowed to consume another third of max_safe_fds.
      */
-    newMax = max_safe_fds / 2;
+    newMax = max_safe_fds / 3;
     if (newMax > maxAllocatedDescs)
     {
         newDescs = (AllocateDesc *) realloc(allocatedDescs,
diff --git a/src/backend/storage/ipc/dsm_impl.c b/src/backend/storage/ipc/dsm_impl.c
index 138bdec..1972aec 100644
--- a/src/backend/storage/ipc/dsm_impl.c
+++ b/src/backend/storage/ipc/dsm_impl.c
@@ -247,14 +247,17 @@ dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
     /*
      * Create new segment or open an existing one for attach.
      *
-     * Even though we're not going through fd.c, we should be safe against
-     * running out of file descriptors, because of NUM_RESERVED_FDS.  We're
-     * only opening one extra descriptor here, and we'll close it before
-     * returning.
+     * Even though we will close the FD before returning, it seems desirable
+     * to use Reserve/ReleaseExternalFD, to reduce the probability of EMFILE
+     * failure.  The fact that we won't hold the FD open long justifies using
+     * ReserveExternalFD rather than AcquireExternalFD, though.
      */
+    ReserveExternalFD();
+
     flags = O_RDWR | (op == DSM_OP_CREATE ? O_CREAT | O_EXCL : 0);
     if ((fd = shm_open(name, flags, PG_FILE_MODE_OWNER)) == -1)
     {
+        ReleaseExternalFD();
         if (errno != EEXIST)
             ereport(elevel,
                     (errcode_for_dynamic_shared_memory(),
@@ -278,6 +281,7 @@ dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
             /* Back out what's already been done. */
             save_errno = errno;
             close(fd);
+            ReleaseExternalFD();
             errno = save_errno;

             ereport(elevel,
@@ -295,6 +299,7 @@ dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
         /* Back out what's already been done. */
         save_errno = errno;
         close(fd);
+        ReleaseExternalFD();
         shm_unlink(name);
         errno = save_errno;

@@ -323,6 +328,7 @@ dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
         /* Back out what's already been done. */
         save_errno = errno;
         close(fd);
+        ReleaseExternalFD();
         if (op == DSM_OP_CREATE)
             shm_unlink(name);
         errno = save_errno;
@@ -336,6 +342,7 @@ dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
     *mapped_address = address;
     *mapped_size = request_size;
     close(fd);
+    ReleaseExternalFD();

     return true;
 }
diff --git a/src/backend/storage/ipc/latch.c b/src/backend/storage/ipc/latch.c
index cbd4952..046ca5c 100644
--- a/src/backend/storage/ipc/latch.c
+++ b/src/backend/storage/ipc/latch.c
@@ -51,6 +51,7 @@
 #include "port/atomics.h"
 #include "portability/instr_time.h"
 #include "postmaster/postmaster.h"
+#include "storage/fd.h"
 #include "storage/ipc.h"
 #include "storage/latch.h"
 #include "storage/pmsignal.h"
@@ -187,6 +188,9 @@ InitializeLatchSupport(void)
             /* Clean up, just for safety's sake; we'll set these below */
             selfpipe_readfd = selfpipe_writefd = -1;
             selfpipe_owner_pid = 0;
+            /* Keep fd.c's accounting straight */
+            ReleaseExternalFD();
+            ReleaseExternalFD();
         }
         else
         {
@@ -194,6 +198,7 @@ InitializeLatchSupport(void)
              * Postmaster didn't create a self-pipe ... or else we're in an
              * EXEC_BACKEND build, in which case it doesn't matter since the
              * postmaster's pipe FDs were closed by the action of FD_CLOEXEC.
+             * fd.c won't have state to clean up, either.
              */
             Assert(selfpipe_readfd == -1);
         }
@@ -228,6 +233,10 @@ InitializeLatchSupport(void)
     selfpipe_readfd = pipefd[0];
     selfpipe_writefd = pipefd[1];
     selfpipe_owner_pid = MyProcPid;
+
+    /* Tell fd.c about these two long-lived FDs */
+    ReserveExternalFD();
+    ReserveExternalFD();
 #else
     /* currently, nothing to do here for Windows */
 #endif
@@ -604,24 +613,57 @@ CreateWaitEventSet(MemoryContext context, int nevents)
     set->exit_on_postmaster_death = false;

 #if defined(WAIT_USE_EPOLL)
+    if (!AcquireExternalFD())
+    {
+        /* treat this as though epoll_create1 itself returned EMFILE */
+        elog(ERROR, "epoll_create1 failed: %m");
+    }
 #ifdef EPOLL_CLOEXEC
     set->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
     if (set->epoll_fd < 0)
+    {
+        ReleaseExternalFD();
         elog(ERROR, "epoll_create1 failed: %m");
+    }
 #else
     /* cope with ancient glibc lacking epoll_create1 (e.g., RHEL5) */
     set->epoll_fd = epoll_create(nevents);
     if (set->epoll_fd < 0)
+    {
+        ReleaseExternalFD();
         elog(ERROR, "epoll_create failed: %m");
+    }
     if (fcntl(set->epoll_fd, F_SETFD, FD_CLOEXEC) == -1)
+    {
+        int            save_errno = errno;
+
+        close(set->epoll_fd);
+        ReleaseExternalFD();
+        errno = save_errno;
         elog(ERROR, "fcntl(F_SETFD) failed on epoll descriptor: %m");
+    }
 #endif                            /* EPOLL_CLOEXEC */
 #elif defined(WAIT_USE_KQUEUE)
+    if (!AcquireExternalFD())
+    {
+        /* treat this as though kqueue itself returned EMFILE */
+        elog(ERROR, "kqueue failed: %m");
+    }
     set->kqueue_fd = kqueue();
     if (set->kqueue_fd < 0)
+    {
+        ReleaseExternalFD();
         elog(ERROR, "kqueue failed: %m");
+    }
     if (fcntl(set->kqueue_fd, F_SETFD, FD_CLOEXEC) == -1)
+    {
+        int            save_errno = errno;
+
+        close(set->kqueue_fd);
+        ReleaseExternalFD();
+        errno = save_errno;
         elog(ERROR, "fcntl(F_SETFD) failed on kqueue descriptor: %m");
+    }
     set->report_postmaster_not_running = false;
 #elif defined(WAIT_USE_WIN32)

@@ -655,8 +697,10 @@ FreeWaitEventSet(WaitEventSet *set)
 {
 #if defined(WAIT_USE_EPOLL)
     close(set->epoll_fd);
+    ReleaseExternalFD();
 #elif defined(WAIT_USE_KQUEUE)
     close(set->kqueue_fd);
+    ReleaseExternalFD();
 #elif defined(WAIT_USE_WIN32)
     WaitEvent  *cur_event;

diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h
index 51e2ece..2085c62 100644
--- a/src/include/storage/fd.h
+++ b/src/include/storage/fd.h
@@ -35,6 +35,10 @@
  * Likewise, use AllocateDir/FreeDir, not opendir/closedir, to allocate
  * open directories (DIR*), and OpenTransientFile/CloseTransientFile for an
  * unbuffered file descriptor.
+ *
+ * If you really can't use any of the above, at least call AcquireExternalFD
+ * or ReserveExternalFD to report any file descriptors that are held for any
+ * length of time.  Failure to do so risks unnecessary EMFILE errors.
  */
 #ifndef FD_H
 #define FD_H
@@ -120,7 +124,12 @@ extern int    CloseTransientFile(int fd);
 extern int    BasicOpenFile(const char *fileName, int fileFlags);
 extern int    BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode);

- /* Make a directory with default permissions */
+/* Use these for other cases, and also for long-lived BasicOpenFile FDs */
+extern bool AcquireExternalFD(void);
+extern void ReserveExternalFD(void);
+extern void ReleaseExternalFD(void);
+
+/* Make a directory with default permissions */
 extern int    MakePGDirectory(const char *directoryName);

 /* Miscellaneous support routines */

pgsql-hackers by date:

Previous
From: Justin Pryzby
Date:
Subject: v12 "won't fix" item regarding memory leak in "ATTACH PARTITIONwithout AEL"; (or, relcache ref counting)
Next
From: ilmari@ilmari.org (Dagfinn Ilmari Mannsåker)
Date:
Subject: Re: Fixing parallel make of libpq