Thread: A patch for xlog.c

A patch for xlog.c

From
Matthew Kirkwood
Date:
Hi,

Here is a patch against 7.1beta5 to use mmap(), and thus a
single write, to initialise xlogs.  It may well improve
performance of this on platforms/filesystems which write
metadata synchronously.

It needs a configure test, but certainly builds and runs
OK.

It also wraps the file reopening in an "ifdef WIN32", since
it certainly isn't needed for UNIX-like platforms (which I
assume includes BeOS).

Matthew.


diff -ruN postgresql-7.1beta5-clean/src/backend/access/transam/xlog.c
postgresql-7.1beta5/src/backend/access/transam/xlog.c
--- postgresql-7.1beta5-clean/src/backend/access/transam/xlog.c    Fri Feb 23 18:12:00 2001
+++ postgresql-7.1beta5/src/backend/access/transam/xlog.c    Sat Feb 24 15:23:41 2001
@@ -24,6 +24,10 @@
 #include <locale.h>
 #endif

+#ifdef    _HAVE_MMAP
+#include <sys/mman.h>
+#endif
+
 #include "access/transam.h"
 #include "access/xact.h"
 #include "catalog/catversion.h"
@@ -36,6 +40,7 @@
 #include "access/xlogutils.h"
 #include "utils/builtins.h"
 #include "utils/relcache.h"
+#include "utils/pfile.h"

 #include "miscadmin.h"

@@ -53,6 +58,10 @@
 StartUpID    ThisStartUpID = 0;
 XLogRecPtr    RedoRecPtr;

+#ifdef    _HAVE_MMAP
+void        *zmmap = NULL;
+#endif
+
 int            XLOG_DEBUG = 0;

 /* To read/update control file and create new log file */
@@ -955,7 +964,6 @@
 {
     char        path[MAXPGPATH];
     char        tpath[MAXPGPATH];
-    char        zbuffer[BLCKSZ];
     int            fd;
     int            nbytes;

@@ -987,28 +995,36 @@
         elog(STOP, "InitCreate(logfile %u seg %u) failed: %m",
              logId, logSeg);

-    /*
-     * Zero-fill the file.  We have to do this the hard way to ensure that
-     * all the file space has really been allocated --- on platforms that
-     * allow "holes" in files, just seeking to the end doesn't allocate
-     * intermediate space.  This way, we know that we have all the space
-     * and (after the fsync below) that all the indirect blocks are down
-     * on disk.  Therefore, fdatasync(2) will be sufficient to sync future
-     * writes to the log file.
-     */
-    MemSet(zbuffer, 0, sizeof(zbuffer));
-    for (nbytes = 0; nbytes < XLogSegSize; nbytes += sizeof(zbuffer))
+#ifdef    _HAVE_MMAP
+    if (!zmmap || (write(fd, zmmap, XLogSegSize) != XLogSegSize))
+#endif
     {
-        if ((int) write(fd, zbuffer, sizeof(zbuffer)) != (int) sizeof(zbuffer))
-            elog(STOP, "ZeroFill(logfile %u seg %u) failed: %m",
-                 logId, logSeg);
+        /*
+         * Zero-fill the file.  We have to do this the hard way to ensure that
+         * all the file space has really been allocated --- on platforms that
+         * allow "holes" in files, just seeking to the end doesn't allocate
+         * intermediate space.  This way, we know that we have all the space
+         * and (after the fsync below) that all the indirect blocks are down
+         * on disk.  Therefore, fdatasync(2) will be sufficient to sync future
+         * writes to the log file.
+         */
+        char        zbuffer[BLCKSZ];
+        MemSet(zbuffer, 0, sizeof(zbuffer));
+        for (nbytes = 0; nbytes < XLogSegSize; nbytes += sizeof(zbuffer))
+        {
+            if ((int) write(fd, zbuffer, sizeof(zbuffer)) != (int) sizeof(zbuffer))
+                elog(STOP, "ZeroFill(logfile %u seg %u) failed: %m",
+                     logId, logSeg);
+        }
     }

     if (pg_fsync(fd) != 0)
         elog(STOP, "fsync(logfile %u seg %u) failed: %m",
              logId, logSeg);

+#ifdef    WIN32
     close(fd);
+#endif

     /*
      * Prefer link() to rename() here just to be sure that we don't overwrite
@@ -1026,10 +1042,12 @@
              logId, logSeg);
 #endif

+#ifdef    WIN32
     fd = BasicOpenFile(path, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
     if (fd < 0)
         elog(STOP, "InitReopen(logfile %u seg %u) failed: %m",
              logId, logSeg);
+#endif

     return (fd);
 }
@@ -1255,11 +1273,8 @@
     if (noBlck || readOff != (RecPtr->xrecoff % XLogSegSize) / BLCKSZ)
     {
         readOff = (RecPtr->xrecoff % XLogSegSize) / BLCKSZ;
-        if (lseek(readFile, (off_t) (readOff * BLCKSZ), SEEK_SET) < 0)
-            elog(STOP, "ReadRecord: lseek(logfile %u seg %u off %u) failed: %m",
-                 readId, readSeg, readOff);
-        if (read(readFile, readBuf, BLCKSZ) != BLCKSZ)
-            elog(STOP, "ReadRecord: read(logfile %u seg %u off %u) failed: %m",
+        if (pg_pread(readFile, readBuf, BLCKSZ, (readOff * BLCKSZ)) != BLCKSZ)
+            elog(STOP, "ReadRecord: pg_pread(logfile %u seg %u off %u) failed: %m",
                  readId, readSeg, readOff);
         if (((XLogPageHeader) readBuf)->xlp_magic != XLOG_PAGE_MAGIC)
         {
@@ -1415,19 +1430,13 @@
         elog(LOG, "Formatting logfile %u seg %u block %u at offset %u",
              readId, readSeg, readOff, EndRecPtr.xrecoff % BLCKSZ);
         readFile = XLogFileOpen(readId, readSeg, false);
-        if (lseek(readFile, (off_t) (readOff * BLCKSZ), SEEK_SET) < 0)
-            elog(STOP, "ReadRecord: lseek(logfile %u seg %u off %u) failed: %m",
-                 readId, readSeg, readOff);
-        if (read(readFile, readBuf, BLCKSZ) != BLCKSZ)
-            elog(STOP, "ReadRecord: read(logfile %u seg %u off %u) failed: %m",
+        if (pg_pread(readFile, readBuf, BLCKSZ, (readOff * BLCKSZ)) != BLCKSZ)
+            elog(STOP, "ReadRecord: pg_pread(logfile %u seg %u off %u) failed: %m",
                  readId, readSeg, readOff);
         memset(readBuf + EndRecPtr.xrecoff % BLCKSZ, 0,
                BLCKSZ - EndRecPtr.xrecoff % BLCKSZ);
-        if (lseek(readFile, (off_t) (readOff * BLCKSZ), SEEK_SET) < 0)
-            elog(STOP, "ReadRecord: lseek(logfile %u seg %u off %u) failed: %m",
-                 readId, readSeg, readOff);
-        if (write(readFile, readBuf, BLCKSZ) != BLCKSZ)
-            elog(STOP, "ReadRecord: write(logfile %u seg %u off %u) failed: %m",
+        if (pg_pwrite(readFile, readBuf, BLCKSZ, (readOff * BLCKSZ)) != BLCKSZ)
+            elog(STOP, "ReadRecord: pg_pwrite(logfile %u seg %u off %u) failed: %m",
                  readId, readSeg, readOff);
         readOff++;
     }
@@ -1797,6 +1806,28 @@
     return buf;
 }

+
+#ifdef    _HAVE_MMAP
+static void
+ZeroMapInit(void)
+{
+    int zfd;
+
+    zfd = BasicOpenFile("/dev/zero", O_RDONLY, 0);
+    if (zfd < 0) {
+        elog(LOG, "Can't open /dev/zero: %m");
+        return;
+    }
+    zmmap = mmap(NULL, XLogSegSize, PROT_READ, MAP_SHARED, zfd, 0);
+    if (!zmmap)
+        elog(LOG, "Can't mmap /dev/zero: %m");
+    close(zfd);
+}
+#else
+#define    ZeroMapInit()
+#endif
+
+
 /*
  * This func must be called ONCE on system startup
  */
@@ -1811,6 +1842,9 @@
     char        buffer[_INTL_MAXLOGRECSZ + SizeOfXLogRecord];

     elog(LOG, "starting up");
+
+    ZeroMapInit();
+
     CritSectionCount++;

     XLogCtl->xlblocks = (XLogRecPtr *) (((char *) XLogCtl) + sizeof(XLogCtlData));


Re: A patch for xlog.c

From
Tom Lane
Date:
Matthew Kirkwood <matthew@hairy.beasts.org> writes:
> Here is a patch against 7.1beta5 to use mmap(), and thus a
> single write, to initialise xlogs.  It may well improve
> performance of this on platforms/filesystems which write
> metadata synchronously.

Have you *demonstrated* any actual performance improvement from this?
How much?  On what platforms?

I don't believe in adding unportable alternative implementations without
pretty darn compelling reasons ...

            regards, tom lane

Re: A patch for xlog.c

From
Matthew Kirkwood
Date:
On Sat, 24 Feb 2001, Tom Lane wrote:

> > Here is a patch against 7.1beta5 to use mmap(), and thus a
> > single write, to initialise xlogs.  It may well improve
> > performance of this on platforms/filesystems which write
> > metadata synchronously.
>
> Have you *demonstrated* any actual performance improvement from this?
> How much?  On what platforms?

Forgive me if I posted it to the wrong place -- I was far from
proposing this for inclusion.  It is but a small step on the
way to my plan of mmap()ifying all of the WAL stuff (which may
also prove a waste of effort).

On Linux 2.4 w/asynchronous ext2, it's good for about 5%, which
certainly wouldn't alone be worth the effort.  I tried synchronous
ext2, but the numbers were so poor with both that nobody who cared
about performance would be using it (1.2 sec per file, vs. over a
minute).

I don't have access to any kind of machine running UFS/FFS.  Perhaps
someone on the list might do me the favour of trying the attached
test on such a platform with synchronous metadata writes (see top
of file for #ifdefs).

> I don't believe in adding unportable alternative implementations
> without pretty darn compelling reasons ...

mmap() is hardly unportable.  From a quick look, all the current
names in include/port/ (which must surely make up a vast majority
of deployed recent postgresql versions) except QNX and Win32 can
support POSIX mmap.

Thanks for the reply,

Matthew.

Attachment

Re: A patch for xlog.c

From
Bruce Momjian
Date:
I am confused why mmap() is better than writing to a real file.  Don't
we need to write to a real file so it is available for database
recovery?


> Hi,
>
> Here is a patch against 7.1beta5 to use mmap(), and thus a
> single write, to initialise xlogs.  It may well improve
> performance of this on platforms/filesystems which write
> metadata synchronously.
>
> It needs a configure test, but certainly builds and runs
> OK.
>
> It also wraps the file reopening in an "ifdef WIN32", since
> it certainly isn't needed for UNIX-like platforms (which I
> assume includes BeOS).
>
> Matthew.
>
>
> diff -ruN postgresql-7.1beta5-clean/src/backend/access/transam/xlog.c
postgresql-7.1beta5/src/backend/access/transam/xlog.c
> --- postgresql-7.1beta5-clean/src/backend/access/transam/xlog.c    Fri Feb 23 18:12:00 2001
> +++ postgresql-7.1beta5/src/backend/access/transam/xlog.c    Sat Feb 24 15:23:41 2001
> @@ -24,6 +24,10 @@
>  #include <locale.h>
>  #endif
>
> +#ifdef    _HAVE_MMAP
> +#include <sys/mman.h>
> +#endif
> +
>  #include "access/transam.h"
>  #include "access/xact.h"
>  #include "catalog/catversion.h"
> @@ -36,6 +40,7 @@
>  #include "access/xlogutils.h"
>  #include "utils/builtins.h"
>  #include "utils/relcache.h"
> +#include "utils/pfile.h"
>
>  #include "miscadmin.h"
>
> @@ -53,6 +58,10 @@
>  StartUpID    ThisStartUpID = 0;
>  XLogRecPtr    RedoRecPtr;
>
> +#ifdef    _HAVE_MMAP
> +void        *zmmap = NULL;
> +#endif
> +
>  int            XLOG_DEBUG = 0;
>
>  /* To read/update control file and create new log file */
> @@ -955,7 +964,6 @@
>  {
>      char        path[MAXPGPATH];
>      char        tpath[MAXPGPATH];
> -    char        zbuffer[BLCKSZ];
>      int            fd;
>      int            nbytes;
>
> @@ -987,28 +995,36 @@
>          elog(STOP, "InitCreate(logfile %u seg %u) failed: %m",
>               logId, logSeg);
>
> -    /*
> -     * Zero-fill the file.  We have to do this the hard way to ensure that
> -     * all the file space has really been allocated --- on platforms that
> -     * allow "holes" in files, just seeking to the end doesn't allocate
> -     * intermediate space.  This way, we know that we have all the space
> -     * and (after the fsync below) that all the indirect blocks are down
> -     * on disk.  Therefore, fdatasync(2) will be sufficient to sync future
> -     * writes to the log file.
> -     */
> -    MemSet(zbuffer, 0, sizeof(zbuffer));
> -    for (nbytes = 0; nbytes < XLogSegSize; nbytes += sizeof(zbuffer))
> +#ifdef    _HAVE_MMAP
> +    if (!zmmap || (write(fd, zmmap, XLogSegSize) != XLogSegSize))
> +#endif
>      {
> -        if ((int) write(fd, zbuffer, sizeof(zbuffer)) != (int) sizeof(zbuffer))
> -            elog(STOP, "ZeroFill(logfile %u seg %u) failed: %m",
> -                 logId, logSeg);
> +        /*
> +         * Zero-fill the file.  We have to do this the hard way to ensure that
> +         * all the file space has really been allocated --- on platforms that
> +         * allow "holes" in files, just seeking to the end doesn't allocate
> +         * intermediate space.  This way, we know that we have all the space
> +         * and (after the fsync below) that all the indirect blocks are down
> +         * on disk.  Therefore, fdatasync(2) will be sufficient to sync future
> +         * writes to the log file.
> +         */
> +        char        zbuffer[BLCKSZ];
> +        MemSet(zbuffer, 0, sizeof(zbuffer));
> +        for (nbytes = 0; nbytes < XLogSegSize; nbytes += sizeof(zbuffer))
> +        {
> +            if ((int) write(fd, zbuffer, sizeof(zbuffer)) != (int) sizeof(zbuffer))
> +                elog(STOP, "ZeroFill(logfile %u seg %u) failed: %m",
> +                     logId, logSeg);
> +        }
>      }
>
>      if (pg_fsync(fd) != 0)
>          elog(STOP, "fsync(logfile %u seg %u) failed: %m",
>               logId, logSeg);
>
> +#ifdef    WIN32
>      close(fd);
> +#endif
>
>      /*
>       * Prefer link() to rename() here just to be sure that we don't overwrite
> @@ -1026,10 +1042,12 @@
>               logId, logSeg);
>  #endif
>
> +#ifdef    WIN32
>      fd = BasicOpenFile(path, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
>      if (fd < 0)
>          elog(STOP, "InitReopen(logfile %u seg %u) failed: %m",
>               logId, logSeg);
> +#endif
>
>      return (fd);
>  }
> @@ -1255,11 +1273,8 @@
>      if (noBlck || readOff != (RecPtr->xrecoff % XLogSegSize) / BLCKSZ)
>      {
>          readOff = (RecPtr->xrecoff % XLogSegSize) / BLCKSZ;
> -        if (lseek(readFile, (off_t) (readOff * BLCKSZ), SEEK_SET) < 0)
> -            elog(STOP, "ReadRecord: lseek(logfile %u seg %u off %u) failed: %m",
> -                 readId, readSeg, readOff);
> -        if (read(readFile, readBuf, BLCKSZ) != BLCKSZ)
> -            elog(STOP, "ReadRecord: read(logfile %u seg %u off %u) failed: %m",
> +        if (pg_pread(readFile, readBuf, BLCKSZ, (readOff * BLCKSZ)) != BLCKSZ)
> +            elog(STOP, "ReadRecord: pg_pread(logfile %u seg %u off %u) failed: %m",
>                   readId, readSeg, readOff);
>          if (((XLogPageHeader) readBuf)->xlp_magic != XLOG_PAGE_MAGIC)
>          {
> @@ -1415,19 +1430,13 @@
>          elog(LOG, "Formatting logfile %u seg %u block %u at offset %u",
>               readId, readSeg, readOff, EndRecPtr.xrecoff % BLCKSZ);
>          readFile = XLogFileOpen(readId, readSeg, false);
> -        if (lseek(readFile, (off_t) (readOff * BLCKSZ), SEEK_SET) < 0)
> -            elog(STOP, "ReadRecord: lseek(logfile %u seg %u off %u) failed: %m",
> -                 readId, readSeg, readOff);
> -        if (read(readFile, readBuf, BLCKSZ) != BLCKSZ)
> -            elog(STOP, "ReadRecord: read(logfile %u seg %u off %u) failed: %m",
> +        if (pg_pread(readFile, readBuf, BLCKSZ, (readOff * BLCKSZ)) != BLCKSZ)
> +            elog(STOP, "ReadRecord: pg_pread(logfile %u seg %u off %u) failed: %m",
>                   readId, readSeg, readOff);
>          memset(readBuf + EndRecPtr.xrecoff % BLCKSZ, 0,
>                 BLCKSZ - EndRecPtr.xrecoff % BLCKSZ);
> -        if (lseek(readFile, (off_t) (readOff * BLCKSZ), SEEK_SET) < 0)
> -            elog(STOP, "ReadRecord: lseek(logfile %u seg %u off %u) failed: %m",
> -                 readId, readSeg, readOff);
> -        if (write(readFile, readBuf, BLCKSZ) != BLCKSZ)
> -            elog(STOP, "ReadRecord: write(logfile %u seg %u off %u) failed: %m",
> +        if (pg_pwrite(readFile, readBuf, BLCKSZ, (readOff * BLCKSZ)) != BLCKSZ)
> +            elog(STOP, "ReadRecord: pg_pwrite(logfile %u seg %u off %u) failed: %m",
>                   readId, readSeg, readOff);
>          readOff++;
>      }
> @@ -1797,6 +1806,28 @@
>      return buf;
>  }
>
> +
> +#ifdef    _HAVE_MMAP
> +static void
> +ZeroMapInit(void)
> +{
> +    int zfd;
> +
> +    zfd = BasicOpenFile("/dev/zero", O_RDONLY, 0);
> +    if (zfd < 0) {
> +        elog(LOG, "Can't open /dev/zero: %m");
> +        return;
> +    }
> +    zmmap = mmap(NULL, XLogSegSize, PROT_READ, MAP_SHARED, zfd, 0);
> +    if (!zmmap)
> +        elog(LOG, "Can't mmap /dev/zero: %m");
> +    close(zfd);
> +}
> +#else
> +#define    ZeroMapInit()
> +#endif
> +
> +
>  /*
>   * This func must be called ONCE on system startup
>   */
> @@ -1811,6 +1842,9 @@
>      char        buffer[_INTL_MAXLOGRECSZ + SizeOfXLogRecord];
>
>      elog(LOG, "starting up");
> +
> +    ZeroMapInit();
> +
>      CritSectionCount++;
>
>      XLogCtl->xlblocks = (XLogRecPtr *) (((char *) XLogCtl) + sizeof(XLogCtlData));
>
>


--
  Bruce Momjian                        |  http://candle.pha.pa.us
  pgman@candle.pha.pa.us               |  (610) 853-3000
  +  If your life is a hard drive,     |  830 Blythe Avenue
  +  Christ can be your backup.        |  Drexel Hill, Pennsylvania 19026

Re: A patch for xlog.c

From
Tom Lane
Date:
Matthew Kirkwood <matthew@hairy.beasts.org> writes:
> Forgive me if I posted it to the wrong place -- I was far from
> proposing this for inclusion.

Diffs posted to pgsql-patches are generally considered to be requests
for application of a patch.  If this is only an experiment it had best
be clearly labeled as such.

> It is but a small step on the way to my plan of mmap()ifying all of
> the WAL stuff (which may also prove a waste of effort).

Very probably.  What are your grounds for thinking that's a good idea?
I can't see any reason to think that mmap is more efficient than write
for simple sequential writes, which is what we need to do.

            regards, tom lane

Re: A patch for xlog.c

From
Matthew Kirkwood
Date:
On Sat, 24 Feb 2001, Bruce Momjian wrote:

> I am confused why mmap() is better than writing to a real file.

It isn't, except that it allows the logfile to be initialised in
one syscall, without first allocating and zeroing (and hence
dirtying) 16Mb of memory.

> Don't we need to write to a real file so it is available for database
> recovery?

The mmap isn't used for the destination, but for the source;
it's just a cheap way to get your hands on 16Mb of zeroes.

Matthew.


Re: A patch for xlog.c

From
Matthew Kirkwood
Date:
On Sat, 24 Feb 2001, Tom Lane wrote:

> > Forgive me if I posted it to the wrong place -- I was far from
> > proposing this for inclusion.
>
> Diffs posted to pgsql-patches are generally considered to be requests
> for application of a patch.  If this is only an experiment it had best
> be clearly labeled as such.

OK.  Is there a better place for discussion of such?

> > It is but a small step on the way to my plan of mmap()ifying all
> > of the WAL stuff (which may also prove a waste of effort).
>
> Very probably.  What are your grounds for thinking that's a good idea?
> I can't see any reason to think that mmap is more efficient than write
> for simple sequential writes, which is what we need to do.

Potential pros:

a. msync(MS_ASYNC) seems to be exactly
b. Potential to reduce contention
c. Removing syscalls is rarely a bad thing
d. Fewer copies, better cache behaviour

Potential cons:

a. Portability
b. A bad pointer can cause a scribble on the log

Matthew.


Re: A patch for xlog.c

From
Tom Lane
Date:
Matthew Kirkwood <matthew@hairy.beasts.org> writes:
>> Diffs posted to pgsql-patches are generally considered to be requests
>> for application of a patch.  If this is only an experiment it had best
>> be clearly labeled as such.

> OK.  Is there are better place for discussion of such?

pgsql-hackers is the place to discuss anything that's experimental or
otherwise concerned with future development.

> [ possible merits of mmap ]

Let's take up that discussion in pghackers.

            regards, tom lane

Re: A patch for xlog.c

From
Bruce Momjian
Date:
> Matthew Kirkwood <matthew@hairy.beasts.org> writes:
> >> Diffs posted to pgsql-patches are generally considered to be requests
> >> for application of a patch.  If this is only an experiment it had best
> >> be clearly labeled as such.
>
> > OK.  Is there are better place for discussion of such?
>
> pgsql-hackers is the place to discuss anything that's experimental or
> otherwise concerned with future development.
>
> > [ possible merits of mmap ]
>
> Let's take up that discussion in pghackers.

I always felt the real benefit of mmap() would be to remove use of SysV
shared memory and use anon mmap() to prevent problems with SysV shared
memory limits.


--
  Bruce Momjian                        |  http://candle.pha.pa.us
  pgman@candle.pha.pa.us               |  (610) 853-3000
  +  If your life is a hard drive,     |  830 Blythe Avenue
  +  Christ can be your backup.        |  Drexel Hill, Pennsylvania 19026

Re: A patch for xlog.c

From
The Hermit Hacker
Date:
On Sat, 24 Feb 2001, Bruce Momjian wrote:

> > Matthew Kirkwood <matthew@hairy.beasts.org> writes:
> > >> Diffs posted to pgsql-patches are generally considered to be requests
> > >> for application of a patch.  If this is only an experiment it had best
> > >> be clearly labeled as such.
> >
> > > OK.  Is there are better place for discussion of such?
> >
> > pgsql-hackers is the place to discuss anything that's experimental or
> > otherwise concerned with future development.
> >
> > > [ possible merits of mmap ]
> >
> > Let's take up that discussion in pghackers.
>
> I always felt the real benefit of mmap() would be to remove use of SysV
> shared memory and use anon mmap() to prevent problems with SysV share
> memory limits.

You'll still have memory limits to overcome ... per user memory limits
being one ... there is no such thing as a 'cure-all' ...



Re: A patch for xlog.c

From
Bruce Momjian
Date:
> > > pgsql-hackers is the place to discuss anything that's experimental or
> > > otherwise concerned with future development.
> > >
> > > > [ possible merits of mmap ]
> > >
> > > Let's take up that discussion in pghackers.
> >
> > I always felt the real benefit of mmap() would be to remove use of SysV
> > shared memory and use anon mmap() to prevent problems with SysV share
> > memory limits.
>
> You'll still have memory limits to overcome ... per user memory limits
> being one ... there is no such thing as a 'cure-all' ...

Yes, but typical SysV shared memory limits are much lower than
per-process limits.

--
  Bruce Momjian                        |  http://candle.pha.pa.us
  pgman@candle.pha.pa.us               |  (610) 853-3000
  +  If your life is a hard drive,     |  830 Blythe Avenue
  +  Christ can be your backup.        |  Drexel Hill, Pennsylvania 19026

Re: A patch for xlog.c

From
The Hermit Hacker
Date:
On Sun, 25 Feb 2001, Bruce Momjian wrote:

> > > > pgsql-hackers is the place to discuss anything that's experimental or
> > > > otherwise concerned with future development.
> > > >
> > > > > [ possible merits of mmap ]
> > > >
> > > > Let's take up that discussion in pghackers.
> > >
> > > I always felt the real benefit of mmap() would be to remove use of SysV
> > > shared memory and use anon mmap() to prevent problems with SysV share
> > > memory limits.
> >
> > You'll still have memory limits to overcome ... per user memory limits
> > being one ... there is no such thing as a 'cure-all' ...
>
> Yes, but typical SysV shared memory limits are much lower than
> per-process limits.

well, come up with suitable patches for v7.2 and we can see where it goes
... you seem to think mmap() will do what we require, but, so far, have
been unable to convince anyone to dedicate the time to converting to using
it.  "having to raise/set SysV limits", IMHO, isn't worth the overhaul
that I see having to happen, but, if you can show us the benefits of doing
it other than removing a 'one time administrative config' of an OS, I
imagine that nobody will be able to argue it ...



Re: A patch for xlog.c

From
Bruce Momjian
Date:
> > Yes, but typical SysV shared memory limits are much lower than
> > per-process limits.
>
> well, come up with suitable patches for v7.2 and we can see where it goes
> ... you seem to think mmap() will do what we require, but, so far, have
> been unable to convince anyone to dedicate the time to converting to using
> it.  "having to raise/set SysV limits", IMHO, isn't worth the overhaul
> that I see having to happen, but, if you can show us the benefits of doing
> it other then removing a 'one time administrative config' of an OS, I
> imagine that nobody will be able to argue it ...

Yea, it is pretty low priority, especially since most OS's don't support
ANON mmap().  Most BSD's support it, but I don't think Linux or others
do.

--
  Bruce Momjian                        |  http://candle.pha.pa.us
  pgman@candle.pha.pa.us               |  (610) 853-3000
  +  If your life is a hard drive,     |  830 Blythe Avenue
  +  Christ can be your backup.        |  Drexel Hill, Pennsylvania 19026

Re: A patch for xlog.c

From
The Hermit Hacker
Date:
On Sun, 25 Feb 2001, Bruce Momjian wrote:

> > > Yes, but typical SysV shared memory limits are much lower than
> > > per-process limits.
> >
> > well, come up with suitable patches for v7.2 and we can see where it goes
> > ... you seem to think mmap() will do what we require, but, so far, have
> > been unable to convince anyone to dedicate the time to converting to using
> > it.  "having to raise/set SysV limits", IMHO, isn't worth the overhaul
> > that I see having to happen, but, if you can show us the benefits of doing
> > it other then removing a 'one time administrative config' of an OS, I
> > imagine that nobody will be able to argue it ...
>
> Yea, it is pretty low priority, especially since most OS's don't support
> ANON mmap().  Most BSD's support it, but I don't think Linux or others
> do.

ah, then not a low priority, a non-starter, period ... maybe when all the
OSs we support move to supporting ANON mmap() :(


Re: A patch for xlog.c

From
Bruce Momjian
Date:
> On Sun, 25 Feb 2001, Bruce Momjian wrote:
>
> > > > Yes, but typical SysV shared memory limits are much lower than
> > > > per-process limits.
> > >
> > > well, come up with suitable patches for v7.2 and we can see where it goes
> > > ... you seem to think mmap() will do what we require, but, so far, have
> > > been unable to convince anyone to dedicate the time to converting to using
> > > it.  "having to raise/set SysV limits", IMHO, isn't worth the overhaul
> > > that I see having to happen, but, if you can show us the benefits of doing
> > > it other then removing a 'one time administrative config' of an OS, I
> > > imagine that nobody will be able to argue it ...
> >
> > Yea, it is pretty low priority, especially since most OS's don't support
> > ANON mmap().  Most BSD's support it, but I don't think Linux or others
> > do.
>
> ah, then not a low priority, a non-starter, period ... maybe when all the
> OSs we support move to supporting ANON mmap() :(

Yea, we would have to take a poll to see if the majority support it.
Right now, I think it is clearly a minority, and not worth the added
confusion for a few platforms.

--
  Bruce Momjian                        |  http://candle.pha.pa.us
  pgman@candle.pha.pa.us               |  (610) 853-3000
  +  If your life is a hard drive,     |  830 Blythe Avenue
  +  Christ can be your backup.        |  Drexel Hill, Pennsylvania 19026

Re: A patch for xlog.c

From
Peter Eisentraut
Date:
The Hermit Hacker writes:

> > Yea, it is pretty low priority, especially since most OS's don't support
> > ANON mmap().  Most BSD's support it, but I don't think Linux or others
> > do.
>
> ah, then not a low priority, a non-starter, period ... maybe when all the
> OSs we support move to supporting ANON mmap() :(

It would be worthwhile for those operating systems that don't have SysV
shared memory but do have mmap().  But I don't have one of those, so I
ain't gonna do it.  ;-)

--
Peter Eisentraut      peter_e@gmx.net       http://yi.org/peter-e/


Re: A patch for xlog.c

From
Bruce Momjian
Date:
> The Hermit Hacker writes:
>
> > > Yea, it is pretty low priority, especially since most OS's don't support
> > > ANON mmap().  Most BSD's support it, but I don't think Linux or others
> > > do.
> >
> > ah, then not a low priority, a non-starter, period ... maybe when all the
> > OSs we support move to supporting ANON mmap() :(
>
> It would be worthwhile for those operating systems that don't have SysV
> shared memory but do have mmap().  But I don't have one of those, so I
> ain't gonna do it.  ;-)

All have SysV memory.  mmap() usage is only useful in enabling larger
buffers without kernel changes.

--
  Bruce Momjian                        |  http://candle.pha.pa.us
  pgman@candle.pha.pa.us               |  (610) 853-3000
  +  If your life is a hard drive,     |  830 Blythe Avenue
  +  Christ can be your backup.        |  Drexel Hill, Pennsylvania 19026

Re: A patch for xlog.c

From
Matthew Kirkwood
Date:
On Tue, 27 Feb 2001, Bruce Momjian wrote:

> mmap() usage is only useful in enabling larger
> buffers without kernel changes.

My plan was not to replace the shared buffer pool with an
mmap()ed area, but rather to use mmap() on the data files
themselves to eliminate it.

Clearly this is rather controversial, since it may have
safety implications, but it should allow the kernel better
to choose what to cache.

Matthew.


Re: A patch for xlog.c

From
Peter Eisentraut
Date:
Bruce Momjian writes:

> All have SysV memory.

All that we currently support...

--
Peter Eisentraut      peter_e@gmx.net       http://yi.org/peter-e/