Re: [GENERAL] pg_upgrade from 9.5 to 9.6 fails with "invalid argument" - Mailing list pgsql-hackers

From Tom Lane
Subject Re: [GENERAL] pg_upgrade from 9.5 to 9.6 fails with "invalid argument"
Date
Msg-id 29413.1475253531@sss.pgh.pa.us
Whole thread Raw
In response to Re: [GENERAL] pg_upgrade from 9.5 to 9.6 fails with "invalid argument"  (Masahiko Sawada <sawada.mshk@gmail.com>)
List pgsql-hackers
Masahiko Sawada <sawada.mshk@gmail.com> writes:

> +#ifndef WIN32
>      if ((src_fd = open(fromfile, O_RDONLY, 0)) < 0)
> +#else
> +    if ((src_fd = open(fromfile, O_RDONLY | O_BINARY)) < 0)
> +#endif

This is easier with PG_BINARY.  Also, both open() calls need this.

I'm rather inclined to also stick PG_BINARY into the open() calls in
copy_file() as well.  That function isn't actively broken since it's
not used under Windows, but I think it's highly likely that the
current bug arose from copying-and-pasting that code, so leaving it
in a state that's unsafe for Windows is just asking for trouble.

Attached is the patch I'm currently working with to fix all three
of the portability issues here (PG_BINARY, endianness, alignment,
and there's some cosmetic changes too).

> +            pg_log(PG_WARNING,
> +                   "could not read expected bytes: read = %u, expected = %u\n",
> +                   BLCKSZ, bytesRead);

I think this is probably better done as part of a wholesale revision
of the error reporting in this file.  What I have in mind is to
redefine copyFile, linkFile, and rewriteVisibilityMap as all being
responsible for reporting their own errors and then exiting (so
they can just return void).  We'd need to pass in the schema name
and table name so that they can include that context, so that the
reports don't get any less complete than they were before, but that
does not seem like a big downside.

A variant would be to have them print the error messages but then
return a bool success flag, leaving the caller to decide whether
it's fatal or not.  But that would be more complicated and I see
no real reason to think pg_upgrade would ever need it.

            regards, tom lane

diff --git a/src/bin/pg_upgrade/file.c b/src/bin/pg_upgrade/file.c
index b33f0b4..9ae92a9 100644
*** a/src/bin/pg_upgrade/file.c
--- b/src/bin/pg_upgrade/file.c
***************
*** 18,25 ****
  #include <sys/stat.h>
  #include <fcntl.h>

- #define BITS_PER_HEAPBLOCK_OLD 1
-

  #ifndef WIN32
  static int    copy_file(const char *fromfile, const char *tofile);
--- 18,23 ----
*************** copy_file(const char *srcfile, const cha
*** 84,93 ****
          return -1;
      }

!     if ((src_fd = open(srcfile, O_RDONLY, 0)) < 0)
          return -1;

!     if ((dest_fd = open(dstfile, O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR)) < 0)
      {
          save_errno = errno;

--- 82,92 ----
          return -1;
      }

!     if ((src_fd = open(srcfile, O_RDONLY | PG_BINARY, 0)) < 0)
          return -1;

!     if ((dest_fd = open(dstfile, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
!                         S_IRUSR | S_IWUSR)) < 0)
      {
          save_errno = errno;

*************** copy_file(const char *srcfile, const cha
*** 153,183 ****
   * version, we could refuse to copy visibility maps from the old cluster
   * to the new cluster; the next VACUUM would recreate them, but at the
   * price of scanning the entire table.  So, instead, we rewrite the old
!  * visibility maps in the new format.  That way, the all-visible bit
!  * remains set for the pages for which it was set previously.  The
!  * all-frozen bit is never set by this conversion; we leave that to
!  * VACUUM.
   */
  const char *
  rewriteVisibilityMap(const char *fromfile, const char *tofile)
  {
!     int            src_fd = 0;
!     int            dst_fd = 0;
!     char        buffer[BLCKSZ];
!     ssize_t        bytesRead;
      ssize_t        totalBytesRead = 0;
      ssize_t        src_filesize;
      int            rewriteVmBytesPerPage;
      BlockNumber new_blkno = 0;
      struct stat statbuf;

!     /* Compute we need how many old page bytes to rewrite a new page */
      rewriteVmBytesPerPage = (BLCKSZ - SizeOfPageHeaderData) / 2;

      if ((fromfile == NULL) || (tofile == NULL))
          return "Invalid old file or new file";

!     if ((src_fd = open(fromfile, O_RDONLY, 0)) < 0)
          return getErrorText();

      if (fstat(src_fd, &statbuf) != 0)
--- 152,181 ----
   * version, we could refuse to copy visibility maps from the old cluster
   * to the new cluster; the next VACUUM would recreate them, but at the
   * price of scanning the entire table.  So, instead, we rewrite the old
!  * visibility maps in the new format.  That way, the all-visible bits
!  * remain set for the pages for which they were set previously.  The
!  * all-frozen bits are never set by this conversion; we leave that to VACUUM.
   */
  const char *
  rewriteVisibilityMap(const char *fromfile, const char *tofile)
  {
!     int            src_fd;
!     int            dst_fd;
!     char       *buffer;
!     char       *new_vmbuf;
      ssize_t        totalBytesRead = 0;
      ssize_t        src_filesize;
      int            rewriteVmBytesPerPage;
      BlockNumber new_blkno = 0;
      struct stat statbuf;

!     /* Compute number of old-format bytes per new page */
      rewriteVmBytesPerPage = (BLCKSZ - SizeOfPageHeaderData) / 2;

      if ((fromfile == NULL) || (tofile == NULL))
          return "Invalid old file or new file";

!     if ((src_fd = open(fromfile, O_RDONLY | PG_BINARY, 0)) < 0)
          return getErrorText();

      if (fstat(src_fd, &statbuf) != 0)
*************** rewriteVisibilityMap(const char *fromfil
*** 186,192 ****
          return getErrorText();
      }

!     if ((dst_fd = open(tofile, O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR)) < 0)
      {
          close(src_fd);
          return getErrorText();
--- 184,191 ----
          return getErrorText();
      }

!     if ((dst_fd = open(tofile, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
!                        S_IRUSR | S_IWUSR)) < 0)
      {
          close(src_fd);
          return getErrorText();
*************** rewriteVisibilityMap(const char *fromfil
*** 196,208 ****
      src_filesize = statbuf.st_size;

      /*
       * Turn each visibility map page into 2 pages one by one. Each new page
!      * has the same page header as the old one.  If the last section of last
!      * page is empty, we skip it, mostly to avoid turning one-page visibility
!      * maps for small relations into two pages needlessly.
       */
      while (totalBytesRead < src_filesize)
      {
          char       *old_cur;
          char       *old_break;
          char       *old_blkend;
--- 195,215 ----
      src_filesize = statbuf.st_size;

      /*
+      * Malloc the work buffers, rather than making them local arrays, to
+      * ensure adequate alignment.
+      */
+     buffer = (char *) pg_malloc(BLCKSZ);
+     new_vmbuf = (char *) pg_malloc(BLCKSZ);
+
+     /*
       * Turn each visibility map page into 2 pages one by one. Each new page
!      * has the same page header as the old one.  If the last section of the
!      * last page is empty, we skip it, mostly to avoid turning one-page
!      * visibility maps for small relations into two pages needlessly.
       */
      while (totalBytesRead < src_filesize)
      {
+         ssize_t        bytesRead;
          char       *old_cur;
          char       *old_break;
          char       *old_blkend;
*************** rewriteVisibilityMap(const char *fromfil
*** 225,285 ****
          /*
           * These old_* variables point to old visibility map page. old_cur
           * points to current position on old page. old_blkend points to end of
!          * old block. old_break points to old page break position for
!          * rewriting a new page. After wrote a new page, old_break proceeds
!          * rewriteVmBytesPerPage bytes.
           */
          old_cur = buffer + SizeOfPageHeaderData;
          old_blkend = buffer + bytesRead;
          old_break = old_cur + rewriteVmBytesPerPage;

!         while (old_blkend >= old_break)
          {
!             char        new_vmbuf[BLCKSZ];
!             char       *new_cur = new_vmbuf;
              bool        empty = true;
              bool        old_lastpart;

!             /* Copy page header in advance */
              memcpy(new_vmbuf, &pageheader, SizeOfPageHeaderData);

!             /* Rewrite the last part of the old page? */
!             old_lastpart = old_lastblk && (old_blkend == old_break);

!             new_cur += SizeOfPageHeaderData;

              /* Process old page bytes one by one, and turn it into new page. */
!             while (old_break > old_cur)
              {
                  uint16        new_vmbits = 0;
                  int            i;

                  /* Generate new format bits while keeping old information */
                  for (i = 0; i < BITS_PER_BYTE; i++)
                  {
!                     uint8        byte = *(uint8 *) old_cur;
!
!                     if (byte & (1 << (BITS_PER_HEAPBLOCK_OLD * i)))
                      {
                          empty = false;
!                         new_vmbits |= 1 << (BITS_PER_HEAPBLOCK * i);
                      }
                  }

                  /* Copy new visibility map bit to new format page */
!                 memcpy(new_cur, &new_vmbits, BITS_PER_HEAPBLOCK);

!                 old_cur += BITS_PER_HEAPBLOCK_OLD;
                  new_cur += BITS_PER_HEAPBLOCK;
              }

!             /* If the last part of the old page is empty, skip writing it */
              if (old_lastpart && empty)
                  break;

!             /* Set new checksum for a visibility map page (if enabled) */
!             if (old_cluster.controldata.data_checksum_version != 0 &&
!                 new_cluster.controldata.data_checksum_version != 0)
                  ((PageHeader) new_vmbuf)->pd_checksum =
                      pg_checksum_page(new_vmbuf, new_blkno);

--- 232,290 ----
          /*
           * These old_* variables point to old visibility map page. old_cur
           * points to current position on old page. old_blkend points to end of
!          * old block.  old_break is the end+1 position on the old page for the
!          * data that will be transferred to the current new page.
           */
          old_cur = buffer + SizeOfPageHeaderData;
          old_blkend = buffer + bytesRead;
          old_break = old_cur + rewriteVmBytesPerPage;

!         while (old_break <= old_blkend)
          {
!             char       *new_cur;
              bool        empty = true;
              bool        old_lastpart;

!             /* First, copy old page header to new page */
              memcpy(new_vmbuf, &pageheader, SizeOfPageHeaderData);

!             /* Rewriting the last part of the last old page? */
!             old_lastpart = old_lastblk && (old_break == old_blkend);

!             new_cur = new_vmbuf + SizeOfPageHeaderData;

              /* Process old page bytes one by one, and turn it into new page. */
!             while (old_cur < old_break)
              {
+                 uint8        byte = *(uint8 *) old_cur;
                  uint16        new_vmbits = 0;
                  int            i;

                  /* Generate new format bits while keeping old information */
                  for (i = 0; i < BITS_PER_BYTE; i++)
                  {
!                     if (byte & (1 << i))
                      {
                          empty = false;
!                         new_vmbits |=
!                             VISIBILITYMAP_ALL_VISIBLE << (BITS_PER_HEAPBLOCK * i);
                      }
                  }

                  /* Copy new visibility map bit to new format page */
!                 new_cur[0] = (char) (new_vmbits & 0xFF);
!                 new_cur[1] = (char) (new_vmbits >> 8);

!                 old_cur++;
                  new_cur += BITS_PER_HEAPBLOCK;
              }

!             /* If the last part of the last page is empty, skip writing it */
              if (old_lastpart && empty)
                  break;

!             /* Set new checksum for visibility map page, if enabled */
!             if (new_cluster.controldata.data_checksum_version != 0)
                  ((PageHeader) new_vmbuf)->pd_checksum =
                      pg_checksum_page(new_vmbuf, new_blkno);

*************** rewriteVisibilityMap(const char *fromfil
*** 290,306 ****
                  return getErrorText();
              }

              old_break += rewriteVmBytesPerPage;
              new_blkno++;
          }
      }

!     /* Close files */
      close(dst_fd);
      close(src_fd);

      return NULL;
-
  }

  void
--- 295,313 ----
                  return getErrorText();
              }

+             /* Advance for next new page */
              old_break += rewriteVmBytesPerPage;
              new_blkno++;
          }
      }

!     /* Clean up */
!     pg_free(buffer);
!     pg_free(new_vmbuf);
      close(dst_fd);
      close(src_fd);

      return NULL;
  }

  void

pgsql-hackers by date:

Previous
From: Peter Geoghegan
Date:
Subject: Re: On conflict update & hint bits
Next
From: Peter Geoghegan
Date:
Subject: Re: Hash Indexes