Re: Confusing error message with too-large file in pg_basebackup - Mailing list pgsql-bugs

From Tom Lane
Subject Re: Confusing error message with too-large file in pg_basebackup
Date
Msg-id 6801.1448064683@sss.pgh.pa.us
Whole thread Raw
In response to Re: Confusing error message with too-large file in pg_basebackup  (Tom Lane <tgl@sss.pgh.pa.us>)
Responses Re: Confusing error message with too-large file in pg_basebackup  (David Gould <daveg@sonic.net>)
Re: Confusing error message with too-large file in pg_basebackup  (Tom Lane <tgl@sss.pgh.pa.us>)
List pgsql-bugs
I wrote:
> Actually ... why don't we get rid of the limit?  wikipedia's entry on
> tar format says
>     ... only 11 octal digits can be stored. This gives a maximum file size
>     of 8 gigabytes on archived files. To overcome this limitation, star in
>     2001 introduced a base-256 coding that is indicated by setting the
>     high-order bit of the leftmost byte of a numeric field. GNU-tar and
>     BSD-tar followed this idea.
> If that extension is as widespread as this suggests, then following it
> when we have a file > 8GB seems like a better answer than failing
> entirely.  If you try to read the dump with an old tar program, old
> pg_restore, etc, it might fail ... but are you really worse off than
> if you couldn't make the dump at all?

I looked into the GNU tar sources and confirmed that gtar supports this
concept.  (It looks from the GNU sources like they've supported it for
a *really long time*, like since the 90's, in which case wikipedia's
credit to "star" for the idea is full of it.)

Hence, I propose something like the attached (WIP, has no doc
corrections).  I've done simple testing on the pg_dump/pg_restore code
path, but not on basebackup --- anyone want to test that?

I'm not sure whether we should treat this as a back-patchable bug fix
or a new feature for HEAD only.  If we don't back-patch it, there are
in any case several bugs here that we must fix.  In particular, the
existing coding in ReceiveTarFile:

    size_t        filesz = 0;
    ...
    sscanf(&tarhdr[124], "%11o", (unsigned int *) &filesz);

is utterly, absolutely, completely broken; it'll fail grossly on
any 64-bit big-endian hardware.  There are other places with misplaced
faith that "unsigned long" is at least as wide as size_t.

Comments?

            regards, tom lane

diff --git a/src/backend/replication/basebackup.c b/src/backend/replication/basebackup.c
index 1af011e..6120c8f 100644
*** a/src/backend/replication/basebackup.c
--- b/src/backend/replication/basebackup.c
*************** sendDir(char *path, int basepathlen, boo
*** 1132,1144 ****


  /*
-  * Maximum file size for a tar member: The limit inherent in the
-  * format is 2^33-1 bytes (nearly 8 GB).  But we don't want to exceed
-  * what we can represent in pgoff_t.
-  */
- #define MAX_TAR_MEMBER_FILELEN (((int64) 1 << Min(33, sizeof(pgoff_t)*8 - 1)) - 1)
-
- /*
   * Given the member, write the TAR header & send the file.
   *
   * If 'missing_ok' is true, will not throw an error if the file is not found.
--- 1132,1137 ----
*************** sendFile(char *readfilename, char *tarfi
*** 1166,1180 ****
                   errmsg("could not open file \"%s\": %m", readfilename)));
      }

-     /*
-      * Some compilers will throw a warning knowing this test can never be true
-      * because pgoff_t can't exceed the compared maximum on their platform.
-      */
-     if (statbuf->st_size > MAX_TAR_MEMBER_FILELEN)
-         ereport(ERROR,
-                 (errmsg("archive member \"%s\" too large for tar format",
-                         tarfilename)));
-
      _tarWriteHeader(tarfilename, NULL, statbuf);

      while ((cnt = fread(buf, 1, Min(sizeof(buf), statbuf->st_size - len), fp)) > 0)
--- 1159,1164 ----
diff --git a/src/bin/pg_basebackup/pg_basebackup.c b/src/bin/pg_basebackup/pg_basebackup.c
index 80de882..8c4dffe 100644
*** a/src/bin/pg_basebackup/pg_basebackup.c
--- b/src/bin/pg_basebackup/pg_basebackup.c
*************** ReceiveTarFile(PGconn *conn, PGresult *r
*** 781,787 ****
      bool        in_tarhdr = true;
      bool        skip_file = false;
      size_t        tarhdrsz = 0;
!     size_t        filesz = 0;

  #ifdef HAVE_LIBZ
      gzFile        ztarfile = NULL;
--- 781,787 ----
      bool        in_tarhdr = true;
      bool        skip_file = false;
      size_t        tarhdrsz = 0;
!     pgoff_t        filesz = 0;

  #ifdef HAVE_LIBZ
      gzFile        ztarfile = NULL;
*************** ReceiveTarFile(PGconn *conn, PGresult *r
*** 1046,1052 ****

                          skip_file = (strcmp(&tarhdr[0], "recovery.conf") == 0);

!                         sscanf(&tarhdr[124], "%11o", (unsigned int *) &filesz);

                          padding = ((filesz + 511) & ~511) - filesz;
                          filesz += padding;
--- 1046,1052 ----

                          skip_file = (strcmp(&tarhdr[0], "recovery.conf") == 0);

!                         filesz = read_tar_number(&tarhdr[124], 12);

                          padding = ((filesz + 511) & ~511) - filesz;
                          filesz += padding;
*************** ReceiveAndUnpackTarFile(PGconn *conn, PG
*** 1139,1145 ****
      char        current_path[MAXPGPATH];
      char        filename[MAXPGPATH];
      const char *mapped_tblspc_path;
!     int            current_len_left;
      int            current_padding = 0;
      bool        basetablespace;
      char       *copybuf = NULL;
--- 1139,1145 ----
      char        current_path[MAXPGPATH];
      char        filename[MAXPGPATH];
      const char *mapped_tblspc_path;
!     pgoff_t        current_len_left = 0;
      int            current_padding = 0;
      bool        basetablespace;
      char       *copybuf = NULL;
*************** ReceiveAndUnpackTarFile(PGconn *conn, PG
*** 1208,1227 ****
              }
              totaldone += 512;

!             if (sscanf(copybuf + 124, "%11o", ¤t_len_left) != 1)
!             {
!                 fprintf(stderr, _("%s: could not parse file size\n"),
!                         progname);
!                 disconnect_and_exit(1);
!             }

              /* Set permissions on the file */
!             if (sscanf(©buf[100], "%07o ", &filemode) != 1)
!             {
!                 fprintf(stderr, _("%s: could not parse file mode\n"),
!                         progname);
!                 disconnect_and_exit(1);
!             }

              /*
               * All files are padded up to 512 bytes
--- 1208,1217 ----
              }
              totaldone += 512;

!             current_len_left = read_tar_number(©buf[124], 12);

              /* Set permissions on the file */
!             filemode = read_tar_number(©buf[100], 8);

              /*
               * All files are padded up to 512 bytes
diff --git a/src/bin/pg_dump/pg_backup_tar.c b/src/bin/pg_dump/pg_backup_tar.c
index 532eacc..5fff355 100644
*** a/src/bin/pg_dump/pg_backup_tar.c
--- b/src/bin/pg_dump/pg_backup_tar.c
*************** typedef struct
*** 78,90 ****
      ArchiveHandle *AH;
  } TAR_MEMBER;

- /*
-  * Maximum file size for a tar member: The limit inherent in the
-  * format is 2^33-1 bytes (nearly 8 GB).  But we don't want to exceed
-  * what we can represent in pgoff_t.
-  */
- #define MAX_TAR_MEMBER_FILELEN (((int64) 1 << Min(33, sizeof(pgoff_t)*8 - 1)) - 1)
-
  typedef struct
  {
      int            hasSeek;
--- 78,83 ----
*************** isValidTarHeader(char *header)
*** 1049,1055 ****
      int            sum;
      int            chk = tarChecksum(header);

!     sscanf(&header[148], "%8o", &sum);

      if (sum != chk)
          return false;
--- 1042,1048 ----
      int            sum;
      int            chk = tarChecksum(header);

!     sum = read_tar_number(&header[148], 8);

      if (sum != chk)
          return false;
*************** _tarAddFile(ArchiveHandle *AH, TAR_MEMBE
*** 1091,1103 ****
                        strerror(errno));
      fseeko(tmp, 0, SEEK_SET);

-     /*
-      * Some compilers will throw a warning knowing this test can never be true
-      * because pgoff_t can't exceed the compared maximum on their platform.
-      */
-     if (th->fileLen > MAX_TAR_MEMBER_FILELEN)
-         exit_horribly(modulename, "archive member too large for tar format\n");
-
      _tarWriteHeader(th);

      while ((cnt = fread(buf, 1, sizeof(buf), tmp)) > 0)
--- 1084,1089 ----
*************** _tarGetHeader(ArchiveHandle *AH, TAR_MEM
*** 1225,1232 ****
      char        tag[100];
      int            sum,
                  chk;
!     size_t        len;
!     unsigned long ullen;
      pgoff_t        hPos;
      bool        gotBlock = false;

--- 1211,1217 ----
      char        tag[100];
      int            sum,
                  chk;
!     pgoff_t        len;
      pgoff_t        hPos;
      bool        gotBlock = false;

*************** _tarGetHeader(ArchiveHandle *AH, TAR_MEM
*** 1249,1255 ****

          /* Calc checksum */
          chk = tarChecksum(h);
!         sscanf(&h[148], "%8o", &sum);

          /*
           * If the checksum failed, see if it is a null block. If so, silently
--- 1234,1240 ----

          /* Calc checksum */
          chk = tarChecksum(h);
!         sum = read_tar_number(&h[148], 8);

          /*
           * If the checksum failed, see if it is a null block. If so, silently
*************** _tarGetHeader(ArchiveHandle *AH, TAR_MEM
*** 1273,1280 ****
      }

      sscanf(&h[0], "%99s", tag);
!     sscanf(&h[124], "%12lo", &ullen);
!     len = (size_t) ullen;

      {
          char        buf[100];
--- 1258,1264 ----
      }

      sscanf(&h[0], "%99s", tag);
!     len = read_tar_number(&h[124], 12);

      {
          char        buf[100];
*************** _tarWriteHeader(TAR_MEMBER *th)
*** 1307,1313 ****
  {
      char        h[512];

!     tarCreateHeader(h, th->targetFile, NULL, th->fileLen, 0600, 04000, 02000, time(NULL));

      /* Now write the completed header. */
      if (fwrite(h, 1, 512, th->tarFH) != 512)
--- 1291,1298 ----
  {
      char        h[512];

!     tarCreateHeader(h, th->targetFile, NULL, th->fileLen,
!                     0600, 04000, 02000, time(NULL));

      /* Now write the completed header. */
      if (fwrite(h, 1, 512, th->tarFH) != 512)
diff --git a/src/include/pgtar.h b/src/include/pgtar.h
index 906db7c..9c94a58 100644
*** a/src/include/pgtar.h
--- b/src/include/pgtar.h
*************** enum tarError
*** 19,23 ****
      TAR_SYMLINK_TOO_LONG
  };

! extern enum tarError tarCreateHeader(char *h, const char *filename, const char *linktarget, size_t size, mode_t mode,
uid_tuid, gid_t gid, time_t mtime); 
  extern int    tarChecksum(char *header);
--- 19,25 ----
      TAR_SYMLINK_TOO_LONG
  };

! extern enum tarError tarCreateHeader(char *h, const char *filename, const char *linktarget,
!               pgoff_t size, mode_t mode, uid_t uid, gid_t gid, time_t mtime);
! extern uint64 read_tar_number(const char *s, int len);
  extern int    tarChecksum(char *header);
diff --git a/src/port/tar.c b/src/port/tar.c
index 72fd4e1..631ff4d 100644
*** a/src/port/tar.c
--- b/src/port/tar.c
***************
*** 3,23 ****
  #include <sys/stat.h>

  /*
!  * Utility routine to print possibly larger than 32 bit integers in a
!  * portable fashion.  Filled with zeros.
   */
  static void
! print_val(char *s, uint64 val, unsigned int base, size_t len)
  {
!     int            i;
!
!     for (i = len; i > 0; i--)
      {
!         int            digit = val % base;

!         s[i - 1] = '0' + digit;
!         val = val / base;
      }
  }


--- 3,82 ----
  #include <sys/stat.h>

  /*
!  * Print a numeric field in a tar header.  The field starts at *s and is of
!  * length len; val is the value to be written.
!  *
!  * Per POSIX, the way to write a number is in octal with leading zeroes and
!  * one trailing space (or NUL, but we use space) at the end of the specified
!  * field width.
!  *
!  * However, the given value may not fit in the available space in octal form.
!  * If that's true, we use the GNU extension of writing \200 followed by the
!  * number in base-256 form (ie, stored in binary MSB-first).  (Note: here we
!  * support only non-negative numbers, so we don't worry about the GNU rules
!  * for handling negative numbers.)
   */
  static void
! print_tar_number(char *s, int len, uint64 val)
  {
!     if (val < (((uint64) 1) << ((len - 1) * 3)))
      {
!         /* Use octal with trailing space */
!         s[--len] = ' ';
!         while (len)
!         {
!             s[--len] = (val & 7) + '0';
!             val >>= 3;
!         }
!     }
!     else
!     {
!         /* Use base-256 with leading \200 */
!         s[0] = '\200';
!         while (len > 1)
!         {
!             s[--len] = (val & 255);
!             val >>= 8;
!         }
!     }
! }

!
! /*
!  * Read a numeric field in a tar header.  The field starts at *s and is of
!  * length len.
!  *
!  * The POSIX-approved format for a number is octal, ending with a space or
!  * NUL.  However, for values that don't fit, we recognize the GNU extension
!  * of \200 followed by the number in base-256 form (ie, stored in binary
!  * MSB-first).  (Note: here we support only non-negative numbers, so we don't
!  * worry about the GNU rules for handling negative numbers.)
!  */
! uint64
! read_tar_number(const char *s, int len)
! {
!     uint64        result = 0;
!
!     if (*s == '\200')
!     {
!         /* base-256 */
!         while (--len)
!         {
!             result <<= 8;
!             result |= (unsigned char) (*++s);
!         }
      }
+     else
+     {
+         /* octal */
+         while (len-- && *s >= '0' && *s <= '7')
+         {
+             result <<= 3;
+             result |= (*s - '0');
+             s++;
+         }
+     }
+     return result;
  }


*************** tarChecksum(char *header)
*** 46,57 ****

  /*
   * Fill in the buffer pointed to by h with a tar format header. This buffer
!  * must always have space for 512 characters, which is a requirement by
   * the tar format.
   */
  enum tarError
  tarCreateHeader(char *h, const char *filename, const char *linktarget,
!                 size_t size, mode_t mode, uid_t uid, gid_t gid, time_t mtime)
  {
      if (strlen(filename) > 99)
          return TAR_NAME_TOO_LONG;
--- 105,116 ----

  /*
   * Fill in the buffer pointed to by h with a tar format header. This buffer
!  * must always have space for 512 characters, which is a requirement of
   * the tar format.
   */
  enum tarError
  tarCreateHeader(char *h, const char *filename, const char *linktarget,
!                 pgoff_t size, mode_t mode, uid_t uid, gid_t gid, time_t mtime)
  {
      if (strlen(filename) > 99)
          return TAR_NAME_TOO_LONG;
*************** tarCreateHeader(char *h, const char *fil
*** 84,107 ****
      }

      /* Mode 8 - this doesn't include the file type bits (S_IFMT)  */
!     sprintf(&h[100], "%07o ", (int) (mode & 07777));

      /* User ID 8 */
!     sprintf(&h[108], "%07o ", (int) uid);

      /* Group 8 */
!     sprintf(&h[116], "%07o ", (int) gid);

!     /* File size 12 - 11 digits, 1 space; use print_val for 64 bit support */
      if (linktarget != NULL || S_ISDIR(mode))
          /* Symbolic link or directory has size zero */
!         print_val(&h[124], 0, 8, 11);
      else
!         print_val(&h[124], size, 8, 11);
!     sprintf(&h[135], " ");

      /* Mod Time 12 */
!     sprintf(&h[136], "%011o ", (int) mtime);

      /* Checksum 8 cannot be calculated until we've filled all other fields */

--- 143,165 ----
      }

      /* Mode 8 - this doesn't include the file type bits (S_IFMT)  */
!     print_tar_number(&h[100], 8, (mode & 07777));

      /* User ID 8 */
!     print_tar_number(&h[108], 8, uid);

      /* Group 8 */
!     print_tar_number(&h[116], 8, gid);

!     /* File size 12 */
      if (linktarget != NULL || S_ISDIR(mode))
          /* Symbolic link or directory has size zero */
!         print_tar_number(&h[124], 12, 0);
      else
!         print_tar_number(&h[124], 12, size);

      /* Mod Time 12 */
!     print_tar_number(&h[136], 12, mtime);

      /* Checksum 8 cannot be calculated until we've filled all other fields */

*************** tarCreateHeader(char *h, const char *fil
*** 134,152 ****
      strlcpy(&h[297], "postgres", 32);

      /* Major Dev 8 */
!     sprintf(&h[329], "%07o ", 0);

      /* Minor Dev 8 */
!     sprintf(&h[337], "%07o ", 0);

      /* Prefix 155 - not used, leave as nulls */

!     /*
!      * We mustn't overwrite the next field while inserting the checksum.
!      * Fortunately, the checksum can't exceed 6 octal digits, so we just write
!      * 6 digits, a space, and a null, which is legal per POSIX.
!      */
!     sprintf(&h[148], "%06o ", tarChecksum(h));

      return TAR_OK;
  }
--- 192,206 ----
      strlcpy(&h[297], "postgres", 32);

      /* Major Dev 8 */
!     print_tar_number(&h[329], 8, 0);

      /* Minor Dev 8 */
!     print_tar_number(&h[337], 8, 0);

      /* Prefix 155 - not used, leave as nulls */

!     /* Finally, compute and insert the checksum */
!     print_tar_number(&h[148], 8, tarChecksum(h));

      return TAR_OK;
  }

pgsql-bugs by date:

Previous
From: Tom Lane
Date:
Subject: Re: Confusing error message with too-large file in pg_basebackup
Next
From: David Gould
Date:
Subject: Re: Confusing error message with too-large file in pg_basebackup