Re: pg_waldump: support decoding of WAL inside tarfile - Mailing list pgsql-hackers

From Tom Lane
Subject Re: pg_waldump: support decoding of WAL inside tarfile
Date
Msg-id 3118179.1775092964@sss.pgh.pa.us
Whole thread Raw
In response to Re: pg_waldump: support decoding of WAL inside tarfile  (Thomas Munro <thomas.munro@gmail.com>)
Responses Re: pg_waldump: support decoding of WAL inside tarfile
List pgsql-hackers
Thomas Munro <thomas.munro@gmail.com> writes:
> On Thu, Apr 2, 2026 at 7:25 AM Tom Lane <tgl@sss.pgh.pa.us> wrote:
>> Also, if we are admitting the possibility that what we are reading
>> was made by a platform-supplied tar and not our own code, I think
>> it verges on lunacy to behave as though unsupported typeflags are
>> regular files.

> Yeah, if this is the first time we parse files we didn't make then
> that makes total sense.  I was a bit unsure of that question when I
> suggested we reject pax only after we've failed to find a file, in
> case there are scenarios that work today with harmless ignorable pax
> headers that don't change the file name.

Of course this is all new so far as pg_waldump is concerned.
I'm a bit unclear about whether pg_verifybackup's exposure
is large enough to warrant back-patching any of this.

Looking again at astreamer_tar.c, I suddenly realized that it doesn't
do any meaningful input validation.  So if you feed it junk input,
you get garbage errors that aren't even predictable:

$ head -c 32768 /dev/urandom >junk.tar
$ pg_waldump --path junk.tar --start 1/0x10000000
pg_waldump: error: COPY stream ended before last file was finished
$ head -c 32768 /dev/urandom >junk.tar
$ pg_waldump --path junk.tar --start 1/0x10000000
pg_waldump: error: tar member has empty name
$ head -c 32768 /dev/urandom >junk.tar
$ pg_waldump --path junk.tar --start 1/0x10000000
pg_waldump: error: COPY stream ended before last file was finished
$ head -c 32768 /dev/urandom >junk.tar
$ pg_waldump --path junk.tar --start 1/0x10000000
pg_waldump: error: could not find WAL in archive "junk.tar"

tar itself is considerably saner:

$ tar tf junk.tar 
tar: This does not look like a tar archive
tar: Skipping to next header
tar: Exiting with failure status due to previous errors

So I think we need something like the attached, in addition
to what I sent before.  This just makes astreamer_tar.c use
the isValidTarHeader function that pg_dump already had.
(I decided to const-ify isValidTarHeader's argument while
moving it to a shared location, which in turn requires
const-ifying tarChecksum.)

            regards, tom lane

diff --git a/src/bin/pg_dump/pg_backup_archiver.c b/src/bin/pg_dump/pg_backup_archiver.c
index 271a2c3e481..fecf6f2d1ce 100644
--- a/src/bin/pg_dump/pg_backup_archiver.c
+++ b/src/bin/pg_dump/pg_backup_archiver.c
@@ -44,6 +44,7 @@
 #include "pg_backup_archiver.h"
 #include "pg_backup_db.h"
 #include "pg_backup_utils.h"
+#include "pgtar.h"

 #define TEXT_DUMP_HEADER "--\n-- PostgreSQL database dump\n--\n\n"
 #define TEXT_DUMPALL_HEADER "--\n-- PostgreSQL database cluster dump\n--\n\n"
@@ -2372,7 +2373,7 @@ _discoverArchiveFormat(ArchiveHandle *AH)
         }

         if (!isValidTarHeader(AH->lookahead))
-            pg_fatal("input file does not appear to be a valid archive");
+            pg_fatal("input file does not appear to be a valid tar archive");

         AH->format = archTar;
     }
diff --git a/src/bin/pg_dump/pg_backup_archiver.h b/src/bin/pg_dump/pg_backup_archiver.h
index 365073b3eae..9c3aca6543a 100644
--- a/src/bin/pg_dump/pg_backup_archiver.h
+++ b/src/bin/pg_dump/pg_backup_archiver.h
@@ -465,8 +465,6 @@ extern void InitArchiveFmt_Null(ArchiveHandle *AH);
 extern void InitArchiveFmt_Directory(ArchiveHandle *AH);
 extern void InitArchiveFmt_Tar(ArchiveHandle *AH);

-extern bool isValidTarHeader(char *header);
-
 extern void ReconnectToServer(ArchiveHandle *AH, const char *dbname);
 extern void IssueCommandPerBlob(ArchiveHandle *AH, TocEntry *te,
                                 const char *cmdBegin, const char *cmdEnd);
diff --git a/src/bin/pg_dump/pg_backup_tar.c b/src/bin/pg_dump/pg_backup_tar.c
index d94d0de2a5d..a3879410c94 100644
--- a/src/bin/pg_dump/pg_backup_tar.c
+++ b/src/bin/pg_dump/pg_backup_tar.c
@@ -984,31 +984,6 @@ tarPrintf(TAR_MEMBER *th, const char *fmt,...)
     return (int) cnt;
 }

-bool
-isValidTarHeader(char *header)
-{
-    int            sum;
-    int            chk = tarChecksum(header);
-
-    sum = read_tar_number(&header[TAR_OFFSET_CHECKSUM], 8);
-
-    if (sum != chk)
-        return false;
-
-    /* POSIX tar format */
-    if (memcmp(&header[TAR_OFFSET_MAGIC], "ustar\0", 6) == 0 &&
-        memcmp(&header[TAR_OFFSET_VERSION], "00", 2) == 0)
-        return true;
-    /* GNU tar format */
-    if (memcmp(&header[TAR_OFFSET_MAGIC], "ustar  \0", 8) == 0)
-        return true;
-    /* not-quite-POSIX format written by pre-9.3 pg_dump */
-    if (memcmp(&header[TAR_OFFSET_MAGIC], "ustar00\0", 8) == 0)
-        return true;
-
-    return false;
-}
-
 /* Given the member, write the TAR header & copy the file */
 static void
 _tarAddFile(ArchiveHandle *AH, TAR_MEMBER *th)
diff --git a/src/fe_utils/astreamer_tar.c b/src/fe_utils/astreamer_tar.c
index 3b094fc0328..8fa3329237d 100644
--- a/src/fe_utils/astreamer_tar.c
+++ b/src/fe_utils/astreamer_tar.c
@@ -260,7 +260,8 @@ astreamer_tar_parser_content(astreamer *streamer, astreamer_member *member,
  * Parse a file header within a tar stream.
  *
  * The return value is true if we found a file header and passed it on to the
- * next astreamer; it is false if we have reached the archive trailer.
+ * next astreamer; it is false if we have found the archive trailer.
+ * We throw error if we see invalid data.
  */
 static bool
 astreamer_tar_header(astreamer_tar_parser *mystreamer)
@@ -289,6 +290,12 @@ astreamer_tar_header(astreamer_tar_parser *mystreamer)
     if (!has_nonzero_byte)
         return false;

+    /*
+     * Verify that we have a reasonable-looking header.
+     */
+    if (!isValidTarHeader(buffer))
+        pg_fatal("input file does not appear to be a valid tar archive");
+
     /*
      * Parse key fields out of the header.
      */
diff --git a/src/include/pgtar.h b/src/include/pgtar.h
index eb93bdef5c4..55b8c7c77a4 100644
--- a/src/include/pgtar.h
+++ b/src/include/pgtar.h
@@ -68,7 +68,8 @@ extern enum tarError tarCreateHeader(char *h, const char *filename,
                                      time_t mtime);
 extern uint64 read_tar_number(const char *s, int len);
 extern void print_tar_number(char *s, int len, uint64 val);
-extern int    tarChecksum(char *header);
+extern int    tarChecksum(const char *header);
+extern bool isValidTarHeader(const char *header);

 /*
  * Compute the number of padding bytes required for an entry in a tar
diff --git a/src/port/tar.c b/src/port/tar.c
index 592b4fb7b0f..db462b90292 100644
--- a/src/port/tar.c
+++ b/src/port/tar.c
@@ -87,7 +87,7 @@ read_tar_number(const char *s, int len)
  * be 512 bytes, per the tar standard.
  */
 int
-tarChecksum(char *header)
+tarChecksum(const char *header)
 {
     int            i,
                 sum;
@@ -104,6 +104,35 @@ tarChecksum(char *header)
     return sum;
 }

+/*
+ * Check validity of a tar header (assumed to be 512 bytes long).
+ * We verify the checksum and the magic number / version.
+ */
+bool
+isValidTarHeader(const char *header)
+{
+    int            sum;
+    int            chk = tarChecksum(header);
+
+    sum = read_tar_number(&header[TAR_OFFSET_CHECKSUM], 8);
+
+    if (sum != chk)
+        return false;
+
+    /* POSIX tar format */
+    if (memcmp(&header[TAR_OFFSET_MAGIC], "ustar\0", 6) == 0 &&
+        memcmp(&header[TAR_OFFSET_VERSION], "00", 2) == 0)
+        return true;
+    /* GNU tar format */
+    if (memcmp(&header[TAR_OFFSET_MAGIC], "ustar  \0", 8) == 0)
+        return true;
+    /* not-quite-POSIX format written by pre-9.3 pg_dump */
+    if (memcmp(&header[TAR_OFFSET_MAGIC], "ustar00\0", 8) == 0)
+        return true;
+
+    return false;
+}
+

 /*
  * Fill in the buffer pointed to by h with a tar format header. This buffer

pgsql-hackers by date:

Previous
From: Andreas Karlsson
Date:
Subject: Re: Check some unchecked fclose() results
Next
From: shihao zhong
Date:
Subject: Re: [Patch] New pg_stat_tablespace view