Re: pg_waldump: support decoding of WAL inside tarfile - Mailing list pgsql-hackers

From Tom Lane
Subject Re: pg_waldump: support decoding of WAL inside tarfile
Date
Msg-id 3049460.1775067940@sss.pgh.pa.us
Whole thread Raw
In response to Re: pg_waldump: support decoding of WAL inside tarfile  (Thomas Munro <thomas.munro@gmail.com>)
Responses Re: pg_waldump: support decoding of WAL inside tarfile
List pgsql-hackers
Thomas Munro <thomas.munro@gmail.com> writes:
> On Mon, Mar 30, 2026 at 11:23 AM Tom Lane <tgl@sss.pgh.pa.us> wrote:
>> I agree that this isn't too critical if the effects are confined to
>> pg_waldump.  I believe that pg_basebackup and pg_verifybackup also use
>> astreamer_tar.c, but it's not clear to me if they'd ever be asked to
>> parse files made by tar(1) and not by our own sparseness-ignorant
>> tar-writing code.  If they can be, that'd be a higher-priority reason
>> to fill in this gap.

> Yeah I can't see any reason why pg_verifybackup --wal-path=foo.tar
> won't suffer the same problem in the wild.  Again, it's not the end of
> the world because it'll just fail and you'll probably eventually
> figure out why.  So perhaps we should just improve our detection of
> archives that we can't handle?

After reading the POSIX spec for pax format (in the pax(1) man page),
I think it's absolutely essential that we reject files that contain
pax extension headers.  Those can change the interpretation of the
following file header(s) in nearly arbitrary ways, so we have plenty
of problems besides this sparse-file issue if we just ignore them.

(Of course, later we can consider improving the code to handle them
correctly, but that ain't happening in time for v19.)

Also, if we are admitting the possibility that what we are reading
was made by a platform-supplied tar and not our own code, I think
it verges on lunacy to behave as though unsupported typeflags are
regular files.

So I think we need something more or less like the attached.

            regards, tom lane

diff --git a/src/bin/pg_basebackup/astreamer_inject.c b/src/bin/pg_basebackup/astreamer_inject.c
index 14a96733bad..a5dff0ac1c6 100644
--- a/src/bin/pg_basebackup/astreamer_inject.c
+++ b/src/bin/pg_basebackup/astreamer_inject.c
@@ -224,8 +224,9 @@ astreamer_inject_file(astreamer *streamer, char *pathname, char *data,
     strlcpy(member.pathname, pathname, MAXPGPATH);
     member.size = len;
     member.mode = pg_file_create_mode;
+    member.is_regular = true;
     member.is_directory = false;
-    member.is_link = false;
+    member.is_symlink = false;
     member.linktarget[0] = '\0';

     /*
diff --git a/src/bin/pg_verifybackup/astreamer_verify.c b/src/bin/pg_verifybackup/astreamer_verify.c
index 26c98186530..99bd69ce7c9 100644
--- a/src/bin/pg_verifybackup/astreamer_verify.c
+++ b/src/bin/pg_verifybackup/astreamer_verify.c
@@ -165,7 +165,7 @@ member_verify_header(astreamer *streamer, astreamer_member *member)
     char        pathname[MAXPGPATH];

     /* We are only interested in normal files. */
-    if (member->is_directory || member->is_link)
+    if (!member->is_regular)
         return;

     /*
diff --git a/src/bin/pg_waldump/archive_waldump.c b/src/bin/pg_waldump/archive_waldump.c
index ff1cc3fa5f0..e4a4bf44a7e 100644
--- a/src/bin/pg_waldump/archive_waldump.c
+++ b/src/bin/pg_waldump/archive_waldump.c
@@ -815,7 +815,7 @@ member_is_wal_file(astreamer_waldump *mystreamer, astreamer_member *member,
     char       *filename;

     /* We are only interested in normal files */
-    if (member->is_directory || member->is_link)
+    if (!member->is_regular)
         return false;

     if (strlen(member->pathname) < XLOG_FNAME_LEN)
diff --git a/src/fe_utils/astreamer_file.c b/src/fe_utils/astreamer_file.c
index 158e9a14f2c..0fca70a4f86 100644
--- a/src/fe_utils/astreamer_file.c
+++ b/src/fe_utils/astreamer_file.c
@@ -228,9 +228,13 @@ astreamer_extractor_content(astreamer *streamer, astreamer_member *member,
                 mystreamer->filename[fnamelen - 1] = '\0';

             /* Dispatch based on file type. */
-            if (member->is_directory)
+            if (member->is_regular)
+                mystreamer->file =
+                    create_file_for_extract(mystreamer->filename,
+                                            member->mode);
+            else if (member->is_directory)
                 extract_directory(mystreamer->filename, member->mode);
-            else if (member->is_link)
+            else if (member->is_symlink)
             {
                 const char *linktarget = member->linktarget;

@@ -238,10 +242,6 @@ astreamer_extractor_content(astreamer *streamer, astreamer_member *member,
                     linktarget = mystreamer->link_map(linktarget);
                 extract_link(mystreamer->filename, linktarget);
             }
-            else
-                mystreamer->file =
-                    create_file_for_extract(mystreamer->filename,
-                                            member->mode);

             /* Report output file change. */
             if (mystreamer->report_output_file)
diff --git a/src/fe_utils/astreamer_tar.c b/src/fe_utils/astreamer_tar.c
index 3b094fc0328..3de6f8d6fb2 100644
--- a/src/fe_utils/astreamer_tar.c
+++ b/src/fe_utils/astreamer_tar.c
@@ -272,6 +272,9 @@ astreamer_tar_header(astreamer_tar_parser *mystreamer)

     Assert(mystreamer->base.bbs_buffer.len == TAR_BLOCK_SIZE);

+    /* Zero out fields of *member, just for consistency. */
+    memset(member, 0, sizeof(astreamer_member));
+
     /* Check whether we've got a block of all zero bytes. */
     for (i = 0; i < TAR_BLOCK_SIZE; ++i)
     {
@@ -299,12 +302,28 @@ astreamer_tar_header(astreamer_tar_parser *mystreamer)
     member->mode = read_tar_number(&buffer[TAR_OFFSET_MODE], 8);
     member->uid = read_tar_number(&buffer[TAR_OFFSET_UID], 8);
     member->gid = read_tar_number(&buffer[TAR_OFFSET_GID], 8);
-    member->is_directory =
-        (buffer[TAR_OFFSET_TYPEFLAG] == TAR_FILETYPE_DIRECTORY);
-    member->is_link =
-        (buffer[TAR_OFFSET_TYPEFLAG] == TAR_FILETYPE_SYMLINK);
-    if (member->is_link)
-        strlcpy(member->linktarget, &buffer[TAR_OFFSET_LINKNAME], 100);
+
+    switch (buffer[TAR_OFFSET_TYPEFLAG])
+    {
+        case TAR_FILETYPE_PLAIN:
+        case '\0':                /* backwards compatibility hack, per POSIX */
+            member->is_regular = true;
+            break;
+        case TAR_FILETYPE_DIRECTORY:
+            member->is_directory = true;
+            break;
+        case TAR_FILETYPE_SYMLINK:
+            member->is_symlink = true;
+            strlcpy(member->linktarget, &buffer[TAR_OFFSET_LINKNAME], 100);
+            break;
+        case TAR_FILETYPE_PAX_EXTENDED:
+        case TAR_FILETYPE_PAX_EXTENDED_GLOBAL:
+            pg_fatal("pax extensions to tar format are not supported");
+            break;
+        default:
+            /* For special files, set none of the three is_xxx flags */
+            break;
+    }

     /* Compute number of padding bytes. */
     mystreamer->pad_bytes_expected = tarPaddingBytesRequired(member->size);
diff --git a/src/include/fe_utils/astreamer.h b/src/include/fe_utils/astreamer.h
index f370ef62720..8329e4efbc5 100644
--- a/src/include/fe_utils/astreamer.h
+++ b/src/include/fe_utils/astreamer.h
@@ -83,8 +83,10 @@ typedef struct
     mode_t        mode;
     uid_t        uid;
     gid_t        gid;
+    /* note: special filetypes will set none of these flags */
+    bool        is_regular;
     bool        is_directory;
-    bool        is_link;
+    bool        is_symlink;
     char        linktarget[MAXPGPATH];
 } astreamer_member;

diff --git a/src/include/pgtar.h b/src/include/pgtar.h
index eb93bdef5c4..d6c434e09b0 100644
--- a/src/include/pgtar.h
+++ b/src/include/pgtar.h
@@ -60,6 +60,8 @@ enum tarFileType
     TAR_FILETYPE_PLAIN = '0',
     TAR_FILETYPE_SYMLINK = '2',
     TAR_FILETYPE_DIRECTORY = '5',
+    TAR_FILETYPE_PAX_EXTENDED = 'x',
+    TAR_FILETYPE_PAX_EXTENDED_GLOBAL = 'g',
 };

 extern enum tarError tarCreateHeader(char *h, const char *filename,

pgsql-hackers by date:

Previous
From: Heikki Linnakangas
Date:
Subject: Re: Better shared data structure management and resizable shared data structures
Next
From: Álvaro Herrera
Date:
Subject: Re: table AM option passing