Re: XTS cipher mode for cluster file encryption - Mailing list pgsql-hackers

From Antonin Houska
Subject Re: XTS cipher mode for cluster file encryption
Date
Msg-id 18009.1638171451@antos
Whole thread Raw
In response to Re: XTS cipher mode for cluster file encryption  (Stephen Frost <sfrost@snowman.net>)
Responses Re: XTS cipher mode for cluster file encryption
List pgsql-hackers
Stephen Frost <sfrost@snowman.net> wrote:

> Greetings,
> 
> * Bruce Momjian (bruce@momjian.us) wrote:
> > On Tue, Oct 19, 2021 at 02:54:56PM -0400, Stephen Frost wrote:
> > > * Sasasu (i@sasa.su) wrote:
> > > > A unified block-based I/O API sounds great. Has anyone tried to do this
> > > > before? It would be nice if the front-end tools could also use these API.
> > > 
> > > The TDE patch from Cybertec did go down this route, but the API ended up
> > > > being rather different which meant a lot of changes in other parts of
> > > the system.  If we can get a block-based temporary file method that
> > > maintains more-or-less the same API, that'd be great, but I'm not sure
> > > that we can really do so and I am not entirely convinced that we should
> > > make the TDE effort depend on an otherwise quite independent effort of
> > > making all temp files usage be block based.
> > 
> > Uh, I thought people felt the Cybertec patch was too large and that a
> > unified API for temporary file I/O-encryption was a requirement.  Would
> > > a CTR-streaming-encryption API for temporary tables be easier to
> > implement?
> 
> Having a unified API for temporary file I/O (that could then be extended
> to provide encryption) would definitely help with reducing the size of a
> TDE patch.  The approach used in the Cybertec patch was to make
> temporary file access block based, but the way that was implemented was
> with an API different from pread/pwrite and that meant changing pretty
> much all of the call sites for temporary file access, which naturally
> resulted in changes in a lot of otherwise unrelated code.

The changes to buffile.c are not trivial, but we haven't really changed the
API, as long as you mean BufFileCreateTemp(), BufFileWrite(), BufFileRead().

What our patch affects on the caller side is that BufFileOpenTransient(),
BufFileCloseTransient(), BufFileWriteTransient() and BufFileReadTransient()
replace OpenTransientFile(), CloseTransientFile(), write()/fwrite() and
read()/fread() respectively in reorderbuffer.c and in pgstat.c. These changes
became a little bit less invasive in TDE 1.1 than they were in 1.0; see [1]
and the attached diffs.

(I expect that [2] will get committed someday so that the TDE feature won't
affect pgstat.c in the future at all.)

-- 
Antonin Houska
Web: https://www.cybertec-postgresql.com


[1] https://github.com/cybertec-postgresql/postgres/tree/PG_14_TDE_1_1

[2] https://commitfest.postgresql.org/34/1708/

diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c
index d474ea1d0a..9cb0708e41 100644
--- a/src/backend/replication/logical/reorderbuffer.c
+++ b/src/backend/replication/logical/reorderbuffer.c
@@ -99,6 +99,7 @@
 #include "replication/slot.h"
 #include "replication/snapbuild.h"    /* just for SnapBuildSnapDecRefcount */
 #include "storage/bufmgr.h"
+#include "storage/buffile.h"
 #include "storage/fd.h"
 #include "storage/sinval.h"
 #include "utils/builtins.h"
@@ -131,21 +132,13 @@ typedef struct ReorderBufferTupleCidEnt
     CommandId    combocid;        /* just for debugging */
 } ReorderBufferTupleCidEnt;
 
-/* Virtual file descriptor with file offset tracking */
-typedef struct TXNEntryFile
-{
-    File        vfd;            /* -1 when the file is closed */
-    off_t        curOffset;        /* offset for next write or read. Reset to 0
-                                 * when vfd is opened. */
-} TXNEntryFile;
-
 /* k-way in-order change iteration support structures */
 typedef struct ReorderBufferIterTXNEntry
 {
     XLogRecPtr    lsn;
     ReorderBufferChange *change;
     ReorderBufferTXN *txn;
-    TXNEntryFile file;
+    TransientBufFile *file;
     XLogSegNo    segno;
 } ReorderBufferIterTXNEntry;
 
@@ -243,14 +236,22 @@ static void ReorderBufferExecuteInvalidations(uint32 nmsgs, SharedInvalidationMe
  * Disk serialization support functions
  * ---------------------------------------
  */
+static void ReorderBufferTweakBase(ReorderBufferTXN *txn,
+                                   char tweak_base[TWEAK_BASE_SIZE]);
 static void ReorderBufferCheckMemoryLimit(ReorderBuffer *rb);
 static void ReorderBufferSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn);
 static void ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
-                                         int fd, ReorderBufferChange *change);
+                             TransientBufFile *file, ReorderBufferChange *change);
+static void ReorderBufferWriteData(TransientBufFile *file, void *ptr, size_t size,
+                       ReorderBufferTXN *txn);
 static Size ReorderBufferRestoreChanges(ReorderBuffer *rb, ReorderBufferTXN *txn,
-                                        TXNEntryFile *file, XLogSegNo *segno);
+                            TransientBufFile **file, XLogSegNo *segno);
 static void ReorderBufferRestoreChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
-                                       char *change);
+                           TransientBufFile **file);
+static ReorderBufferTupleBuf *ReorderBufferRestoreTuple(ReorderBuffer *rb,
+                          TransientBufFile *file);
+static void ReorderBufferReadData(TransientBufFile *file, void *ptr, size_t size,
+                      bool *no_data_p);
 static void ReorderBufferRestoreCleanup(ReorderBuffer *rb, ReorderBufferTXN *txn);
 static void ReorderBufferTruncateTXN(ReorderBuffer *rb, ReorderBufferTXN *txn,
                                      bool txn_prepared);
@@ -342,8 +343,6 @@ ReorderBufferAllocate(void)
     buffer->by_txn_last_xid = InvalidTransactionId;
     buffer->by_txn_last_txn = NULL;
 
-    buffer->outbuf = NULL;
-    buffer->outbufsize = 0;
     buffer->size = 0;
 
     buffer->spillTxns = 0;
@@ -1241,7 +1240,7 @@ ReorderBufferIterTXNInit(ReorderBuffer *rb, ReorderBufferTXN *txn,
 
     for (off = 0; off < state->nr_txns; off++)
     {
-        state->entries[off].file.vfd = -1;
+        state->entries[off].file = NULL;
         state->entries[off].segno = 0;
     }
 
@@ -1423,8 +1422,8 @@ ReorderBufferIterTXNFinish(ReorderBuffer *rb,
 
     for (off = 0; off < state->nr_txns; off++)
     {
-        if (state->entries[off].file.vfd != -1)
-            FileClose(state->entries[off].file.vfd);
+        if (state->entries[off].file)
+            BufFileCloseTransient(state->entries[off].file);
     }
 
     /* free memory we might have "leaked" in the last *Next call */
@@ -3332,21 +3331,39 @@ ReorderBufferXidHasBaseSnapshot(ReorderBuffer *rb, TransactionId xid)
  */
 
 /*
- * Ensure the IO buffer is >= sz.
+ * Initialize the common part of the encryption tweak.
  */
 static void
-ReorderBufferSerializeReserve(ReorderBuffer *rb, Size sz)
+ReorderBufferTweakBase(ReorderBufferTXN *txn,
+                       char tweak_base[TWEAK_BASE_SIZE])
 {
-    if (!rb->outbufsize)
-    {
-        rb->outbuf = MemoryContextAlloc(rb->context, sz);
-        rb->outbufsize = sz;
-    }
-    else if (rb->outbufsize < sz)
-    {
-        rb->outbuf = repalloc(rb->outbuf, sz);
-        rb->outbufsize = sz;
-    }
+    char    *c = tweak_base;
+    pid_t    pid = MyProcPid;
+    int    pid_bytes;
+
+/* Only this part of the PID fits into the tweak. */
+#define PID_BYTES_USABLE    3
+
+    StaticAssertStmt(1 + sizeof(TransactionId) + PID_BYTES_USABLE
+                     <= TWEAK_BASE_SIZE,
+                     "tweak components do not fit into TWEAK_BASE_SIZE");
+
+    memset(tweak_base, 0, TWEAK_BASE_SIZE);
+    *c = TRANS_BUF_FILE_REORDERBUFFER;
+    c++;
+    memcpy(c, &txn->xid, sizeof(TransactionId));
+    c += sizeof(TransactionId);
+
+    /*
+     * There's only room for PID_BYTES_USABLE bytes of the PID. Use the less
+     * significant part so that PID increment always causes tweak change.
+     */
+    pid_bytes = Min(sizeof(pid), PID_BYTES_USABLE);
+#ifdef WORDS_BIGENDIAN
+    memcpy(c, ((char *) &pid) + sizeof(pid) - pid_bytes, pid_bytes);
+#else
+    memcpy(c, &pid, pid_bytes);
+#endif
 }
 
 /*
@@ -3517,9 +3534,10 @@ ReorderBufferSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
 {
     dlist_iter    subtxn_i;
     dlist_mutable_iter change_i;
-    int            fd = -1;
+    TransientBufFile *file = NULL;
     XLogSegNo    curOpenSegNo = 0;
     Size        spilled = 0;
+
     Size        size = txn->size;
 
     elog(DEBUG2, "spill %u changes in XID %u to disk",
@@ -3545,13 +3563,14 @@ ReorderBufferSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
          * store in segment in which it belongs by start lsn, don't split over
          * multiple segments tho
          */
-        if (fd == -1 ||
+        if (file == NULL ||
             !XLByteInSeg(change->lsn, curOpenSegNo, wal_segment_size))
         {
             char        path[MAXPGPATH];
+            char        tweak_base[TWEAK_BASE_SIZE];
 
-            if (fd != -1)
-                CloseTransientFile(fd);
+            if (file)
+                BufFileCloseTransient(file);
 
             XLByteToSeg(change->lsn, curOpenSegNo, wal_segment_size);
 
@@ -3562,17 +3581,15 @@ ReorderBufferSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
             ReorderBufferSerializedPath(path, MyReplicationSlot, txn->xid,
                                         curOpenSegNo);
 
+            if (data_encrypted)
+                ReorderBufferTweakBase(txn, tweak_base);
             /* open segment, create it if necessary */
-            fd = OpenTransientFile(path,
-                                   O_CREAT | O_WRONLY | O_APPEND | PG_BINARY);
-
-            if (fd < 0)
-                ereport(ERROR,
-                        (errcode_for_file_access(),
-                         errmsg("could not open file \"%s\": %m", path)));
+            file = BufFileOpenTransient(path,
+                                        O_CREAT | O_WRONLY | O_APPEND | PG_BINARY,
+                                        tweak_base);
         }
 
-        ReorderBufferSerializeChange(rb, txn, fd, change);
+        ReorderBufferSerializeChange(rb, txn, file, change);
         dlist_delete(&change->node);
         ReorderBufferReturnChange(rb, change, true);
 
@@ -3597,8 +3614,8 @@ ReorderBufferSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
     txn->nentries_mem = 0;
     txn->txn_flags |= RBTXN_IS_SERIALIZED;
 
-    if (fd != -1)
-        CloseTransientFile(fd);
+    if (file)
+        BufFileCloseTransient(file);
 }
 
 /*
@@ -3606,15 +3623,13 @@ ReorderBufferSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
  */
 static void
 ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
-                             int fd, ReorderBufferChange *change)
+                             TransientBufFile *file, ReorderBufferChange *change)
 {
-    ReorderBufferDiskChange *ondisk;
+    ReorderBufferDiskChange hdr;
     Size        sz = sizeof(ReorderBufferDiskChange);
 
-    ReorderBufferSerializeReserve(rb, sz);
-
-    ondisk = (ReorderBufferDiskChange *) rb->outbuf;
-    memcpy(&ondisk->change, change, sizeof(ReorderBufferChange));
+    memcpy((char *) &hdr + offsetof(ReorderBufferDiskChange, change),
+           change, sizeof(ReorderBufferChange));
 
     switch (change->action)
     {
@@ -3624,7 +3639,6 @@ ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
         case REORDER_BUFFER_CHANGE_DELETE:
         case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
             {
-                char       *data;
                 ReorderBufferTupleBuf *oldtup,
                            *newtup;
                 Size        oldlen = 0;
@@ -3647,84 +3661,71 @@ ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
                     sz += newlen;
                 }
 
-                /* make sure we have enough space */
-                ReorderBufferSerializeReserve(rb, sz);
-
-                data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
-                /* might have been reallocated above */
-                ondisk = (ReorderBufferDiskChange *) rb->outbuf;
+                hdr.size = sz;
+                ReorderBufferWriteData(file, &hdr, sizeof(ReorderBufferDiskChange),
+                                       txn);
 
                 if (oldlen)
                 {
-                    memcpy(data, &oldtup->tuple, sizeof(HeapTupleData));
-                    data += sizeof(HeapTupleData);
-
-                    memcpy(data, oldtup->tuple.t_data, oldlen);
-                    data += oldlen;
+                    ReorderBufferWriteData(file, &oldtup->tuple,
+                                           sizeof(HeapTupleData), txn);
+                    ReorderBufferWriteData(file, oldtup->tuple.t_data, oldlen,
+                                           txn);
                 }
 
                 if (newlen)
                 {
-                    memcpy(data, &newtup->tuple, sizeof(HeapTupleData));
-                    data += sizeof(HeapTupleData);
-
-                    memcpy(data, newtup->tuple.t_data, newlen);
-                    data += newlen;
+                    ReorderBufferWriteData(file, &newtup->tuple,
+                                           sizeof(HeapTupleData), txn);
+                    ReorderBufferWriteData(file, newtup->tuple.t_data, newlen,
+                                           txn);
                 }
                 break;
             }
         case REORDER_BUFFER_CHANGE_MESSAGE:
             {
-                char       *data;
                 Size        prefix_size = strlen(change->data.msg.prefix) + 1;
 
                 sz += prefix_size + change->data.msg.message_size +
                     sizeof(Size) + sizeof(Size);
-                ReorderBufferSerializeReserve(rb, sz);
 
-                data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
-
-                /* might have been reallocated above */
-                ondisk = (ReorderBufferDiskChange *) rb->outbuf;
+                hdr.size = sz;
+                ReorderBufferWriteData(file, &hdr,
+                                       sizeof(ReorderBufferDiskChange),
+                                       txn);
 
                 /* write the prefix including the size */
-                memcpy(data, &prefix_size, sizeof(Size));
-                data += sizeof(Size);
-                memcpy(data, change->data.msg.prefix,
-                       prefix_size);
-                data += prefix_size;
+                ReorderBufferWriteData(file, &prefix_size, sizeof(Size), txn);
+                ReorderBufferWriteData(file, change->data.msg.prefix,
+                                       prefix_size, txn);
 
                 /* write the message including the size */
-                memcpy(data, &change->data.msg.message_size, sizeof(Size));
-                data += sizeof(Size);
-                memcpy(data, change->data.msg.message,
-                       change->data.msg.message_size);
-                data += change->data.msg.message_size;
+                ReorderBufferWriteData(file, &change->data.msg.message_size,
+                                       sizeof(Size), txn);
+                ReorderBufferWriteData(file, change->data.msg.message,
+                                       change->data.msg.message_size, txn);
 
                 break;
             }
         case REORDER_BUFFER_CHANGE_INVALIDATION:
             {
-                char       *data;
                 Size        inval_size = sizeof(SharedInvalidationMessage) *
                 change->data.inval.ninvalidations;
 
                 sz += inval_size;
 
-                ReorderBufferSerializeReserve(rb, sz);
-                data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
-
-                /* might have been reallocated above */
-                ondisk = (ReorderBufferDiskChange *) rb->outbuf;
-                memcpy(data, change->data.inval.invalidations, inval_size);
-                data += inval_size;
-
+                hdr.size = sz;
+                ReorderBufferWriteData(file, &hdr,
+                                       sizeof(ReorderBufferDiskChange),
+                                       txn);
+                ReorderBufferWriteData(file, change->data.inval.invalidations,
+                                       inval_size,
+                                       txn);
                 break;
             }
         case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
             {
                 Snapshot    snap;
-                char       *data;
 
                 snap = change->data.snapshot;
 
@@ -3732,78 +3733,50 @@ ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
                     sizeof(TransactionId) * snap->xcnt +
                     sizeof(TransactionId) * snap->subxcnt;
 
-                /* make sure we have enough space */
-                ReorderBufferSerializeReserve(rb, sz);
-                data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
-                /* might have been reallocated above */
-                ondisk = (ReorderBufferDiskChange *) rb->outbuf;
+                hdr.size = sz;
+                ReorderBufferWriteData(file, &hdr,
+                                       sizeof(ReorderBufferDiskChange), txn);
 
-                memcpy(data, snap, sizeof(SnapshotData));
-                data += sizeof(SnapshotData);
+                ReorderBufferWriteData(file, snap, sizeof(SnapshotData), txn);
 
                 if (snap->xcnt)
-                {
-                    memcpy(data, snap->xip,
-                           sizeof(TransactionId) * snap->xcnt);
-                    data += sizeof(TransactionId) * snap->xcnt;
-                }
+                    ReorderBufferWriteData(file, snap->xip,
+                                           sizeof(TransactionId) * snap->xcnt,
+                                           txn);
 
                 if (snap->subxcnt)
-                {
-                    memcpy(data, snap->subxip,
-                           sizeof(TransactionId) * snap->subxcnt);
-                    data += sizeof(TransactionId) * snap->subxcnt;
-                }
+                    ReorderBufferWriteData(file, snap->subxip,
+                                           sizeof(TransactionId) * snap->subxcnt,
+                                           txn);
                 break;
             }
         case REORDER_BUFFER_CHANGE_TRUNCATE:
             {
                 Size        size;
-                char       *data;
 
                 /* account for the OIDs of truncated relations */
                 size = sizeof(Oid) * change->data.truncate.nrelids;
                 sz += size;
 
-                /* make sure we have enough space */
-                ReorderBufferSerializeReserve(rb, sz);
-
-                data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
-                /* might have been reallocated above */
-                ondisk = (ReorderBufferDiskChange *) rb->outbuf;
-
-                memcpy(data, change->data.truncate.relids, size);
-                data += size;
+                hdr.size = sz;
+                ReorderBufferWriteData(file, &hdr, sizeof(ReorderBufferDiskChange),
+                                       txn);
 
+                ReorderBufferWriteData(file, change->data.truncate.relids, size,
+                                       txn);
                 break;
             }
         case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
         case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT:
         case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
         case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
+            hdr.size = sz;
+            ReorderBufferWriteData(file, &hdr, sizeof(ReorderBufferDiskChange),
+                                   txn);
             /* ReorderBufferChange contains everything important */
             break;
     }
 
-    ondisk->size = sz;
-
-    errno = 0;
-    pgstat_report_wait_start(WAIT_EVENT_REORDER_BUFFER_WRITE);
-    if (write(fd, rb->outbuf, ondisk->size) != ondisk->size)
-    {
-        int            save_errno = errno;
-
-        CloseTransientFile(fd);
-
-        /* if write didn't set errno, assume problem is no disk space */
-        errno = save_errno ? save_errno : ENOSPC;
-        ereport(ERROR,
-                (errcode_for_file_access(),
-                 errmsg("could not write to data file for XID %u: %m",
-                        txn->xid)));
-    }
-    pgstat_report_wait_end();
-
     /*
      * Keep the transaction's final_lsn up to date with each change we send to
      * disk, so that ReorderBufferRestoreCleanup works correctly.  (We used to
@@ -3814,8 +3787,21 @@ ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
      */
     if (txn->final_lsn < change->lsn)
         txn->final_lsn = change->lsn;
+}
 
-    Assert(ondisk->change.action == change->action);
+/*
+ * Wrapper for BufFileWriteTransient() that raises ERROR if the whole chunk
+ * was not written. XXX Should this be a macro?
+ */
+static void
+ReorderBufferWriteData(TransientBufFile *file, void *ptr, size_t size,
+                       ReorderBufferTXN *txn)
+{
+    if (BufFileWriteTransient(file, ptr, size) != size)
+        ereport(ERROR,
+                (errcode_for_file_access(),
+                 errmsg("could not write to data file for XID %u: %m",
+                        txn->xid)));
 }
 
 /* Returns true, if the output plugin supports streaming, false, otherwise. */
@@ -4058,12 +4044,11 @@ ReorderBufferChangeSize(ReorderBufferChange *change)
  */
 static Size
 ReorderBufferRestoreChanges(ReorderBuffer *rb, ReorderBufferTXN *txn,
-                            TXNEntryFile *file, XLogSegNo *segno)
+                            TransientBufFile **file, XLogSegNo *segno)
 {
     Size        restored = 0;
     XLogSegNo    last_segno;
     dlist_mutable_iter cleanup_iter;
-    File       *fd = &file->vfd;
 
     Assert(txn->first_lsn != InvalidXLogRecPtr);
     Assert(txn->final_lsn != InvalidXLogRecPtr);
@@ -4084,12 +4069,10 @@ ReorderBufferRestoreChanges(ReorderBuffer *rb, ReorderBufferTXN *txn,
 
     while (restored < max_changes_in_memory && *segno <= last_segno)
     {
-        int            readBytes;
-        ReorderBufferDiskChange *ondisk;
-
-        if (*fd == -1)
+        if (*file == NULL)
         {
             char        path[MAXPGPATH];
+            char        tweak_base[TWEAK_BASE_SIZE];
 
             /* first time in */
             if (*segno == 0)
@@ -4104,86 +4087,27 @@ ReorderBufferRestoreChanges(ReorderBuffer *rb, ReorderBufferTXN *txn,
             ReorderBufferSerializedPath(path, MyReplicationSlot, txn->xid,
                                         *segno);
 
-            *fd = PathNameOpenFile(path, O_RDONLY | PG_BINARY);
-
-            /* No harm in resetting the offset even in case of failure */
-            file->curOffset = 0;
-
-            if (*fd < 0 && errno == ENOENT)
+            if (data_encrypted)
+                ReorderBufferTweakBase(txn, tweak_base);
+            *file = BufFileOpenTransient(path, O_RDONLY | PG_BINARY,
+                                         tweak_base);
+            if (*file == NULL)
             {
-                *fd = -1;
+                Assert(errno == ENOENT);
                 (*segno)++;
                 continue;
             }
-            else if (*fd < 0)
-                ereport(ERROR,
-                        (errcode_for_file_access(),
-                         errmsg("could not open file \"%s\": %m",
-                                path)));
         }
 
-        /*
-         * Read the statically sized part of a change which has information
-         * about the total size. If we couldn't read a record, we're at the
-         * end of this file.
-         */
-        ReorderBufferSerializeReserve(rb, sizeof(ReorderBufferDiskChange));
-        readBytes = FileRead(file->vfd, rb->outbuf,
-                             sizeof(ReorderBufferDiskChange),
-                             file->curOffset, WAIT_EVENT_REORDER_BUFFER_READ);
-
-        /* eof */
-        if (readBytes == 0)
+        ReorderBufferRestoreChange(rb, txn, file);
+        if (*file)
+            restored++;
+        else
         {
-            FileClose(*fd);
-            *fd = -1;
+            /* No data could be restored. */
             (*segno)++;
             continue;
         }
-        else if (readBytes < 0)
-            ereport(ERROR,
-                    (errcode_for_file_access(),
-                     errmsg("could not read from reorderbuffer spill file: %m")));
-        else if (readBytes != sizeof(ReorderBufferDiskChange))
-            ereport(ERROR,
-                    (errcode_for_file_access(),
-                     errmsg("could not read from reorderbuffer spill file: read %d instead of %u bytes",
-                            readBytes,
-                            (uint32) sizeof(ReorderBufferDiskChange))));
-
-        file->curOffset += readBytes;
-
-        ondisk = (ReorderBufferDiskChange *) rb->outbuf;
-
-        ReorderBufferSerializeReserve(rb,
-                                      sizeof(ReorderBufferDiskChange) + ondisk->size);
-        ondisk = (ReorderBufferDiskChange *) rb->outbuf;
-
-        readBytes = FileRead(file->vfd,
-                             rb->outbuf + sizeof(ReorderBufferDiskChange),
-                             ondisk->size - sizeof(ReorderBufferDiskChange),
-                             file->curOffset,
-                             WAIT_EVENT_REORDER_BUFFER_READ);
-
-        if (readBytes < 0)
-            ereport(ERROR,
-                    (errcode_for_file_access(),
-                     errmsg("could not read from reorderbuffer spill file: %m")));
-        else if (readBytes != ondisk->size - sizeof(ReorderBufferDiskChange))
-            ereport(ERROR,
-                    (errcode_for_file_access(),
-                     errmsg("could not read from reorderbuffer spill file: read %d instead of %u bytes",
-                            readBytes,
-                            (uint32) (ondisk->size - sizeof(ReorderBufferDiskChange)))));
-
-        file->curOffset += readBytes;
-
-        /*
-         * ok, read a full change from disk, now restore it into proper
-         * in-memory format
-         */
-        ReorderBufferRestoreChange(rb, txn, rb->outbuf);
-        restored++;
     }
 
     return restored;
@@ -4193,25 +4117,36 @@ ReorderBufferRestoreChanges(ReorderBuffer *rb, ReorderBufferTXN *txn,
  * Convert change from its on-disk format to in-memory format and queue it onto
  * the TXN's ->changes list.
  *
- * Note: although "data" is declared char*, at entry it points to a
- * maxalign'd buffer, making it safe in most of this function to assume
- * that the pointed-to data is suitably aligned for direct access.
+ * If no data was found in the file, close it and set *file to NULL.
  */
 static void
 ReorderBufferRestoreChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
-                           char *data)
+                           TransientBufFile **file)
 {
-    ReorderBufferDiskChange *ondisk;
+    ReorderBufferDiskChange ondisk;
+    bool        no_data;
     ReorderBufferChange *change;
 
-    ondisk = (ReorderBufferDiskChange *) data;
+    /*
+     * Read the statically sized part of a change which has information about
+     * the total size. If we couldn't read a record, we're at the end of this
+     * file.
+     */
+    ReorderBufferReadData(*file, &ondisk, sizeof(ReorderBufferDiskChange),
+                          &no_data);
+
+    /* eof */
+    if (no_data)
+    {
+        BufFileCloseTransient(*file);
+        *file = NULL;
+        return;
+    }
 
     change = ReorderBufferGetChange(rb);
 
     /* copy static part */
-    memcpy(change, &ondisk->change, sizeof(ReorderBufferChange));
-
-    data += sizeof(ReorderBufferDiskChange);
+    memcpy(change, &ondisk.change, sizeof(ReorderBufferChange));
 
     /* restore individual stuff */
     switch (change->action)
@@ -4222,50 +4157,10 @@ ReorderBufferRestoreChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
         case REORDER_BUFFER_CHANGE_DELETE:
         case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
             if (change->data.tp.oldtuple)
-            {
-                uint32        tuplelen = ((HeapTuple) data)->t_len;
-
-                change->data.tp.oldtuple =
-                    ReorderBufferGetTupleBuf(rb, tuplelen - SizeofHeapTupleHeader);
-
-                /* restore ->tuple */
-                memcpy(&change->data.tp.oldtuple->tuple, data,
-                       sizeof(HeapTupleData));
-                data += sizeof(HeapTupleData);
-
-                /* reset t_data pointer into the new tuplebuf */
-                change->data.tp.oldtuple->tuple.t_data =
-                    ReorderBufferTupleBufData(change->data.tp.oldtuple);
-
-                /* restore tuple data itself */
-                memcpy(change->data.tp.oldtuple->tuple.t_data, data, tuplelen);
-                data += tuplelen;
-            }
+                change->data.tp.oldtuple = ReorderBufferRestoreTuple(rb, *file);
 
             if (change->data.tp.newtuple)
-            {
-                /* here, data might not be suitably aligned! */
-                uint32        tuplelen;
-
-                memcpy(&tuplelen, data + offsetof(HeapTupleData, t_len),
-                       sizeof(uint32));
-
-                change->data.tp.newtuple =
-                    ReorderBufferGetTupleBuf(rb, tuplelen - SizeofHeapTupleHeader);
-
-                /* restore ->tuple */
-                memcpy(&change->data.tp.newtuple->tuple, data,
-                       sizeof(HeapTupleData));
-                data += sizeof(HeapTupleData);
-
-                /* reset t_data pointer into the new tuplebuf */
-                change->data.tp.newtuple->tuple.t_data =
-                    ReorderBufferTupleBufData(change->data.tp.newtuple);
-
-                /* restore tuple data itself */
-                memcpy(change->data.tp.newtuple->tuple.t_data, data, tuplelen);
-                data += tuplelen;
-            }
+                change->data.tp.newtuple = ReorderBufferRestoreTuple(rb, *file);
 
             break;
         case REORDER_BUFFER_CHANGE_MESSAGE:
@@ -4273,22 +4168,20 @@ ReorderBufferRestoreChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
                 Size        prefix_size;
 
                 /* read prefix */
-                memcpy(&prefix_size, data, sizeof(Size));
-                data += sizeof(Size);
+                ReorderBufferReadData(*file, &prefix_size, sizeof(Size), NULL);
                 change->data.msg.prefix = MemoryContextAlloc(rb->context,
                                                              prefix_size);
-                memcpy(change->data.msg.prefix, data, prefix_size);
+                ReorderBufferReadData(*file, change->data.msg.prefix,
+                                      prefix_size, NULL);
                 Assert(change->data.msg.prefix[prefix_size - 1] == '\0');
-                data += prefix_size;
 
                 /* read the message */
-                memcpy(&change->data.msg.message_size, data, sizeof(Size));
-                data += sizeof(Size);
+                ReorderBufferReadData(*file, &change->data.msg.message_size,
+                                      sizeof(Size), NULL);
                 change->data.msg.message = MemoryContextAlloc(rb->context,
                                                               change->data.msg.message_size);
-                memcpy(change->data.msg.message, data,
-                       change->data.msg.message_size);
-                data += change->data.msg.message_size;
+                ReorderBufferReadData(*file, change->data.msg.message,
+                                      change->data.msg.message_size, NULL);
 
                 break;
             }
@@ -4301,29 +4194,32 @@ ReorderBufferRestoreChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
                     MemoryContextAlloc(rb->context, inval_size);
 
                 /* read the message */
-                memcpy(change->data.inval.invalidations, data, inval_size);
+                ReorderBufferReadData(*file, change->data.inval.invalidations,
+                                      inval_size, NULL);
 
                 break;
             }
         case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
             {
-                Snapshot    oldsnap;
+                SnapshotData oldsnap;
                 Snapshot    newsnap;
                 Size        size;
 
-                oldsnap = (Snapshot) data;
+                ReorderBufferReadData(*file, &oldsnap, sizeof(SnapshotData), NULL);
 
                 size = sizeof(SnapshotData) +
-                    sizeof(TransactionId) * oldsnap->xcnt +
-                    sizeof(TransactionId) * (oldsnap->subxcnt + 0);
+                    sizeof(TransactionId) * oldsnap.xcnt +
+                    sizeof(TransactionId) * (oldsnap.subxcnt + 0);
 
                 change->data.snapshot = MemoryContextAllocZero(rb->context, size);
 
                 newsnap = change->data.snapshot;
 
-                memcpy(newsnap, data, size);
+                memcpy(newsnap, &oldsnap, sizeof(SnapshotData));
                 newsnap->xip = (TransactionId *)
                     (((char *) newsnap) + sizeof(SnapshotData));
+                ReorderBufferReadData(*file, newsnap->xip,
+                                      size - sizeof(SnapshotData), NULL);
                 newsnap->subxip = newsnap->xip + newsnap->xcnt;
                 newsnap->copied = true;
                 break;
@@ -4335,7 +4231,9 @@ ReorderBufferRestoreChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
 
                 relids = ReorderBufferGetRelids(rb,
                                                 change->data.truncate.nrelids);
-                memcpy(relids, data, change->data.truncate.nrelids * sizeof(Oid));
+                ReorderBufferReadData(*file, relids,
+                                      change->data.truncate.nrelids * sizeof(Oid),
+                                      NULL);
                 change->data.truncate.relids = relids;
 
                 break;
@@ -4362,6 +4260,77 @@ ReorderBufferRestoreChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
                                     ReorderBufferChangeSize(change));
 }
 
+/*
+ * Read one spilled heap tuple from file and rebuild it in memory.
+ *
+ * The stored image is a HeapTupleData header followed by t_len bytes of tuple
+ * data; the result comes from ReorderBufferGetTupleBuf().
+ */
+static ReorderBufferTupleBuf *
+ReorderBufferRestoreTuple(ReorderBuffer *rb, TransientBufFile *file)
+{
+    HeapTupleData header;
+    ReorderBufferTupleBuf *tuplebuf;
+    uint32        datalen;
+
+    /* the fixed-size header tells us how much tuple data follows */
+    ReorderBufferReadData(file, &header, sizeof(HeapTupleData), NULL);
+    datalen = header.t_len;
+    tuplebuf = ReorderBufferGetTupleBuf(rb, datalen - SizeofHeapTupleHeader);
+
+    /* copy the header, then repoint t_data at the new buffer's storage */
+    memcpy(&tuplebuf->tuple, &header, sizeof(HeapTupleData));
+    tuplebuf->tuple.t_data = ReorderBufferTupleBufData(tuplebuf);
+
+    /* finally load the tuple data itself */
+    ReorderBufferReadData(file, tuplebuf->tuple.t_data, datalen, NULL);
+    return tuplebuf;
+}
+
+/*
+ * Read exactly "size" bytes from a reorderbuffer spill file into ptr, using
+ * BufFileReadTransient().
+ *
+ * A short read raises ERROR unless no_data_p is non-NULL and nothing at all
+ * was read; in that case the function returns with *no_data_p set to true.
+ * After a complete read, *no_data_p (if supplied) is set to false. Passing
+ * NULL for no_data_p means that missing data is not tolerated.
+ */
+static void
+ReorderBufferReadData(TransientBufFile *file, void *ptr, size_t size,
+                      bool *no_data_p)
+{
+    int            nread;
+
+    /*
+     * Zero-length requests are disallowed to keep *no_data_p unambiguous.
+     */
+    Assert(size > 0);
+
+    nread = BufFileReadTransient(file, ptr, size);
+
+    /* Complete read: size is non-zero, so some data was received. */
+    if (nread == size)
+    {
+        if (no_data_p)
+            *no_data_p = false;
+        return;
+    }
+
+    /* Reading nothing at all is fine if the caller asked to be told. */
+    if (no_data_p)
+    {
+        *no_data_p = nread == 0;
+        if (*no_data_p)
+            return;
+    }
+
+    ereport(ERROR,
+            (errcode_for_file_access(),
+             errmsg("could not read from reorderbuffer spill file: read %d instead of %u bytes",
+                    nread, (uint32) size)));
+}
+
 /*
  * Remove all on-disk stored for the passed in transaction.
  */
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index a2f75b23b8..af5b329f8a 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -11,6 +11,7 @@
  *            - Add a pgstat config column to pg_database, so this
  *              entire thing can be enabled/disabled on a per db basis.
  *
+ *    Portions Copyright (c) 2019-2021, CYBERTEC PostgreSQL International GmbH
  *    Copyright (c) 2001-2021, PostgreSQL Global Development Group
  *
  *    src/backend/postmaster/pgstat.c
@@ -54,6 +55,7 @@
 #include "replication/slot.h"
 #include "replication/walsender.h"
 #include "storage/backendid.h"
+#include "storage/buffile.h"
 #include "storage/dsm.h"
 #include "storage/fd.h"
 #include "storage/ipc.h"
@@ -308,10 +310,16 @@ NON_EXEC_STATIC void PgstatCollectorMain(int argc, char *argv[]) pg_attribute_no
 static PgStat_StatDBEntry *pgstat_get_db_entry(Oid databaseid, bool create);
 static PgStat_StatTabEntry *pgstat_get_tab_entry(PgStat_StatDBEntry *dbentry,
                                                  Oid tableoid, bool create);
+static void pgstat_tweak_base(bool permanent, bool global, Oid database,
+                              char tweak_base[TWEAK_BASE_SIZE]);
 static void pgstat_write_statsfiles(bool permanent, bool allDbs);
 static void pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry, bool permanent);
+static void pgstat_write_bytes(TransientBufFile *file, void *ptr, size_t size,
+                               bool *failed);
 static HTAB *pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep);
 static void pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash, bool permanent);
+static void pgstat_read_bytes(TransientBufFile *file, void *ptr, size_t size,
+                              bool *failed);
 static void backend_read_statsfile(void);
 
 static bool pgstat_write_statsfile_needed(void);
@@ -3202,6 +3210,9 @@ PgstatCollectorMain(int argc, char *argv[])
     MyBackendType = B_STATS_COLLECTOR;
     init_ps_display(NULL);
 
+    /* BufFileOpenTransient() and friends do use VFD. */
+    InitFileAccess();
+
     /*
      * Read in existing stats files or initialize the stats to zero.
      */
@@ -3595,6 +3606,26 @@ pgstat_get_tab_entry(PgStat_StatDBEntry *dbentry, Oid tableoid, bool create)
     return result;
 }
 
+/*
+ * Build the common part of the encryption tweak in tweak_base: one byte each
+ * for the file kind, the "permanent" flag and the "global" flag, followed by
+ * the database OID. All remaining bytes are zero.
+ */
+static void
+pgstat_tweak_base(bool permanent, bool global, Oid database,
+                  char tweak_base[TWEAK_BASE_SIZE])
+{
+    StaticAssertStmt(3 + sizeof(Oid) <= TWEAK_BASE_SIZE,
+                     "tweak components do not fit into TWEAK_BASE_SIZE");
+
+    /* Start from all zeroes so that the unused trailing bytes are defined. */
+    memset(tweak_base, 0, TWEAK_BASE_SIZE);
+
+    tweak_base[0] = TRANS_BUF_FILE_PGSTATS;
+    tweak_base[1] = permanent;
+    tweak_base[2] = global;
+    memcpy(&tweak_base[3], &database, sizeof(Oid));
+}
 
 /* ----------
  * pgstat_write_statsfiles() -
@@ -3615,18 +3646,24 @@ pgstat_write_statsfiles(bool permanent, bool allDbs)
 {
     HASH_SEQ_STATUS hstat;
     PgStat_StatDBEntry *dbentry;
-    FILE       *fpout;
+    TransientBufFile       *fpout;
+    File    vfd;
     int32        format_id;
     const char *tmpfile = permanent ? PGSTAT_STAT_PERMANENT_TMPFILE : pgstat_stat_tmpname;
     const char *statfile = permanent ? PGSTAT_STAT_PERMANENT_FILENAME : pgstat_stat_filename;
-    int            rc;
+    bool    failed = false;
+    char tweak_base[TWEAK_BASE_SIZE];
 
     elog(DEBUG2, "writing stats file \"%s\"", statfile);
 
     /*
      * Open the statistics temp file to write out the current values.
      */
-    fpout = AllocateFile(tmpfile, PG_BINARY_W);
+    if (data_encrypted)
+        pgstat_tweak_base(permanent, true, InvalidOid, tweak_base);
+    fpout = BufFileOpenTransient(tmpfile,
+                                 O_CREAT | O_WRONLY | O_APPEND | PG_BINARY,
+                                 tweak_base);
     if (fpout == NULL)
     {
         ereport(LOG,
@@ -3645,32 +3682,27 @@ pgstat_write_statsfiles(bool permanent, bool allDbs)
      * Write the file header --- currently just a format ID.
      */
     format_id = PGSTAT_FILE_FORMAT_ID;
-    rc = fwrite(&format_id, sizeof(format_id), 1, fpout);
-    (void) rc;                    /* we'll check for error with ferror */
+    pgstat_write_bytes(fpout, &format_id, sizeof(format_id), &failed);
 
     /*
      * Write global stats struct
      */
-    rc = fwrite(&globalStats, sizeof(globalStats), 1, fpout);
-    (void) rc;                    /* we'll check for error with ferror */
+    pgstat_write_bytes(fpout, &globalStats, sizeof(globalStats), &failed);
 
     /*
      * Write archiver stats struct
      */
-    rc = fwrite(&archiverStats, sizeof(archiverStats), 1, fpout);
-    (void) rc;                    /* we'll check for error with ferror */
+    pgstat_write_bytes(fpout, &archiverStats, sizeof(archiverStats), &failed);
 
     /*
      * Write WAL stats struct
      */
-    rc = fwrite(&walStats, sizeof(walStats), 1, fpout);
-    (void) rc;                    /* we'll check for error with ferror */
+    pgstat_write_bytes(fpout, &walStats, sizeof(walStats), &failed);
 
     /*
      * Write SLRU stats struct
      */
-    rc = fwrite(slruStats, sizeof(slruStats), 1, fpout);
-    (void) rc;                    /* we'll check for error with ferror */
+    pgstat_write_bytes(fpout, slruStats, sizeof(slruStats), &failed);
 
     /*
      * Walk through the database table.
@@ -3694,9 +3726,11 @@ pgstat_write_statsfiles(bool permanent, bool allDbs)
          * Write out the DB entry. We don't write the tables or functions
          * pointers, since they're of no use to any other process.
          */
-        fputc('D', fpout);
-        rc = fwrite(dbentry, offsetof(PgStat_StatDBEntry, tables), 1, fpout);
-        (void) rc;                /* we'll check for error with ferror */
+        pgstat_write_bytes(fpout, "D", 1, &failed);
+        pgstat_write_bytes(fpout,
+                           dbentry,
+                           offsetof(PgStat_StatDBEntry, tables),
+                           &failed);
     }
 
     /*
@@ -3709,29 +3743,36 @@ pgstat_write_statsfiles(bool permanent, bool allDbs)
         hash_seq_init(&hstat, replSlotStatHash);
         while ((slotent = (PgStat_StatReplSlotEntry *) hash_seq_search(&hstat)) != NULL)
         {
-            fputc('R', fpout);
-            rc = fwrite(slotent, sizeof(PgStat_StatReplSlotEntry), 1, fpout);
-            (void) rc;            /* we'll check for error with ferror */
+            pgstat_write_bytes(fpout, "R", 1, &failed);
+            pgstat_write_bytes(fpout, slotent, sizeof(PgStat_StatReplSlotEntry),
+                               &failed);
         }
     }
 
     /*
      * No more output to be done. Close the temp file and replace the old
-     * pgstat.stat with it.  The ferror() check replaces testing for error
-     * after each individual fputc or fwrite above.
+     * pgstat.stat with it.
      */
-    fputc('E', fpout);
+    pgstat_write_bytes(fpout, "E", 1, &failed);
 
-    if (ferror(fpout))
+    if (failed)
     {
         ereport(LOG,
                 (errcode_for_file_access(),
                  errmsg("could not write temporary statistics file \"%s\": %m",
                         tmpfile)));
-        FreeFile(fpout);
+        BufFileCloseTransient(fpout);
         unlink(tmpfile);
+        return;
     }
-    else if (FreeFile(fpout) < 0)
+
+    /*
+     * XXX This might PANIC, see FileClose(). Don't we need special behaviour
+     * for statistics?
+     */
+    vfd = BufFileTransientGetVfd(fpout);
+    BufFileCloseTransient(fpout);
+    if (!FileIsClosed(vfd))
     {
         ereport(LOG,
                 (errcode_for_file_access(),
@@ -3796,12 +3837,14 @@ pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry, bool permanent)
     HASH_SEQ_STATUS fstat;
     PgStat_StatTabEntry *tabentry;
     PgStat_StatFuncEntry *funcentry;
-    FILE       *fpout;
+    TransientBufFile       *fpout;
+    File    vfd;
     int32        format_id;
     Oid            dbid = dbentry->databaseid;
-    int            rc;
     char        tmpfile[MAXPGPATH];
     char        statfile[MAXPGPATH];
+    bool    failed = false;
+    char tweak_base[TWEAK_BASE_SIZE];
 
     get_dbstat_filename(permanent, true, dbid, tmpfile, MAXPGPATH);
     get_dbstat_filename(permanent, false, dbid, statfile, MAXPGPATH);
@@ -3811,7 +3854,11 @@ pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry, bool permanent)
     /*
      * Open the statistics temp file to write out the current values.
      */
-    fpout = AllocateFile(tmpfile, PG_BINARY_W);
+    if (data_encrypted)
+        pgstat_tweak_base(permanent, false, dbid, tweak_base);
+    fpout = BufFileOpenTransient(tmpfile,
+                                 O_CREAT | O_WRONLY | O_APPEND | PG_BINARY,
+                                 tweak_base);
     if (fpout == NULL)
     {
         ereport(LOG,
@@ -3825,8 +3872,7 @@ pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry, bool permanent)
      * Write the file header --- currently just a format ID.
      */
     format_id = PGSTAT_FILE_FORMAT_ID;
-    rc = fwrite(&format_id, sizeof(format_id), 1, fpout);
-    (void) rc;                    /* we'll check for error with ferror */
+    pgstat_write_bytes(fpout, &format_id, sizeof(format_id), &failed);
 
     /*
      * Walk through the database's access stats per table.
@@ -3834,9 +3880,14 @@ pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry, bool permanent)
     hash_seq_init(&tstat, dbentry->tables);
     while ((tabentry = (PgStat_StatTabEntry *) hash_seq_search(&tstat)) != NULL)
     {
-        fputc('T', fpout);
-        rc = fwrite(tabentry, sizeof(PgStat_StatTabEntry), 1, fpout);
-        (void) rc;                /* we'll check for error with ferror */
+        pgstat_write_bytes(fpout, "T", 1, &failed);
+        if (failed)
+            break;
+
+        pgstat_write_bytes(fpout, tabentry, sizeof(PgStat_StatTabEntry),
+                           &failed);
+        if (failed)
+            break;
     }
 
     /*
@@ -3845,28 +3896,42 @@ pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry, bool permanent)
     hash_seq_init(&fstat, dbentry->functions);
     while ((funcentry = (PgStat_StatFuncEntry *) hash_seq_search(&fstat)) != NULL)
     {
-        fputc('F', fpout);
-        rc = fwrite(funcentry, sizeof(PgStat_StatFuncEntry), 1, fpout);
-        (void) rc;                /* we'll check for error with ferror */
+        pgstat_write_bytes(fpout, "F", 1, &failed);
+        if (failed)
+            break;
+
+        pgstat_write_bytes(fpout,
+                           funcentry,
+                           sizeof(PgStat_StatFuncEntry),
+                           &failed);
+        if (failed)
+            break;
     }
 
     /*
      * No more output to be done. Close the temp file and replace the old
-     * pgstat.stat with it.  The ferror() check replaces testing for error
-     * after each individual fputc or fwrite above.
+     * pgstat.stat with it.
      */
-    fputc('E', fpout);
+    pgstat_write_bytes(fpout, "E", 1, &failed);
 
-    if (ferror(fpout))
+    if (failed)
     {
         ereport(LOG,
                 (errcode_for_file_access(),
                  errmsg("could not write temporary statistics file \"%s\": %m",
                         tmpfile)));
-        FreeFile(fpout);
+        BufFileCloseTransient(fpout);
         unlink(tmpfile);
+        return;
     }
-    else if (FreeFile(fpout) < 0)
+
+    /*
+     * XXX This might PANIC, see FileClose(). Don't we need special behaviour
+     * for statistics?
+     */
+    vfd = BufFileTransientGetVfd(fpout);
+    BufFileCloseTransient(fpout);
+    if (!FileIsClosed(vfd))
     {
         ereport(LOG,
                 (errcode_for_file_access(),
@@ -3892,6 +3957,25 @@ pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry, bool permanent)
     }
 }
 
+/*
+ * Write "size" bytes at ptr to file, noting a short write in *failed.
+ *
+ * The flag is sticky: once it is set, further calls write nothing.
+ */
+static void
+pgstat_write_bytes(TransientBufFile *file, void *ptr, size_t size,
+    bool *failed)
+{
+    if (!*failed)
+    {
+        /*
+         * Go through BufFileWriteTransient() because it handles encryption
+         * transparently.
+         */
+        *failed = BufFileWriteTransient(file, ptr, size) != size;
+    }
+}
+
 /* ----------
  * pgstat_read_statsfiles() -
  *
@@ -3919,10 +4003,12 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep)
     PgStat_StatDBEntry dbbuf;
     HASHCTL        hash_ctl;
     HTAB       *dbhash;
-    FILE       *fpin;
+    TransientBufFile *fpin;
     int32        format_id;
     bool        found;
     const char *statfile = permanent ? PGSTAT_STAT_PERMANENT_FILENAME : pgstat_stat_filename;
+    bool    failed = false;
+    char tweak_base[TWEAK_BASE_SIZE];
     int            i;
 
     /*
@@ -3971,7 +4057,11 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep)
      * not yet written the stats file the first time.  Any other failure
      * condition is suspicious.
      */
-    if ((fpin = AllocateFile(statfile, PG_BINARY_R)) == NULL)
+    if (data_encrypted)
+        pgstat_tweak_base(permanent, true, InvalidOid, tweak_base);
+    if ((fpin = BufFileOpenTransient(statfile,
+                                     O_RDONLY | PG_BINARY,
+                                     tweak_base)) == NULL)
     {
         if (errno != ENOENT)
             ereport(pgStatRunningInCollector ? LOG : WARNING,
@@ -3984,8 +4074,8 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep)
     /*
      * Verify it's of the expected format.
      */
-    if (fread(&format_id, 1, sizeof(format_id), fpin) != sizeof(format_id) ||
-        format_id != PGSTAT_FILE_FORMAT_ID)
+    pgstat_read_bytes(fpin, &format_id, sizeof(format_id), &failed);
+    if (failed || format_id != PGSTAT_FILE_FORMAT_ID)
     {
         ereport(pgStatRunningInCollector ? LOG : WARNING,
                 (errmsg("corrupted statistics file \"%s\"", statfile)));
@@ -3995,7 +4085,8 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep)
     /*
      * Read global stats struct
      */
-    if (fread(&globalStats, 1, sizeof(globalStats), fpin) != sizeof(globalStats))
+    pgstat_read_bytes(fpin, &globalStats, sizeof(globalStats), &failed);
+    if (failed)
     {
         ereport(pgStatRunningInCollector ? LOG : WARNING,
                 (errmsg("corrupted statistics file \"%s\"", statfile)));
@@ -4016,7 +4107,8 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep)
     /*
      * Read archiver stats struct
      */
-    if (fread(&archiverStats, 1, sizeof(archiverStats), fpin) != sizeof(archiverStats))
+    pgstat_read_bytes(fpin, &archiverStats, sizeof(archiverStats), &failed);
+    if (failed)
     {
         ereport(pgStatRunningInCollector ? LOG : WARNING,
                 (errmsg("corrupted statistics file \"%s\"", statfile)));
@@ -4027,7 +4119,8 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep)
     /*
      * Read WAL stats struct
      */
-    if (fread(&walStats, 1, sizeof(walStats), fpin) != sizeof(walStats))
+    pgstat_read_bytes(fpin, &walStats, sizeof(walStats), &failed);
+    if (failed)
     {
         ereport(pgStatRunningInCollector ? LOG : WARNING,
                 (errmsg("corrupted statistics file \"%s\"", statfile)));
@@ -4038,7 +4131,8 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep)
     /*
      * Read SLRU stats struct
      */
-    if (fread(slruStats, 1, sizeof(slruStats), fpin) != sizeof(slruStats))
+    pgstat_read_bytes(fpin, &slruStats, sizeof(slruStats), &failed);
+    if (failed)
     {
         ereport(pgStatRunningInCollector ? LOG : WARNING,
                 (errmsg("corrupted statistics file \"%s\"", statfile)));
@@ -4052,15 +4146,22 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep)
      */
     for (;;)
     {
-        switch (fgetc(fpin))
+        char    c;
+
+        pgstat_read_bytes(fpin, &c, 1, &failed);
+
+        switch (c)
         {
                 /*
                  * 'D'    A PgStat_StatDBEntry struct describing a database
                  * follows.
                  */
             case 'D':
-                if (fread(&dbbuf, 1, offsetof(PgStat_StatDBEntry, tables),
-                          fpin) != offsetof(PgStat_StatDBEntry, tables))
+                pgstat_read_bytes(fpin,
+                                  &dbbuf,
+                                  offsetof(PgStat_StatDBEntry, tables),
+                                  &failed);
+                if (failed)
                 {
                     ereport(pgStatRunningInCollector ? LOG : WARNING,
                             (errmsg("corrupted statistics file \"%s\"",
@@ -4144,8 +4245,10 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep)
                     PgStat_StatReplSlotEntry slotbuf;
                     PgStat_StatReplSlotEntry *slotent;
 
-                    if (fread(&slotbuf, 1, sizeof(PgStat_StatReplSlotEntry), fpin)
-                        != sizeof(PgStat_StatReplSlotEntry))
+                    pgstat_read_bytes(fpin, &slotbuf,
+                                      sizeof(PgStat_StatReplSlotEntry),
+                                      &failed);
+                    if (failed)
                     {
                         ereport(pgStatRunningInCollector ? LOG : WARNING,
                                 (errmsg("corrupted statistics file \"%s\"",
@@ -4186,7 +4289,7 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep)
     }
 
 done:
-    FreeFile(fpin);
+    BufFileCloseTransient(fpin);
 
     /* If requested to read the permanent file, also get rid of it. */
     if (permanent)
@@ -4221,10 +4324,12 @@ pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash,
     PgStat_StatTabEntry tabbuf;
     PgStat_StatFuncEntry funcbuf;
     PgStat_StatFuncEntry *funcentry;
-    FILE       *fpin;
+    TransientBufFile       *fpin;
     int32        format_id;
     bool        found;
     char        statfile[MAXPGPATH];
+    bool    failed = false;
+    char tweak_base[TWEAK_BASE_SIZE];
 
     get_dbstat_filename(permanent, false, databaseid, statfile, MAXPGPATH);
 
@@ -4237,7 +4342,11 @@ pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash,
      * not yet written the stats file the first time.  Any other failure
      * condition is suspicious.
      */
-    if ((fpin = AllocateFile(statfile, PG_BINARY_R)) == NULL)
+    if (data_encrypted)
+        pgstat_tweak_base(permanent, false, databaseid, tweak_base);
+    if ((fpin = BufFileOpenTransient(statfile,
+                                     O_RDONLY | PG_BINARY,
+                                     tweak_base)) == NULL)
     {
         if (errno != ENOENT)
             ereport(pgStatRunningInCollector ? LOG : WARNING,
@@ -4250,8 +4359,8 @@ pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash,
     /*
      * Verify it's of the expected format.
      */
-    if (fread(&format_id, 1, sizeof(format_id), fpin) != sizeof(format_id) ||
-        format_id != PGSTAT_FILE_FORMAT_ID)
+    pgstat_read_bytes(fpin, &format_id, sizeof(format_id), &failed);
+    if (failed || format_id != PGSTAT_FILE_FORMAT_ID)
     {
         ereport(pgStatRunningInCollector ? LOG : WARNING,
                 (errmsg("corrupted statistics file \"%s\"", statfile)));
@@ -4264,14 +4373,19 @@ pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash,
      */
     for (;;)
     {
-        switch (fgetc(fpin))
+        char    c;
+
+        pgstat_read_bytes(fpin, &c, 1, &failed);
+
+        switch (c)
         {
                 /*
                  * 'T'    A PgStat_StatTabEntry follows.
                  */
             case 'T':
-                if (fread(&tabbuf, 1, sizeof(PgStat_StatTabEntry),
-                          fpin) != sizeof(PgStat_StatTabEntry))
+                pgstat_read_bytes(fpin, &tabbuf, sizeof(PgStat_StatTabEntry),
+                                  &failed);
+                if (failed)
                 {
                     ereport(pgStatRunningInCollector ? LOG : WARNING,
                             (errmsg("corrupted statistics file \"%s\"",
@@ -4304,8 +4418,11 @@ pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash,
                  * 'F'    A PgStat_StatFuncEntry follows.
                  */
             case 'F':
-                if (fread(&funcbuf, 1, sizeof(PgStat_StatFuncEntry),
-                          fpin) != sizeof(PgStat_StatFuncEntry))
+                pgstat_read_bytes(fpin,
+                                  &funcbuf,
+                                  sizeof(PgStat_StatFuncEntry),
+                                  &failed);
+                if (failed)
                 {
                     ereport(pgStatRunningInCollector ? LOG : WARNING,
                             (errmsg("corrupted statistics file \"%s\"",
@@ -4349,7 +4466,7 @@ pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash,
     }
 
 done:
-    FreeFile(fpin);
+    BufFileCloseTransient(fpin);
 
     if (permanent)
     {
@@ -4386,15 +4503,21 @@ pgstat_read_db_statsfile_timestamp(Oid databaseid, bool permanent,
     PgStat_WalStats myWalStats;
     PgStat_SLRUStats mySLRUStats[SLRU_NUM_ELEMENTS];
     PgStat_StatReplSlotEntry myReplSlotStats;
-    FILE       *fpin;
+    TransientBufFile       *fpin;
     int32        format_id;
     const char *statfile = permanent ? PGSTAT_STAT_PERMANENT_FILENAME : pgstat_stat_filename;
+    bool    failed = false;
+    char tweak_base[TWEAK_BASE_SIZE];
 
     /*
      * Try to open the stats file.  As above, anything but ENOENT is worthy of
      * complaining about.
      */
-    if ((fpin = AllocateFile(statfile, PG_BINARY_R)) == NULL)
+    if (data_encrypted)
+        pgstat_tweak_base(permanent, true, InvalidOid, tweak_base);
+    if ((fpin = BufFileOpenTransient(statfile,
+                                     O_RDONLY | PG_BINARY,
+                                     tweak_base)) == NULL)
     {
         if (errno != ENOENT)
             ereport(pgStatRunningInCollector ? LOG : WARNING,
@@ -4407,58 +4530,62 @@ pgstat_read_db_statsfile_timestamp(Oid databaseid, bool permanent,
     /*
      * Verify it's of the expected format.
      */
-    if (fread(&format_id, 1, sizeof(format_id), fpin) != sizeof(format_id) ||
-        format_id != PGSTAT_FILE_FORMAT_ID)
+    pgstat_read_bytes(fpin, &format_id, sizeof(format_id), &failed);
+    if (failed || format_id != PGSTAT_FILE_FORMAT_ID)
     {
         ereport(pgStatRunningInCollector ? LOG : WARNING,
                 (errmsg("corrupted statistics file \"%s\"", statfile)));
-        FreeFile(fpin);
+        BufFileCloseTransient(fpin);
         return false;
     }
 
     /*
      * Read global stats struct
      */
-    if (fread(&myGlobalStats, 1, sizeof(myGlobalStats),
-              fpin) != sizeof(myGlobalStats))
+    pgstat_read_bytes(fpin, &myGlobalStats, sizeof(myGlobalStats), &failed);
+    if (failed)
     {
         ereport(pgStatRunningInCollector ? LOG : WARNING,
                 (errmsg("corrupted statistics file \"%s\"", statfile)));
-        FreeFile(fpin);
+        BufFileCloseTransient(fpin);
         return false;
     }
 
     /*
      * Read archiver stats struct
      */
-    if (fread(&myArchiverStats, 1, sizeof(myArchiverStats),
-              fpin) != sizeof(myArchiverStats))
+    pgstat_read_bytes(fpin, &myArchiverStats, sizeof(myArchiverStats),
+                      &failed);
+    if (failed)
     {
         ereport(pgStatRunningInCollector ? LOG : WARNING,
                 (errmsg("corrupted statistics file \"%s\"", statfile)));
-        FreeFile(fpin);
+        BufFileCloseTransient(fpin);
         return false;
     }
 
     /*
      * Read WAL stats struct
      */
-    if (fread(&myWalStats, 1, sizeof(myWalStats), fpin) != sizeof(myWalStats))
+    pgstat_read_bytes(fpin, &myWalStats, sizeof(myWalStats), &failed);
+    if (failed)
     {
         ereport(pgStatRunningInCollector ? LOG : WARNING,
                 (errmsg("corrupted statistics file \"%s\"", statfile)));
-        FreeFile(fpin);
+        BufFileCloseTransient(fpin);
         return false;
     }
 
     /*
      * Read SLRU stats struct
      */
-    if (fread(mySLRUStats, 1, sizeof(mySLRUStats), fpin) != sizeof(mySLRUStats))
+    pgstat_read_bytes(fpin, mySLRUStats, sizeof(mySLRUStats),
+                      &failed);
+    if (failed)
     {
         ereport(pgStatRunningInCollector ? LOG : WARNING,
                 (errmsg("corrupted statistics file \"%s\"", statfile)));
-        FreeFile(fpin);
+        BufFileCloseTransient(fpin);
         return false;
     }
 
@@ -4471,20 +4598,27 @@ pgstat_read_db_statsfile_timestamp(Oid databaseid, bool permanent,
      */
     for (;;)
     {
-        switch (fgetc(fpin))
+        char    c;
+
+        pgstat_read_bytes(fpin, &c, 1, &failed);
+
+        switch (c)
         {
                 /*
                  * 'D'    A PgStat_StatDBEntry struct describing a database
                  * follows.
                  */
             case 'D':
-                if (fread(&dbentry, 1, offsetof(PgStat_StatDBEntry, tables),
-                          fpin) != offsetof(PgStat_StatDBEntry, tables))
+                pgstat_read_bytes(fpin,
+                                  &dbentry,
+                                  offsetof(PgStat_StatDBEntry, tables),
+                                  &failed);
+                if (failed)
                 {
                     ereport(pgStatRunningInCollector ? LOG : WARNING,
                             (errmsg("corrupted statistics file \"%s\"",
                                     statfile)));
-                    FreeFile(fpin);
+                    BufFileCloseTransient(fpin);
                     return false;
                 }
 
@@ -4505,13 +4639,14 @@ pgstat_read_db_statsfile_timestamp(Oid databaseid, bool permanent,
                  * replication slot follows.
                  */
             case 'R':
-                if (fread(&myReplSlotStats, 1, sizeof(PgStat_StatReplSlotEntry), fpin)
-                    != sizeof(PgStat_StatReplSlotEntry))
+                pgstat_read_bytes(fpin, &myReplSlotStats,
+                                  sizeof(PgStat_StatReplSlotEntry), &failed);
+                if (failed)
                 {
                     ereport(pgStatRunningInCollector ? LOG : WARNING,
                             (errmsg("corrupted statistics file \"%s\"",
                                     statfile)));
-                    FreeFile(fpin);
+                    BufFileCloseTransient(fpin);
                     return false;
                 }
                 break;
@@ -4524,17 +4659,35 @@ pgstat_read_db_statsfile_timestamp(Oid databaseid, bool permanent,
                     ereport(pgStatRunningInCollector ? LOG : WARNING,
                             (errmsg("corrupted statistics file \"%s\"",
                                     statfile)));
-                    FreeFile(fpin);
+                    BufFileCloseTransient(fpin);
                     return false;
                 }
         }
     }
 
 done:
-    FreeFile(fpin);
+    BufFileCloseTransient(fpin);
     return true;
 }
 
+/*
+ * Read exactly "size" bytes from file into ptr, setting *failed on a short
+ * read. Once *failed is set, later calls read nothing. On any failure the
+ * destination is zeroed so callers never examine indeterminate memory (the
+ * record-type byte is switch()ed on before "failed" is checked).
+ */
+static void
+pgstat_read_bytes(TransientBufFile *file, void *ptr, size_t size,
+                  bool *failed)
+{
+    /* BufFileReadTransient() handles encryption transparently. */
+    if (!*failed && BufFileReadTransient(file, ptr, size) != size)
+        *failed = true;
+
+    if (*failed)
+        memset(ptr, 0, size);
+}
+
 /*
  * If not already done, read the statistics collector stats file into
  * some hash tables.  The results will be kept until pgstat_clear_snapshot()

pgsql-hackers by date:

Previous
From: Andy Fan
Date:
Subject: Can I assume relation would not be invalid during from ExecutorRun to ExecutorEnd
Next
From: Dilip Kumar
Date:
Subject: Re: Synchronizing slots from primary to standby