Re: pg_verify_checksums and -fno-strict-aliasing - Mailing list pgsql-hackers

From Tom Lane
Subject Re: pg_verify_checksums and -fno-strict-aliasing
Date
Msg-id 21037.1535745351@sss.pgh.pa.us
Whole thread Raw
In response to Re: pg_verify_checksums and -fno-strict-aliasing  (Tom Lane <tgl@sss.pgh.pa.us>)
Responses Re: pg_verify_checksums and -fno-strict-aliasing
List pgsql-hackers
I wrote:
> Some of these places might be performance-critical enough that adding
> a palloc/pfree cycle would not be nice.  What I was considering doing
> was inventing something like
>
> typedef union PGAlignedBuffer
> {
>     char    data[BLCKSZ];
>     double    force_align;
> } PGAlignedBuffer;

Here's a proposed patch using that approach.

Although some of the places that were using "char buf[BLCKSZ]" variables
weren't doing anything that really requires better alignment, I made
almost all of them use PGAlignedBuffer variables anyway, on the grounds
that you typically get better memcpy speed for aligned than unaligned
transfers.

I also fixed a few places that were using the palloc solution, and
one that was actually doing hand-alignment of the pointer (ugh).
I didn't try to be thorough about getting rid of such pallocs,
just fix places that seemed likely to be performance-critical.

This needs to be back-patched as relevant, of course.

            regards, tom lane

diff --git a/contrib/bloom/blinsert.c b/contrib/bloom/blinsert.c
index 4afdea7..5ae1dda 100644
--- a/contrib/bloom/blinsert.c
+++ b/contrib/bloom/blinsert.c
@@ -36,7 +36,7 @@ typedef struct
     int64        indtuples;        /* total number of tuples indexed */
     MemoryContext tmpCtx;        /* temporary memory context reset after each
                                  * tuple */
-    char        data[BLCKSZ];    /* cached page */
+    PGAlignedBuffer data;        /* cached page */
     int            count;            /* number of tuples in cached page */
 } BloomBuildState;

@@ -52,7 +52,7 @@ flushCachedPage(Relation index, BloomBuildState *buildstate)

     state = GenericXLogStart(index);
     page = GenericXLogRegisterBuffer(state, buffer, GENERIC_XLOG_FULL_IMAGE);
-    memcpy(page, buildstate->data, BLCKSZ);
+    memcpy(page, buildstate->data.data, BLCKSZ);
     GenericXLogFinish(state);
     UnlockReleaseBuffer(buffer);
 }
@@ -63,8 +63,8 @@ flushCachedPage(Relation index, BloomBuildState *buildstate)
 static void
 initCachedPage(BloomBuildState *buildstate)
 {
-    memset(buildstate->data, 0, BLCKSZ);
-    BloomInitPage(buildstate->data, 0);
+    memset(buildstate->data.data, 0, BLCKSZ);
+    BloomInitPage(buildstate->data.data, 0);
     buildstate->count = 0;
 }

@@ -84,7 +84,7 @@ bloomBuildCallback(Relation index, HeapTuple htup, Datum *values,
     itup = BloomFormTuple(&buildstate->blstate, &htup->t_self, values, isnull);

     /* Try to add next item to cached page */
-    if (BloomPageAddItem(&buildstate->blstate, buildstate->data, itup))
+    if (BloomPageAddItem(&buildstate->blstate, buildstate->data.data, itup))
     {
         /* Next item was added successfully */
         buildstate->count++;
@@ -98,7 +98,7 @@ bloomBuildCallback(Relation index, HeapTuple htup, Datum *values,

         initCachedPage(buildstate);

-        if (!BloomPageAddItem(&buildstate->blstate, buildstate->data, itup))
+        if (!BloomPageAddItem(&buildstate->blstate, buildstate->data.data, itup))
         {
             /* We shouldn't be here since we're inserting to the empty page */
             elog(ERROR, "could not add new bloom tuple to empty page");
diff --git a/contrib/pg_prewarm/pg_prewarm.c b/contrib/pg_prewarm/pg_prewarm.c
index 3cbb7c2..1a00c59 100644
--- a/contrib/pg_prewarm/pg_prewarm.c
+++ b/contrib/pg_prewarm/pg_prewarm.c
@@ -36,7 +36,7 @@ typedef enum
     PREWARM_BUFFER
 } PrewarmType;

-static char blockbuffer[BLCKSZ];
+static PGAlignedBuffer blockbuffer;

 /*
  * pg_prewarm(regclass, mode text, fork text,
@@ -178,7 +178,7 @@ pg_prewarm(PG_FUNCTION_ARGS)
         for (block = first_block; block <= last_block; ++block)
         {
             CHECK_FOR_INTERRUPTS();
-            smgrread(rel->rd_smgr, forkNumber, block, blockbuffer);
+            smgrread(rel->rd_smgr, forkNumber, block, blockbuffer.data);
             ++blocks_done;
         }
     }
diff --git a/src/backend/access/gin/ginentrypage.c b/src/backend/access/gin/ginentrypage.c
index 8107697..ef5c367 100644
--- a/src/backend/access/gin/ginentrypage.c
+++ b/src/backend/access/gin/ginentrypage.c
@@ -616,7 +616,7 @@ entrySplitPage(GinBtree btree, Buffer origbuf,
     Page        lpage = PageGetTempPageCopy(BufferGetPage(origbuf));
     Page        rpage = PageGetTempPageCopy(BufferGetPage(origbuf));
     Size        pageSize = PageGetPageSize(lpage);
-    char        tupstore[2 * BLCKSZ];
+    PGAlignedBuffer tupstore[2];    /* could need 2 pages' worth of tuples */

     entryPreparePage(btree, lpage, off, insertData, updateblkno);

@@ -625,7 +625,7 @@ entrySplitPage(GinBtree btree, Buffer origbuf,
      * one after another in a temporary workspace.
      */
     maxoff = PageGetMaxOffsetNumber(lpage);
-    ptr = tupstore;
+    ptr = tupstore[0].data;
     for (i = FirstOffsetNumber; i <= maxoff; i++)
     {
         if (i == off)
@@ -658,7 +658,7 @@ entrySplitPage(GinBtree btree, Buffer origbuf,
     GinInitPage(rpage, GinPageGetOpaque(lpage)->flags, pageSize);
     GinInitPage(lpage, GinPageGetOpaque(rpage)->flags, pageSize);

-    ptr = tupstore;
+    ptr = tupstore[0].data;
     maxoff++;
     lsize = 0;

diff --git a/src/backend/access/gin/ginfast.c b/src/backend/access/gin/ginfast.c
index e32807e..915b9ca 100644
--- a/src/backend/access/gin/ginfast.c
+++ b/src/backend/access/gin/ginfast.c
@@ -64,18 +64,15 @@ writeListPage(Relation index, Buffer buffer,
                 size = 0;
     OffsetNumber l,
                 off;
-    char       *workspace;
+    PGAlignedBuffer workspace;
     char       *ptr;

-    /* workspace could be a local array; we use palloc for alignment */
-    workspace = palloc(BLCKSZ);
-
     START_CRIT_SECTION();

     GinInitBuffer(buffer, GIN_LIST);

     off = FirstOffsetNumber;
-    ptr = workspace;
+    ptr = workspace.data;

     for (i = 0; i < ntuples; i++)
     {
@@ -127,7 +124,7 @@ writeListPage(Relation index, Buffer buffer,
         XLogRegisterData((char *) &data, sizeof(ginxlogInsertListPage));

         XLogRegisterBuffer(0, buffer, REGBUF_WILL_INIT);
-        XLogRegisterBufData(0, workspace, size);
+        XLogRegisterBufData(0, workspace.data, size);

         recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_INSERT_LISTPAGE);
         PageSetLSN(page, recptr);
@@ -140,8 +137,6 @@ writeListPage(Relation index, Buffer buffer,

     END_CRIT_SECTION();

-    pfree(workspace);
-
     return freesize;
 }

diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c
index 3ec29a5..27d86b9 100644
--- a/src/backend/access/hash/hashpage.c
+++ b/src/backend/access/hash/hashpage.c
@@ -1000,7 +1000,7 @@ static bool
 _hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks)
 {
     BlockNumber lastblock;
-    char        zerobuf[BLCKSZ];
+    PGAlignedBuffer zerobuf;
     Page        page;
     HashPageOpaque ovflopaque;

@@ -1013,7 +1013,7 @@ _hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks)
     if (lastblock < firstblock || lastblock == InvalidBlockNumber)
         return false;

-    page = (Page) zerobuf;
+    page = (Page) zerobuf.data;

     /*
      * Initialize the page.  Just zeroing the page won't work; see
@@ -1034,11 +1034,11 @@ _hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks)
         log_newpage(&rel->rd_node,
                     MAIN_FORKNUM,
                     lastblock,
-                    zerobuf,
+                    zerobuf.data,
                     true);

     RelationOpenSmgr(rel);
-    smgrextend(rel->rd_smgr, MAIN_FORKNUM, lastblock, zerobuf, false);
+    smgrextend(rel->rd_smgr, MAIN_FORKNUM, lastblock, zerobuf.data, false);

     return true;
 }
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index b8bfe23..27e8b01 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -2709,7 +2709,7 @@ heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples,
     HeapTuple  *heaptuples;
     int            i;
     int            ndone;
-    char       *scratch = NULL;
+    PGAlignedBuffer scratch;
     Page        page;
     bool        needwal;
     Size        saveFreeSpace;
@@ -2727,14 +2727,6 @@ heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples,
                                             xid, cid, options);

     /*
-     * Allocate some memory to use for constructing the WAL record. Using
-     * palloc() within a critical section is not safe, so we allocate this
-     * beforehand.
-     */
-    if (needwal)
-        scratch = palloc(BLCKSZ);
-
-    /*
      * We're about to do the actual inserts -- but check for conflict first,
      * to minimize the possibility of having to roll back work we've just
      * done.
@@ -2826,7 +2818,7 @@ heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples,
             uint8        info = XLOG_HEAP2_MULTI_INSERT;
             char       *tupledata;
             int            totaldatalen;
-            char       *scratchptr = scratch;
+            char       *scratchptr = scratch.data;
             bool        init;
             int            bufflags = 0;

@@ -2885,7 +2877,7 @@ heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples,
                 scratchptr += datalen;
             }
             totaldatalen = scratchptr - tupledata;
-            Assert((scratchptr - scratch) < BLCKSZ);
+            Assert((scratchptr - scratch.data) < BLCKSZ);

             if (need_tuple_data)
                 xlrec->flags |= XLH_INSERT_CONTAINS_NEW_TUPLE;
@@ -2912,7 +2904,7 @@ heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples,
                 bufflags |= REGBUF_KEEP_DATA;

             XLogBeginInsert();
-            XLogRegisterData((char *) xlrec, tupledata - scratch);
+            XLogRegisterData((char *) xlrec, tupledata - scratch.data);
             XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags);

             XLogRegisterBufData(0, tupledata, totaldatalen);
diff --git a/src/backend/access/transam/generic_xlog.c b/src/backend/access/transam/generic_xlog.c
index ce02354..e421f29 100644
--- a/src/backend/access/transam/generic_xlog.c
+++ b/src/backend/access/transam/generic_xlog.c
@@ -61,14 +61,11 @@ typedef struct
 /* State of generic xlog record construction */
 struct GenericXLogState
 {
-    /*
-     * page's images. Should be first in this struct to have MAXALIGN'ed
-     * images addresses, because some code working with pages directly aligns
-     * addresses, not offsets from beginning of page
-     */
-    char        images[MAX_GENERIC_XLOG_PAGES * BLCKSZ];
+    /* Info about each page, see above */
     PageData    pages[MAX_GENERIC_XLOG_PAGES];
     bool        isLogged;
+    /* Page images (properly aligned) */
+    PGAlignedBuffer images[MAX_GENERIC_XLOG_PAGES];
 };

 static void writeFragment(PageData *pageData, OffsetNumber offset,
@@ -251,12 +248,12 @@ computeDelta(PageData *pageData, Page curpage, Page targetpage)
 #ifdef WAL_DEBUG
     if (XLOG_DEBUG)
     {
-        char        tmp[BLCKSZ];
+        PGAlignedBuffer tmp;

-        memcpy(tmp, curpage, BLCKSZ);
-        applyPageRedo(tmp, pageData->delta, pageData->deltaLen);
-        if (memcmp(tmp, targetpage, targetLower) != 0 ||
-            memcmp(tmp + targetUpper, targetpage + targetUpper,
+        memcpy(tmp.data, curpage, BLCKSZ);
+        applyPageRedo(tmp.data, pageData->delta, pageData->deltaLen);
+        if (memcmp(tmp.data, targetpage, targetLower) != 0 ||
+            memcmp(tmp.data + targetUpper, targetpage + targetUpper,
                    BLCKSZ - targetUpper) != 0)
             elog(ERROR, "result of generic xlog apply does not match");
     }
@@ -277,7 +274,7 @@ GenericXLogStart(Relation relation)

     for (i = 0; i < MAX_GENERIC_XLOG_PAGES; i++)
     {
-        state->pages[i].image = state->images + BLCKSZ * i;
+        state->pages[i].image = state->images[i].data;
         state->pages[i].buffer = InvalidBuffer;
     }

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 65db2e4..29d792f 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -3210,8 +3210,7 @@ XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock)
 {
     char        path[MAXPGPATH];
     char        tmppath[MAXPGPATH];
-    char        zbuffer_raw[XLOG_BLCKSZ + MAXIMUM_ALIGNOF];
-    char       *zbuffer;
+    PGAlignedXLogBuffer zbuffer;
     XLogSegNo    installed_segno;
     XLogSegNo    max_segno;
     int            fd;
@@ -3263,17 +3262,13 @@ XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock)
      * fsync below) that all the indirect blocks are down on disk.  Therefore,
      * fdatasync(2) or O_DSYNC will be sufficient to sync future writes to the
      * log file.
-     *
-     * Note: ensure the buffer is reasonably well-aligned; this may save a few
-     * cycles transferring data to the kernel.
      */
-    zbuffer = (char *) MAXALIGN(zbuffer_raw);
-    memset(zbuffer, 0, XLOG_BLCKSZ);
+    memset(zbuffer.data, 0, XLOG_BLCKSZ);
     for (nbytes = 0; nbytes < wal_segment_size; nbytes += XLOG_BLCKSZ)
     {
         errno = 0;
         pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_WRITE);
-        if ((int) write(fd, zbuffer, XLOG_BLCKSZ) != (int) XLOG_BLCKSZ)
+        if ((int) write(fd, zbuffer.data, XLOG_BLCKSZ) != (int) XLOG_BLCKSZ)
         {
             int            save_errno = errno;

@@ -3380,7 +3375,7 @@ XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno,
 {
     char        path[MAXPGPATH];
     char        tmppath[MAXPGPATH];
-    char        buffer[XLOG_BLCKSZ];
+    PGAlignedXLogBuffer buffer;
     int            srcfd;
     int            fd;
     int            nbytes;
@@ -3423,7 +3418,7 @@ XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno,
          * zeros.
          */
         if (nread < sizeof(buffer))
-            memset(buffer, 0, sizeof(buffer));
+            memset(buffer.data, 0, sizeof(buffer));

         if (nread > 0)
         {
@@ -3432,7 +3427,7 @@ XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno,
             if (nread > sizeof(buffer))
                 nread = sizeof(buffer);
             pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_READ);
-            r = read(srcfd, buffer, nread);
+            r = read(srcfd, buffer.data, nread);
             if (r != nread)
             {
                 if (r < 0)
@@ -3450,7 +3445,7 @@ XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno,
         }
         errno = 0;
         pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_WRITE);
-        if ((int) write(fd, buffer, sizeof(buffer)) != (int) sizeof(buffer))
+        if ((int) write(fd, buffer.data, sizeof(buffer)) != (int) sizeof(buffer))
         {
             int            save_errno = errno;

diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c
index 5bea073..c819204 100644
--- a/src/backend/access/transam/xloginsert.c
+++ b/src/backend/access/transam/xloginsert.c
@@ -809,12 +809,12 @@ XLogCompressBackupBlock(char *page, uint16 hole_offset, uint16 hole_length,
     int32        len;
     int32        extra_bytes = 0;
     char       *source;
-    char        tmp[BLCKSZ];
+    PGAlignedBuffer tmp;

     if (hole_length != 0)
     {
         /* must skip the hole */
-        source = tmp;
+        source = tmp.data;
         memcpy(source, page, hole_offset);
         memcpy(source + hole_offset,
                page + (hole_offset + hole_length),
@@ -917,7 +917,7 @@ XLogSaveBufferForHint(Buffer buffer, bool buffer_std)
     if (lsn <= RedoRecPtr)
     {
         int            flags;
-        char        copied_buffer[BLCKSZ];
+        PGAlignedBuffer copied_buffer;
         char       *origdata = (char *) BufferGetBlock(buffer);
         RelFileNode rnode;
         ForkNumber    forkno;
@@ -935,11 +935,11 @@ XLogSaveBufferForHint(Buffer buffer, bool buffer_std)
             uint16        lower = ((PageHeader) page)->pd_lower;
             uint16        upper = ((PageHeader) page)->pd_upper;

-            memcpy(copied_buffer, origdata, lower);
-            memcpy(copied_buffer + upper, origdata + upper, BLCKSZ - upper);
+            memcpy(copied_buffer.data, origdata, lower);
+            memcpy(copied_buffer.data + upper, origdata + upper, BLCKSZ - upper);
         }
         else
-            memcpy(copied_buffer, origdata, BLCKSZ);
+            memcpy(copied_buffer.data, origdata, BLCKSZ);

         XLogBeginInsert();

@@ -948,7 +948,7 @@ XLogSaveBufferForHint(Buffer buffer, bool buffer_std)
             flags |= REGBUF_STANDARD;

         BufferGetTag(buffer, &rnode, &forkno, &blkno);
-        XLogRegisterBlock(0, &rnode, forkno, blkno, copied_buffer, flags);
+        XLogRegisterBlock(0, &rnode, forkno, blkno, copied_buffer.data, flags);

         recptr = XLogInsert(RM_XLOG_ID, XLOG_FPI_FOR_HINT);
     }
diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c
index 4c633c6..d5daafd 100644
--- a/src/backend/access/transam/xlogreader.c
+++ b/src/backend/access/transam/xlogreader.c
@@ -1412,7 +1412,7 @@ RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page)
 {
     DecodedBkpBlock *bkpb;
     char       *ptr;
-    char        tmp[BLCKSZ];
+    PGAlignedBuffer tmp;

     if (!record->blocks[block_id].in_use)
         return false;
@@ -1425,7 +1425,7 @@ RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page)
     if (bkpb->bimg_info & BKPIMAGE_IS_COMPRESSED)
     {
         /* If a backup block image is compressed, decompress it */
-        if (pglz_decompress(ptr, bkpb->bimg_len, tmp,
+        if (pglz_decompress(ptr, bkpb->bimg_len, tmp.data,
                             BLCKSZ - bkpb->hole_length) < 0)
         {
             report_invalid_record(record, "invalid compressed image at %X/%X, block %d",
@@ -1434,7 +1434,7 @@ RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page)
                                   block_id);
             return false;
         }
-        ptr = tmp;
+        ptr = tmp.data;
     }

     /* generate page, taking into account hole if necessary */
diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c
index 48743db..4d60787 100644
--- a/src/backend/commands/tablecmds.c
+++ b/src/backend/commands/tablecmds.c
@@ -11276,21 +11276,14 @@ static void
 copy_relation_data(SMgrRelation src, SMgrRelation dst,
                    ForkNumber forkNum, char relpersistence)
 {
-    char       *buf;
+    PGAlignedBuffer buf;
     Page        page;
     bool        use_wal;
     bool        copying_initfork;
     BlockNumber nblocks;
     BlockNumber blkno;

-    /*
-     * palloc the buffer so that it's MAXALIGN'd.  If it were just a local
-     * char[] array, the compiler might align it on any byte boundary, which
-     * can seriously hurt transfer speed to and from the kernel; not to
-     * mention possibly making log_newpage's accesses to the page header fail.
-     */
-    buf = (char *) palloc(BLCKSZ);
-    page = (Page) buf;
+    page = (Page) buf.data;

     /*
      * The init fork for an unlogged relation in many respects has to be
@@ -11314,7 +11307,7 @@ copy_relation_data(SMgrRelation src, SMgrRelation dst,
         /* If we got a cancel signal during the copy of the data, quit */
         CHECK_FOR_INTERRUPTS();

-        smgrread(src, forkNum, blkno, buf);
+        smgrread(src, forkNum, blkno, buf.data);

         if (!PageIsVerified(page, blkno))
             ereport(ERROR,
@@ -11340,11 +11333,9 @@ copy_relation_data(SMgrRelation src, SMgrRelation dst,
          * rel, because there's no need for smgr to schedule an fsync for this
          * write; we'll do it ourselves below.
          */
-        smgrextend(dst, forkNum, blkno, buf, true);
+        smgrextend(dst, forkNum, blkno, buf.data, true);
     }

-    pfree(buf);
-
     /*
      * If the rel is WAL-logged, must fsync before commit.  We use heap_sync
      * to ensure that the toast table gets fsync'd too.  (For a temp or
diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c
index c83ff3b..8005f11 100644
--- a/src/backend/replication/walsender.c
+++ b/src/backend/replication/walsender.c
@@ -496,11 +496,11 @@ SendTimeLineHistory(TimeLineHistoryCmd *cmd)
     bytesleft = histfilelen;
     while (bytesleft > 0)
     {
-        char        rbuf[BLCKSZ];
+        PGAlignedBuffer rbuf;
         int            nread;

         pgstat_report_wait_start(WAIT_EVENT_WALSENDER_TIMELINE_HISTORY_READ);
-        nread = read(fd, rbuf, sizeof(rbuf));
+        nread = read(fd, rbuf.data, sizeof(rbuf));
         pgstat_report_wait_end();
         if (nread < 0)
             ereport(ERROR,
@@ -513,7 +513,7 @@ SendTimeLineHistory(TimeLineHistoryCmd *cmd)
                      errmsg("could not read file \"%s\": read %d of %zu",
                             path, nread, (Size) bytesleft)));

-        pq_sendbytes(&buf, rbuf, nread);
+        pq_sendbytes(&buf, rbuf.data, nread);
         bytesleft -= nread;
     }
     CloseTransientFile(fd);
diff --git a/src/backend/storage/file/buffile.c b/src/backend/storage/file/buffile.c
index efbede7..4a52284 100644
--- a/src/backend/storage/file/buffile.c
+++ b/src/backend/storage/file/buffile.c
@@ -96,7 +96,7 @@ struct BufFile
     off_t        curOffset;        /* offset part of current pos */
     int            pos;            /* next read/write position in buffer */
     int            nbytes;            /* total # of valid bytes in buffer */
-    char        buffer[BLCKSZ];
+    PGAlignedBuffer buffer;
 };

 static BufFile *makeBufFileCommon(int nfiles);
@@ -437,7 +437,7 @@ BufFileLoadBuffer(BufFile *file)
      * Read whatever we can get, up to a full bufferload.
      */
     file->nbytes = FileRead(thisfile,
-                            file->buffer,
+                            file->buffer.data,
                             sizeof(file->buffer),
                             WAIT_EVENT_BUFFILE_READ);
     if (file->nbytes < 0)
@@ -502,7 +502,7 @@ BufFileDumpBuffer(BufFile *file)
             file->offsets[file->curFile] = file->curOffset;
         }
         bytestowrite = FileWrite(thisfile,
-                                 file->buffer + wpos,
+                                 file->buffer.data + wpos,
                                  bytestowrite,
                                  WAIT_EVENT_BUFFILE_WRITE);
         if (bytestowrite <= 0)
@@ -572,7 +572,7 @@ BufFileRead(BufFile *file, void *ptr, size_t size)
             nthistime = size;
         Assert(nthistime > 0);

-        memcpy(ptr, file->buffer + file->pos, nthistime);
+        memcpy(ptr, file->buffer.data + file->pos, nthistime);

         file->pos += nthistime;
         ptr = (void *) ((char *) ptr + nthistime);
@@ -621,7 +621,7 @@ BufFileWrite(BufFile *file, void *ptr, size_t size)
             nthistime = size;
         Assert(nthistime > 0);

-        memcpy(file->buffer + file->pos, ptr, nthistime);
+        memcpy(file->buffer.data + file->pos, ptr, nthistime);

         file->dirty = true;
         file->pos += nthistime;
diff --git a/src/backend/utils/sort/logtape.c b/src/backend/utils/sort/logtape.c
index 62d7797..756c1ff 100644
--- a/src/backend/utils/sort/logtape.c
+++ b/src/backend/utils/sort/logtape.c
@@ -240,11 +240,11 @@ ltsWriteBlock(LogicalTapeSet *lts, long blocknum, void *buffer)
      */
     while (blocknum > lts->nBlocksWritten)
     {
-        char        zerobuf[BLCKSZ];
+        PGAlignedBuffer zerobuf;

-        MemSet(zerobuf, 0, sizeof(zerobuf));
+        MemSet(zerobuf.data, 0, sizeof(zerobuf));

-        ltsWriteBlock(lts, lts->nBlocksWritten, zerobuf);
+        ltsWriteBlock(lts, lts->nBlocksWritten, zerobuf.data);
     }

     /* Write the requested block */
diff --git a/src/bin/pg_rewind/copy_fetch.c b/src/bin/pg_rewind/copy_fetch.c
index 160a912..6a3c03a 100644
--- a/src/bin/pg_rewind/copy_fetch.c
+++ b/src/bin/pg_rewind/copy_fetch.c
@@ -156,7 +156,7 @@ recurse_dir(const char *datadir, const char *parentpath,
 static void
 rewind_copy_file_range(const char *path, off_t begin, off_t end, bool trunc)
 {
-    char        buf[BLCKSZ];
+    PGAlignedBuffer buf;
     char        srcpath[MAXPGPATH];
     int            srcfd;

@@ -182,7 +182,7 @@ rewind_copy_file_range(const char *path, off_t begin, off_t end, bool trunc)
         else
             len = end - begin;

-        readlen = read(srcfd, buf, len);
+        readlen = read(srcfd, buf.data, len);

         if (readlen < 0)
             pg_fatal("could not read file \"%s\": %s\n",
@@ -190,7 +190,7 @@ rewind_copy_file_range(const char *path, off_t begin, off_t end, bool trunc)
         else if (readlen == 0)
             pg_fatal("unexpected EOF while reading file \"%s\"\n", srcpath);

-        write_target_range(buf, begin, readlen);
+        write_target_range(buf.data, begin, readlen);
         begin += readlen;
     }

diff --git a/src/bin/pg_verify_checksums/pg_verify_checksums.c b/src/bin/pg_verify_checksums/pg_verify_checksums.c
index bf7feed..922d4dd 100644
--- a/src/bin/pg_verify_checksums/pg_verify_checksums.c
+++ b/src/bin/pg_verify_checksums/pg_verify_checksums.c
@@ -75,8 +75,8 @@ skipfile(const char *fn)
 static void
 scan_file(const char *fn, BlockNumber segmentno)
 {
-    char        buf[BLCKSZ];
-    PageHeader    header = (PageHeader) buf;
+    PGAlignedBuffer buf;
+    PageHeader    header = (PageHeader) buf.data;
     int            f;
     BlockNumber blockno;

@@ -93,7 +93,7 @@ scan_file(const char *fn, BlockNumber segmentno)
     for (blockno = 0;; blockno++)
     {
         uint16        csum;
-        int            r = read(f, buf, BLCKSZ);
+        int            r = read(f, buf.data, BLCKSZ);

         if (r == 0)
             break;
@@ -109,7 +109,7 @@ scan_file(const char *fn, BlockNumber segmentno)
         if (PageIsNew(header))
             continue;

-        csum = pg_checksum_page(buf, blockno + segmentno * RELSEG_SIZE);
+        csum = pg_checksum_page(buf.data, blockno + segmentno * RELSEG_SIZE);
         if (csum != header->pd_checksum)
         {
             if (ControlFile->data_checksum_version == PG_DATA_CHECKSUM_VERSION)
diff --git a/src/bin/pg_waldump/pg_waldump.c b/src/bin/pg_waldump/pg_waldump.c
index 2a557b3..4affbb9 100644
--- a/src/bin/pg_waldump/pg_waldump.c
+++ b/src/bin/pg_waldump/pg_waldump.c
@@ -209,13 +209,13 @@ search_directory(const char *directory, const char *fname)
     /* set WalSegSz if file is successfully opened */
     if (fd >= 0)
     {
-        char        buf[XLOG_BLCKSZ];
+        PGAlignedXLogBuffer buf;
         int            r;

-        r = read(fd, buf, XLOG_BLCKSZ);
+        r = read(fd, buf.data, XLOG_BLCKSZ);
         if (r == XLOG_BLCKSZ)
         {
-            XLogLongPageHeader longhdr = (XLogLongPageHeader) buf;
+            XLogLongPageHeader longhdr = (XLogLongPageHeader) buf.data;

             WalSegSz = longhdr->xlp_seg_size;

diff --git a/src/include/c.h b/src/include/c.h
index 1e50103..193e9ac 100644
--- a/src/include/c.h
+++ b/src/include/c.h
@@ -989,6 +989,32 @@ extern void ExceptionalCondition(const char *conditionName,
  * ----------------------------------------------------------------
  */

+/*
+ * Use this, not "char buf[BLCKSZ]", to declare a field or local variable
+ * holding a page buffer, if that page might be accessed as a page and not
+ * just a string of bytes.  Otherwise the variable might be under-aligned,
+ * causing problems on alignment-picky hardware.  (In some places, we use
+ * this to declare buffers even though we only pass them to read() and
+ * write(), because copying to/from aligned buffers is usually faster than
+ * using unaligned buffers.)  We include both "double" and "int64" in the
+ * union to ensure that the compiler knows the value must be MAXALIGN'ed
+ * (cf. configure's computation of MAXIMUM_ALIGNOF).
+ */
+typedef union PGAlignedBuffer
+{
+    char        data[BLCKSZ];
+    double        force_align_d;
+    int64        force_align_i64;
+} PGAlignedBuffer;
+
+/* Same, but for an XLOG_BLCKSZ-sized buffer */
+typedef union PGAlignedXLogBuffer
+{
+    char        data[XLOG_BLCKSZ];
+    double        force_align_d;
+    int64        force_align_i64;
+} PGAlignedXLogBuffer;
+
 /* msb for char */
 #define HIGHBIT                    (0x80)
 #define IS_HIGHBIT_SET(ch)        ((unsigned char)(ch) & HIGHBIT)

pgsql-hackers by date:

Previous
From: Michael Paquier
Date:
Subject: Re: Bug in slot.c and are replication slots ever used at Window?
Next
From: Tom Lane
Date:
Subject: Re: psql \dC incorrectly shows casts "with inout" as "binary coercible" on 9.5.14 and 11beta3