Re: [HACKERS] WAL logging problem in 9.4.3? - Mailing list pgsql-hackers

From Kyotaro Horiguchi
Subject Re: [HACKERS] WAL logging problem in 9.4.3?
Date
Msg-id 20191121.194858.792948544668017842.horikyota.ntt@gmail.com
Whole thread Raw
In response to Re: [HACKERS] WAL logging problem in 9.4.3?  (Kyotaro Horiguchi <horikyota.ntt@gmail.com>)
List pgsql-hackers
At Thu, 21 Nov 2019 16:01:07 +0900 (JST), Kyotaro Horiguchi <horikyota.ntt@gmail.com> wrote in 
> > For deleting relfilenodes, smgrDoPendingDeletes() collects a list for
> > smgrdounlinkall() to pass to DropRelFileNodesAllBuffers(), which is
> > sophisticated about optimizing the shared buffers scan.  Commit 279628a
> > introduced that, in 2013.  I think smgrDoPendingSyncs() should do likewise, to
> Seems reasonable. Please wait a minite.

This is the first cut of that. This makes the function FlushRelationBuffersWithoutRelcache useless, which was
introducedin this work. The first patch reverts it, then the second patch adds the bulk sync feature.
 

The new function FlushRelFileNodesAllBuffers, differently from
DropRelFileNodesAllBuffers, takes SMgrRelation which is required by
FlushBuffer(). So it takes somewhat tricky way, where type
SMgrSortArray pointer to which is compatible with RelFileNode is used.

regards.

-- 
Kyotaro Horiguchi
NTT Open Source Software Center
From c51b44734d88fb19b568c4c0240848c8be2b7cf4 Mon Sep 17 00:00:00 2001
From: Kyotaro Horiguchi <horikyota.ntt@gmail.com>
Date: Thu, 21 Nov 2019 19:28:35 +0900
Subject: [PATCH 1/2] Revert FlushRelationBuffersWithoutRelcache.

Succeeding patch makes the function useless and the function is no
longer useful globally. Revert it.
---
 src/backend/storage/buffer/bufmgr.c | 27 ++++++++++-----------------
 src/include/storage/bufmgr.h        |  2 --
 2 files changed, 10 insertions(+), 19 deletions(-)

diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 746ce477fc..67bbb26cae 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -3203,27 +3203,20 @@ PrintPinnedBufs(void)
 void
 FlushRelationBuffers(Relation rel)
 {
-    RelationOpenSmgr(rel);
-
-    FlushRelationBuffersWithoutRelcache(rel->rd_smgr,
-                                        RelationUsesLocalBuffers(rel));
-}
-
-void
-FlushRelationBuffersWithoutRelcache(SMgrRelation smgr, bool islocal)
-{
-    RelFileNode rnode = smgr->smgr_rnode.node;
-    int i;
+    int            i;
     BufferDesc *bufHdr;
 
-    if (islocal)
+    /* Open rel at the smgr level if not already done */
+    RelationOpenSmgr(rel);
+
+    if (RelationUsesLocalBuffers(rel))
     {
         for (i = 0; i < NLocBuffer; i++)
         {
             uint32        buf_state;
 
             bufHdr = GetLocalBufferDescriptor(i);
-            if (RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
+            if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
                 ((buf_state = pg_atomic_read_u32(&bufHdr->state)) &
                  (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
             {
@@ -3240,7 +3233,7 @@ FlushRelationBuffersWithoutRelcache(SMgrRelation smgr, bool islocal)
 
                 PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
 
-                smgrwrite(smgr,
+                smgrwrite(rel->rd_smgr,
                           bufHdr->tag.forkNum,
                           bufHdr->tag.blockNum,
                           localpage,
@@ -3270,18 +3263,18 @@ FlushRelationBuffersWithoutRelcache(SMgrRelation smgr, bool islocal)
          * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
          * and saves some cycles.
          */
-        if (!RelFileNodeEquals(bufHdr->tag.rnode, rnode))
+        if (!RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node))
             continue;
 
         ReservePrivateRefCountEntry();
 
         buf_state = LockBufHdr(bufHdr);
-        if (RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
+        if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
             (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
         {
             PinBuffer_Locked(bufHdr);
             LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
-            FlushBuffer(bufHdr, smgr);
+            FlushBuffer(bufHdr, rel->rd_smgr);
             LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
             UnpinBuffer(bufHdr, true);
         }
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 8097d5ab22..8cd1cf25d9 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -192,8 +192,6 @@ extern BlockNumber RelationGetNumberOfBlocksInFork(Relation relation,
                                                    ForkNumber forkNum);
 extern void FlushOneBuffer(Buffer buffer);
 extern void FlushRelationBuffers(Relation rel);
-extern void FlushRelationBuffersWithoutRelcache(struct SMgrRelationData *smgr,
-                                                bool islocal);
 extern void FlushDatabaseBuffers(Oid dbid);
 extern void DropRelFileNodeBuffers(RelFileNodeBackend rnode, ForkNumber *forkNum,
                                    int nforks, BlockNumber *firstDelBlock);
-- 
2.23.0

From 882731fcf063269d0bf85c57f23c83b9570e5df5 Mon Sep 17 00:00:00 2001
From: Kyotaro Horiguchi <horikyota.ntt@gmail.com>
Date: Thu, 21 Nov 2019 19:33:18 +0900
Subject: [PATCH 2/2] Improve the performance of relation syncs.

We can improve performance of syncing multiple files at once in the
same way as b41669118. This reduces the number of scans on the whole
shared_bufffers from the number of synced relations to one.
---
 src/backend/catalog/storage.c       |  28 +++++--
 src/backend/storage/buffer/bufmgr.c | 113 ++++++++++++++++++++++++++++
 src/backend/storage/smgr/smgr.c     |  38 +++++++++-
 src/include/storage/bufmgr.h        |   1 +
 src/include/storage/smgr.h          |   1 +
 5 files changed, 174 insertions(+), 7 deletions(-)

diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c
index 51c233dac6..65811b2a9e 100644
--- a/src/backend/catalog/storage.c
+++ b/src/backend/catalog/storage.c
@@ -533,6 +533,9 @@ smgrDoPendingSyncs(void)
 {
     PendingRelDelete *pending;
     HTAB    *delhash = NULL;
+    int            nrels = 0,
+                maxrels = 0;
+    SMgrRelation *srels = NULL;
 
     if (XLogIsNeeded())
         return;  /* no relation can use this */
@@ -573,7 +576,7 @@ smgrDoPendingSyncs(void)
 
     for (pending = pendingDeletes; pending != NULL; pending = pending->next)
     {
-        bool to_be_removed = false; /* don't sync if aborted */
+        bool to_be_removed = false;
         ForkNumber fork;
         BlockNumber nblocks[MAX_FORKNUM + 1];
         BlockNumber total_blocks = 0;
@@ -623,14 +626,21 @@ smgrDoPendingSyncs(void)
          */
         if (total_blocks * BLCKSZ >= wal_skip_threshold * 1024)
         {
-            /* Flush all buffers then sync the file */
-            FlushRelationBuffersWithoutRelcache(srel, false);
+            /* relations to sync are passed to smgrdosyncall at once */
 
-            for (fork = 0; fork <= MAX_FORKNUM; fork++)
+            /* allocate the initial array, or extend it, if needed */
+            if (maxrels == 0)
             {
-                if (smgrexists(srel, fork))
-                    smgrimmedsync(srel, fork);
+                maxrels = 8;
+                srels = palloc(sizeof(SMgrRelation) * maxrels);
             }
+            else if (maxrels <= nrels)
+            {
+                maxrels *= 2;
+                srels = repalloc(srels, sizeof(SMgrRelation) * maxrels);
+            }
+
+            srels[nrels++] = srel;
         }
         else
         {
@@ -658,6 +668,12 @@ smgrDoPendingSyncs(void)
 
     if (delhash)
         hash_destroy(delhash);
+
+    if (nrels > 0)
+    {
+        smgrdosyncall(srels, nrels);
+        pfree(srels);
+    }
 }
 
 /*
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 67bbb26cae..56314653ae 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -105,6 +105,19 @@ typedef struct CkptTsStatus
     int            index;
 } CkptTsStatus;
 
+/*
+ * Type for array used to sort SMgrRelations
+ *
+ * FlushRelFileNodesAllBuffers shares the same comparator function with
+ * DropRelFileNodeBuffers. Pointer to this struct and RelFileNode must
+ * be compatible.
+ */
+typedef struct SMgrSortArray
+{
+    RelFileNode        rnode;    /* This must be the first member */
+    SMgrRelation    srel;
+} SMgrSortArray;
+
 /* GUC variables */
 bool        zero_damaged_pages = false;
 int            bgwriter_lru_maxpages = 100;
@@ -3283,6 +3296,106 @@ FlushRelationBuffers(Relation rel)
     }
 }
 
+/* ---------------------------------------------------------------------
+ *        FlushRelFileNodesAllBuffers
+ *
+ *        This function flushes out the buffer pool all the pages of all
+ *      forks of the specified smgr relations.  It's equivalent to
+ *      calling FlushRelationBuffers once per fork per relation, but the
+ *      parameter is not Relation but SMgrRelation
+ * --------------------------------------------------------------------
+ */
+void
+FlushRelFileNodesAllBuffers(SMgrRelation *smgrs, int nrels)
+{
+    int            i;
+    SMgrSortArray *srels;
+    bool        use_bsearch;
+
+    if (nrels == 0)
+        return;
+
+    /* fill-in array for qsort */
+    srels = palloc(sizeof(SMgrSortArray) * nrels);
+
+    for (i = 0 ; i < nrels ; i++)
+    {
+        Assert (!RelFileNodeBackendIsTemp(smgrs[i]->smgr_rnode));
+
+        srels[i].rnode = smgrs[i]->smgr_rnode.node;
+        srels[i].srel  = smgrs[i];
+    }
+
+    /*
+     * Save the bsearch overhead for low number of relations to
+     * sync. See DropRelFileNodesAllBuffers for details. The name DROP_*
+     * is for historical reasons.
+     */
+    use_bsearch = nrels > DROP_RELS_BSEARCH_THRESHOLD;
+
+    /* sort the list of SMgrRelations if necessary */
+    if (use_bsearch)
+        pg_qsort(srels, nrels, sizeof(SMgrSortArray), rnode_comparator);
+
+    /* Make sure we can handle the pin inside the loop */
+    ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
+
+    for (i = 0; i < NBuffers; i++)
+    {
+        SMgrSortArray *srelent = NULL;
+        BufferDesc *bufHdr = GetBufferDescriptor(i);
+        uint32        buf_state;
+
+        /*
+         * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
+         * and saves some cycles.
+         */
+
+        if (!use_bsearch)
+        {
+            int            j;
+
+            for (j = 0; j < nrels; j++)
+            {
+                if (RelFileNodeEquals(bufHdr->tag.rnode, srels[j].rnode))
+                {
+                    srelent = &srels[j];
+                    break;
+                }
+            }
+
+        }
+        else
+        {
+            srelent = bsearch((const void *) &(bufHdr->tag.rnode),
+                              srels, nrels, sizeof(SMgrSortArray),
+                              rnode_comparator);
+        }
+
+        /* buffer doesn't belong to any of the given relfilenodes; skip it */
+        if (srelent == NULL)
+            continue;
+
+        /* Ensure there's a free array slot for PinBuffer_Locked */
+        ReservePrivateRefCountEntry();
+
+        buf_state = LockBufHdr(bufHdr);
+        if (RelFileNodeEquals(bufHdr->tag.rnode, srelent->rnode) &&
+            (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
+        {
+            PinBuffer_Locked(bufHdr);
+            LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
+            FlushBuffer(bufHdr, srelent->srel);
+            LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
+            UnpinBuffer(bufHdr, true);
+        }
+        else
+            UnlockBufHdr(bufHdr, buf_state);
+    }
+
+    pfree(srels);
+}
+
 /* ---------------------------------------------------------------------
  *        FlushDatabaseBuffers
  *
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index b50c69b438..f79f2df40f 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -388,6 +388,43 @@ smgrdounlink(SMgrRelation reln, bool isRedo)
     smgrsw[which].smgr_unlink(rnode, InvalidForkNumber, isRedo);
 }
 
+/*
+ *    smgrdosyncall() -- Immediately sync all forks of all given relations
+ *
+ *        All forks of all given relations are syncd out to the store.
+ *
+ *        This is equivalent to flusing all buffers FlushRelationBuffers for each
+ *        smgr relation then calling smgrimmedsync for all forks of each smgr
+ *        relation, but it's significantly quicker so should be preferred when
+ *        possible.
+ */
+void
+smgrdosyncall(SMgrRelation *rels, int nrels)
+{
+    int            i = 0;
+    ForkNumber    forknum;
+
+    if (nrels == 0)
+        return;
+
+    /* We need to flush all buffers for the relations before sync. */
+    FlushRelFileNodesAllBuffers(rels, nrels);
+
+    /*
+     * Sync the physical file(s).
+     */
+    for (i = 0; i < nrels; i++)
+    {
+        int            which = rels[i]->smgr_which;
+
+        for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+        {
+            if (smgrsw[which].smgr_exists(rels[i], forknum))
+                smgrsw[which].smgr_immedsync(rels[i], forknum);
+        }
+    }
+}
+
 /*
  *    smgrdounlinkall() -- Immediately unlink all forks of all given relations
  *
@@ -469,7 +506,6 @@ smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo)
     pfree(rnodes);
 }
 
-
 /*
  *    smgrextend() -- Add a new block to a file.
  *
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 8cd1cf25d9..3f85e8c6fe 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -195,6 +195,7 @@ extern void FlushRelationBuffers(Relation rel);
 extern void FlushDatabaseBuffers(Oid dbid);
 extern void DropRelFileNodeBuffers(RelFileNodeBackend rnode, ForkNumber *forkNum,
                                    int nforks, BlockNumber *firstDelBlock);
+extern void FlushRelFileNodesAllBuffers(struct SMgrRelationData **smgrs, int nrels);
 extern void DropRelFileNodesAllBuffers(RelFileNodeBackend *rnodes, int nnodes);
 extern void DropDatabaseBuffers(Oid dbid);
 
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index 1543d8d870..31a5ecd059 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -89,6 +89,7 @@ extern void smgrcloseall(void);
 extern void smgrclosenode(RelFileNodeBackend rnode);
 extern void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo);
 extern void smgrdounlink(SMgrRelation reln, bool isRedo);
+extern void smgrdosyncall(SMgrRelation *rels, int nrels);
 extern void smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo);
 extern void smgrextend(SMgrRelation reln, ForkNumber forknum,
                        BlockNumber blocknum, char *buffer, bool skipFsync);
-- 
2.23.0


pgsql-hackers by date:

Previous
From: Peter Eisentraut
Date:
Subject: Re: [proposal] recovery_target "latest"
Next
From: Fujii Masao
Date:
Subject: Re: Allow CREATE OR REPLACE VIEW to rename the columns