In-placre persistance change of a relation - Mailing list pgsql-hackers

From Kyotaro Horiguchi
Subject In-placre persistance change of a relation
Date
Msg-id 20201111.173317.460890039962481381.horikyota.ntt@gmail.com
Whole thread Raw
Responses Re: In-placre persistance change of a relation
Re: In-placre persistance change of a relation
List pgsql-hackers
Hello. This is a thread for an alternative solution to wal_level=none
[*1] for bulk data loading.

*1:
https://www.postgresql.org/message-id/TYAPR01MB29901EBE5A3ACCE55BA99186FE320%40TYAPR01MB2990.jpnprd01.prod.outlook.com

At Tue, 10 Nov 2020 09:33:12 -0500, Stephen Frost <sfrost@snowman.net> wrote in 
> Greetings,
> 
> * Kyotaro Horiguchi (horikyota.ntt@gmail.com) wrote:
> > For fuel(?) of the discussion, I tried a very-quick PoC for in-place
> > ALTER TABLE SET LOGGED/UNLOGGED and resulted as attached. After some
> > trials of several ways, I drifted to the following way after poking
> > several ways.
> > 
> > 1. Flip BM_PERMANENT of active buffers
> > 2. adding/removing init fork
> > 3. sync files,
> > 4. Flip pg_class.relpersistence.
> > 
> > It always skips table copy in the SET UNLOGGED case, and only when
> > wal_level=minimal in the SET LOGGED case.  Crash recovery seems
> > working by some brief testing by hand.
> 
> Somehow missed that this patch more-or-less does what I was referring to
> down-thread, but I did want to mention that it looks like it's missing a
> necessary FlushRelationBuffers() call before the sync, otherwise there
> could be dirty buffers for the relation that's being set to LOGGED (with
> wal_level=minimal), which wouldn't be good.  See the comments above
> smgrimmedsync().

Right. Thanks.  However, since SetRelFileNodeBuffersPersistence()
called just above scans shared buffers so I don't want to just call
FlushRelationBuffers() separately.  Instead, I added buffer-flush to
SetRelFileNodeBuffersPersistence().

FWIW this is a revised version of the PoC, which has some known
problems.

- Flipping of Buffer persistence is not WAL-logged nor even be able to
  be safely roll-backed. (It might be better to drop buffers).

- This version handles indexes but not yet handle toast relatins.

- tableAMs are supposed to support this feature. (but I'm not sure
  it's worth allowing them not to do so).

> > Of course, I haven't performed intensive test on it.
> 
> Reading through the thread, it didn't seem very clear, but we should
> definitely make sure that it does the right thing on replicas when going
> between unlogged and logged (and between logged and unlogged too), of
> course.

regards.

-- 
Kyotaro Horiguchi
NTT Open Source Software Center
diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index dcaea7135f..0c6ce70484 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -613,6 +613,27 @@ heapam_relation_set_new_filenode(Relation rel,
     smgrclose(srel);
 }
 
+static void
+heapam_relation_set_persistence(Relation rel, char persistence)
+{
+    Assert(rel->rd_rel->relpersistence == RELPERSISTENCE_PERMANENT ||
+           rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED);
+
+    Assert (rel->rd_rel->relpersistence != persistence);
+
+    if (persistence == RELPERSISTENCE_UNLOGGED)
+    {
+        Assert(rel->rd_rel->relkind == RELKIND_RELATION ||
+               rel->rd_rel->relkind == RELKIND_MATVIEW ||
+               rel->rd_rel->relkind == RELKIND_TOASTVALUE);
+
+        RelationCreateInitFork(rel->rd_node, false);
+    }
+    else
+        RelationDropInitFork(rel->rd_node);
+}
+
+
 static void
 heapam_relation_nontransactional_truncate(Relation rel)
 {
@@ -2540,6 +2561,7 @@ static const TableAmRoutine heapam_methods = {
     .compute_xid_horizon_for_tuples = heap_compute_xid_horizon_for_tuples,
 
     .relation_set_new_filenode = heapam_relation_set_new_filenode,
+    .relation_set_persistence = heapam_relation_set_persistence,
     .relation_nontransactional_truncate = heapam_relation_nontransactional_truncate,
     .relation_copy_data = heapam_relation_copy_data,
     .relation_copy_for_cluster = heapam_relation_copy_for_cluster,
diff --git a/src/backend/access/rmgrdesc/smgrdesc.c b/src/backend/access/rmgrdesc/smgrdesc.c
index a7c0cb1bc3..8397002613 100644
--- a/src/backend/access/rmgrdesc/smgrdesc.c
+++ b/src/backend/access/rmgrdesc/smgrdesc.c
@@ -40,6 +40,14 @@ smgr_desc(StringInfo buf, XLogReaderState *record)
                          xlrec->blkno, xlrec->flags);
         pfree(path);
     }
+    else if (info == XLOG_SMGR_UNLINK)
+    {
+        xl_smgr_unlink *xlrec = (xl_smgr_unlink *) rec;
+        char       *path = relpathperm(xlrec->rnode, xlrec->forkNum);
+
+        appendStringInfoString(buf, path);
+        pfree(path);
+    }
 }
 
 const char *
@@ -55,6 +63,9 @@ smgr_identify(uint8 info)
         case XLOG_SMGR_TRUNCATE:
             id = "TRUNCATE";
             break;
+        case XLOG_SMGR_UNLINK:
+            id = "UNLINK";
+            break;
     }
 
     return id;
diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c
index d538f25726..ac5aea3d38 100644
--- a/src/backend/catalog/storage.c
+++ b/src/backend/catalog/storage.c
@@ -60,6 +60,8 @@ int            wal_skip_threshold = 2048;    /* in kilobytes */
 typedef struct PendingRelDelete
 {
     RelFileNode relnode;        /* relation that may need to be deleted */
+    bool        deleteinitfork;    /* delete only init fork if true */
+    bool        createinitfork;    /* create init fork if true */
     BackendId    backend;        /* InvalidBackendId if not a temp rel */
     bool        atCommit;        /* T=delete at commit; F=delete at abort */
     int            nestLevel;        /* xact nesting level of request */
@@ -153,6 +155,8 @@ RelationCreateStorage(RelFileNode rnode, char relpersistence)
     pending = (PendingRelDelete *)
         MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete));
     pending->relnode = rnode;
+    pending->deleteinitfork = false;
+    pending->createinitfork = false;
     pending->backend = backend;
     pending->atCommit = false;    /* delete if abort */
     pending->nestLevel = GetCurrentTransactionNestLevel();
@@ -168,6 +172,95 @@ RelationCreateStorage(RelFileNode rnode, char relpersistence)
     return srel;
 }
 
+/*
+ * RelationCreateInitFork
+ *        Create physical storage for a relation.
+ *
+ * Create the underlying disk file storage for the relation. This only
+ * creates the main fork; additional forks are created lazily by the
+ * modules that need them.
+ *
+ * This function is transactional. The creation is WAL-logged, and if the
+ * transaction aborts later on, the storage will be destroyed.
+ */
+void
+RelationCreateInitFork(RelFileNode rnode, bool isRedo)
+{
+    PendingRelDelete *pending;
+    SMgrRelation srel;
+    PendingRelDelete *prev;
+    PendingRelDelete *next;
+
+    prev = NULL;
+    for (pending = pendingDeletes; pending != NULL; pending = next)
+    {
+        next = pending->next;
+        if (RelFileNodeEquals(rnode, pending->relnode) &&
+            pending->deleteinitfork && pending->atCommit)
+        {
+            /* unlink and delete list entry */
+            if (prev)
+                prev->next = next;
+            else
+                pendingDeletes = next;
+            pfree(pending);
+            return;
+        }
+        else
+        {
+            /* unrelated entry, don't touch it */
+            prev = pending;
+        }
+    }    
+    srel = smgropen(rnode, InvalidBackendId);
+    smgrcreate(srel, INIT_FORKNUM, isRedo);
+    if (!isRedo)
+        log_smgrcreate(&rnode, INIT_FORKNUM);
+    smgrimmedsync(srel, INIT_FORKNUM);
+
+    /* Add the relation to the list of stuff to delete at abort */
+    pending = (PendingRelDelete *)
+        MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete));
+    pending->relnode = rnode;
+    pending->deleteinitfork = true;
+    pending->createinitfork = false;
+    pending->backend = InvalidBackendId;
+    pending->atCommit = false;    /* delete if abort */
+    pending->nestLevel = GetCurrentTransactionNestLevel();
+    pending->next = pendingDeletes;
+    pendingDeletes = pending;
+}
+
+void
+RelationDropInitFork(RelFileNode rnode)
+{
+    PendingRelDelete *pending;
+    PendingRelDelete *next;
+
+    for (pending = pendingDeletes; pending != NULL; pending = next)
+    {
+        next = pending->next;
+        if (RelFileNodeEquals(rnode, pending->relnode) &&
+            pending->deleteinitfork && pending->atCommit)
+        {
+            /* We're done. */
+            return;
+        }
+    }    
+
+    /* Add the relation to the list of stuff to delete at abort */
+    pending = (PendingRelDelete *)
+        MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete));
+    pending->relnode = rnode;
+    pending->deleteinitfork = true;
+    pending->createinitfork = false;
+    pending->backend = InvalidBackendId;
+    pending->atCommit = true;    /* create if abort */
+    pending->nestLevel = GetCurrentTransactionNestLevel();
+    pending->next = pendingDeletes;
+    pendingDeletes = pending;
+}
+
 /*
  * Perform XLogInsert of an XLOG_SMGR_CREATE record to WAL.
  */
@@ -187,6 +280,25 @@ log_smgrcreate(const RelFileNode *rnode, ForkNumber forkNum)
     XLogInsert(RM_SMGR_ID, XLOG_SMGR_CREATE | XLR_SPECIAL_REL_UPDATE);
 }
 
+/*
+ * Perform XLogInsert of an XLOG_SMGR_UNLINK record to WAL.
+ */
+void
+log_smgrunlink(const RelFileNode *rnode, ForkNumber forkNum)
+{
+    xl_smgr_unlink xlrec;
+
+    /*
+     * Make an XLOG entry reporting the file unlink.
+     */
+    xlrec.rnode = *rnode;
+    xlrec.forkNum = forkNum;
+
+    XLogBeginInsert();
+    XLogRegisterData((char *) &xlrec, sizeof(xlrec));
+    XLogInsert(RM_SMGR_ID, XLOG_SMGR_UNLINK | XLR_SPECIAL_REL_UPDATE);
+}
+
 /*
  * RelationDropStorage
  *        Schedule unlinking of physical storage at transaction commit.
@@ -200,6 +312,8 @@ RelationDropStorage(Relation rel)
     pending = (PendingRelDelete *)
         MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete));
     pending->relnode = rel->rd_node;
+    pending->createinitfork = false;
+    pending->deleteinitfork = false;
     pending->backend = rel->rd_backend;
     pending->atCommit = true;    /* delete if commit */
     pending->nestLevel = GetCurrentTransactionNestLevel();
@@ -626,19 +740,27 @@ smgrDoPendingDeletes(bool isCommit)
 
                 srel = smgropen(pending->relnode, pending->backend);
 
-                /* allocate the initial array, or extend it, if needed */
-                if (maxrels == 0)
+                if (pending->deleteinitfork)
                 {
-                    maxrels = 8;
-                    srels = palloc(sizeof(SMgrRelation) * maxrels);
+                    log_smgrunlink(&pending->relnode, INIT_FORKNUM);
+                    smgrunlink(srel, INIT_FORKNUM, false);
                 }
-                else if (maxrels <= nrels)
+                else
                 {
-                    maxrels *= 2;
-                    srels = repalloc(srels, sizeof(SMgrRelation) * maxrels);
-                }
+                    /* allocate the initial array, or extend it, if needed */
+                    if (maxrels == 0)
+                    {
+                        maxrels = 8;
+                        srels = palloc(sizeof(SMgrRelation) * maxrels);
+                    }
+                    else if (maxrels <= nrels)
+                    {
+                        maxrels *= 2;
+                        srels = repalloc(srels, sizeof(SMgrRelation) * maxrels);
+                    }
 
-                srels[nrels++] = srel;
+                    srels[nrels++] = srel;
+                }
             }
             /* must explicitly free the list entry */
             pfree(pending);
@@ -917,6 +1039,14 @@ smgr_redo(XLogReaderState *record)
         reln = smgropen(xlrec->rnode, InvalidBackendId);
         smgrcreate(reln, xlrec->forkNum, true);
     }
+    else if (info == XLOG_SMGR_UNLINK)
+    {
+        xl_smgr_unlink *xlrec = (xl_smgr_unlink *) XLogRecGetData(record);
+        SMgrRelation reln;
+
+        reln = smgropen(xlrec->rnode, InvalidBackendId);
+        smgrunlink(reln, xlrec->forkNum, true);
+    }
     else if (info == XLOG_SMGR_TRUNCATE)
     {
         xl_smgr_truncate *xlrec = (xl_smgr_truncate *) XLogRecGetData(record);
diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c
index e3cfaf8b07..e358174b01 100644
--- a/src/backend/commands/tablecmds.c
+++ b/src/backend/commands/tablecmds.c
@@ -4918,6 +4918,137 @@ ATParseTransformCmd(List **wqueue, AlteredTableInfo *tab, Relation rel,
     return newcmd;
 }
 
+static bool
+try_inplace_persistence_change(AlteredTableInfo *tab, char persistence,
+                               LOCKMODE lockmode)
+{
+    Relation     rel;
+    Relation    classRel;
+    HeapTuple    tuple,
+                newtuple;
+    Datum        new_val[Natts_pg_class];
+    bool        new_null[Natts_pg_class],
+                new_repl[Natts_pg_class];
+    int            i;
+    List       *relids;
+    ListCell   *lc_oid;
+
+    Assert(tab->rewrite == AT_REWRITE_ALTER_PERSISTENCE);
+    Assert(lockmode == AccessExclusiveLock);
+
+    /*
+     * Under the following condition, we need to call ATRewriteTable, which
+     * cannot be false in the AT_REWRITE_ALTER_PERSISTENCE case.
+     */
+    Assert(tab->constraints == NULL && tab->partition_constraint == NULL &&
+           tab->newvals == NULL && !tab->verify_new_notnull);
+
+    /*
+     * When wal_level is replica or higher we need that the initial state of
+     * the relation be recoverable from WAL.  When wal_level >= replica
+     * switching to PERMANENT needs to emit the WAL records to reconstruct the
+     * current data.  This could be done by writing XLOG_FPI for all pages but
+     * it is not obvious that that is performant than normal rewriting.
+     * Otherwise what we need for the relation data is just establishing
+     * initial state on storage and no need of WAL to reconstruct it.
+     */
+    if (tab->newrelpersistence == RELPERSISTENCE_PERMANENT && XLogIsNeeded())
+        return false;
+
+    rel = table_open(tab->relid, lockmode);
+
+    Assert(rel->rd_rel->relpersistence != persistence);
+
+    elog(DEBUG1, "perform im-place persistnce change");
+
+    RelationOpenSmgr(rel);
+
+    /* Change persistence then flush-out buffers of the relation */
+
+    /* Get the list of index OIDs for this relation */
+    relids = RelationGetIndexList(rel);
+    relids = lcons_oid(rel->rd_id, relids);
+
+    table_close(rel, lockmode);
+
+    /* Done change on storage. Update catalog including indexes. */
+    /* add the heap oid to the relation ID list */
+
+    classRel = table_open(RelationRelationId, RowExclusiveLock);
+
+    foreach (lc_oid, relids)
+    {
+        Oid reloid = lfirst_oid(lc_oid);
+        Relation r = relation_open(reloid, lockmode);
+
+        RelationOpenSmgr(r);
+
+        if (persistence == RELPERSISTENCE_UNLOGGED)
+        {
+            RelationCreateInitFork(r->rd_node, false);
+
+            if (r->rd_rel->relkind == RELKIND_INDEX ||
+                r->rd_rel->relkind == RELKIND_PARTITIONED_INDEX)
+                r->rd_indam->ambuildempty(r);
+            else
+            {
+                Assert(r->rd_rel->relkind == RELKIND_RELATION ||
+                       r->rd_rel->relkind == RELKIND_MATVIEW ||
+                       r->rd_rel->relkind == RELKIND_TOASTVALUE);
+            }
+        }
+        else
+            RelationDropInitFork(r->rd_node);
+
+        table_close(r, NoLock);
+
+        /*
+         * This relation is now WAL-logged. Sync all files immediately to
+         * establish the initial state on storgae.
+         */
+        if (persistence == RELPERSISTENCE_PERMANENT)
+        {
+            for (i = 0 ; i < MAX_FORKNUM ; i++)
+            {
+                if (smgrexists(r->rd_smgr, i))
+                    smgrimmedsync(r->rd_smgr, i);
+            }
+        }
+
+
+        tuple = SearchSysCacheCopy1(RELOID,    ObjectIdGetDatum(reloid));
+        if (!HeapTupleIsValid(tuple))
+            elog(ERROR, "cache lookup failed for relation %u", reloid);
+
+        memset(new_val, 0, sizeof(new_val));
+        memset(new_null, false, sizeof(new_null));
+        memset(new_repl, false, sizeof(new_repl));
+
+        new_val[Anum_pg_class_relpersistence - 1] = CharGetDatum(persistence);
+        new_null[Anum_pg_class_relpersistence - 1] = false;
+        new_repl[Anum_pg_class_relpersistence - 1] = true;
+
+        newtuple = heap_modify_tuple(tuple, RelationGetDescr(classRel),
+                                     new_val, new_null, new_repl);
+
+        CatalogTupleUpdate(classRel, &newtuple->t_self, newtuple);
+        heap_freetuple(newtuple);
+    }
+
+    foreach (lc_oid, relids)
+    {
+        Oid reloid = lfirst_oid(lc_oid);
+        Relation r = relation_open(reloid, lockmode);
+
+        RelationOpenSmgr(r);
+        SetRelationBuffersPersistence(r, persistence == RELPERSISTENCE_PERMANENT);
+        table_close(r, NoLock);
+    }
+    table_close(classRel, RowExclusiveLock);
+
+    return true;
+}
+
 /*
  * ATRewriteTables: ALTER TABLE phase 3
  */
@@ -5038,45 +5169,51 @@ ATRewriteTables(AlterTableStmt *parsetree, List **wqueue, LOCKMODE lockmode,
                                          tab->relid,
                                          tab->rewrite);
 
-            /*
-             * Create transient table that will receive the modified data.
-             *
-             * Ensure it is marked correctly as logged or unlogged.  We have
-             * to do this here so that buffers for the new relfilenode will
-             * have the right persistence set, and at the same time ensure
-             * that the original filenode's buffers will get read in with the
-             * correct setting (i.e. the original one).  Otherwise a rollback
-             * after the rewrite would possibly result with buffers for the
-             * original filenode having the wrong persistence setting.
-             *
-             * NB: This relies on swap_relation_files() also swapping the
-             * persistence. That wouldn't work for pg_class, but that can't be
-             * unlogged anyway.
-             */
-            OIDNewHeap = make_new_heap(tab->relid, NewTableSpace, persistence,
-                                       lockmode);
+            if (tab->rewrite != AT_REWRITE_ALTER_PERSISTENCE ||
+                !try_inplace_persistence_change(tab, persistence, lockmode))
+            {
+                /*
+                 * Create transient table that will receive the modified data.
+                 *
+                 * Ensure it is marked correctly as logged or unlogged.  We
+                 * have to do this here so that buffers for the new relfilenode
+                 * will have the right persistence set, and at the same time
+                 * ensure that the original filenode's buffers will get read in
+                 * with the correct setting (i.e. the original one).  Otherwise
+                 * a rollback after the rewrite would possibly result with
+                 * buffers for the original filenode having the wrong
+                 * persistence setting.
+                 *
+                 * NB: This relies on swap_relation_files() also swapping the
+                 * persistence. That wouldn't work for pg_class, but that can't
+                 * be unlogged anyway.
+                 */
+                OIDNewHeap = make_new_heap(tab->relid, NewTableSpace, persistence,
+                                           lockmode);
 
-            /*
-             * Copy the heap data into the new table with the desired
-             * modifications, and test the current data within the table
-             * against new constraints generated by ALTER TABLE commands.
-             */
-            ATRewriteTable(tab, OIDNewHeap, lockmode);
+                /*
+                 * Copy the heap data into the new table with the desired
+                 * modifications, and test the current data within the table
+                 * against new constraints generated by ALTER TABLE commands.
+                 */
+                ATRewriteTable(tab, OIDNewHeap, lockmode);
 
-            /*
-             * Swap the physical files of the old and new heaps, then rebuild
-             * indexes and discard the old heap.  We can use RecentXmin for
-             * the table's new relfrozenxid because we rewrote all the tuples
-             * in ATRewriteTable, so no older Xid remains in the table.  Also,
-             * we never try to swap toast tables by content, since we have no
-             * interest in letting this code work on system catalogs.
-             */
-            finish_heap_swap(tab->relid, OIDNewHeap,
-                             false, false, true,
-                             !OidIsValid(tab->newTableSpace),
-                             RecentXmin,
-                             ReadNextMultiXactId(),
-                             persistence);
+                /*
+                 * Swap the physical files of the old and new heaps, then
+                 * rebuild indexes and discard the old heap.  We can use
+                 * RecentXmin for the table's new relfrozenxid because we
+                 * rewrote all the tuples in ATRewriteTable, so no older Xid
+                 * remains in the table.  Also, we never try to swap toast
+                 * tables by content, since we have no interest in letting this
+                 * code work on system catalogs.
+                 */
+                finish_heap_swap(tab->relid, OIDNewHeap,
+                                 false, false, true,
+                                 !OidIsValid(tab->newTableSpace),
+                                 RecentXmin,
+                                 ReadNextMultiXactId(),
+                                 persistence);
+            }
         }
         else
         {
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index ad0d1a9abc..c71e1a5f92 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -3033,6 +3033,80 @@ DropRelFileNodeBuffers(RelFileNodeBackend rnode, ForkNumber *forkNum,
     }
 }
 
+/* ---------------------------------------------------------------------
+ *        SetRelFileNodeBuffersPersistence
+ *
+ *        This function changes the persistence of all buffer pages of a relation
+ *        then writes all dirty pages of the relation out to disk when switching
+ *        to PERMANENT. (or more accurately, out to kernel disk buffers),
+ *        ensuring that the kernel has an up-to-date view of the relation.
+ *
+ *        Generally, the caller should be holding AccessExclusiveLock on the
+ *        target relation to ensure that no other backend is busy dirtying
+ *        more blocks of the relation; the effects can't be expected to last
+ *        after the lock is released.
+ *
+ *        XXX currently it sequentially searches the buffer pool, should be
+ *        changed to more clever ways of searching.  This routine is not
+ *        used in any performance-critical code paths, so it's not worth
+ *        adding additional overhead to normal paths to make it go faster;
+ *        but see also DropRelFileNodeBuffers.
+ * --------------------------------------------------------------------
+ */
+void
+SetRelationBuffersPersistence(Relation rel, bool permanent)
+{
+    int            i;
+    RelFileNodeBackend rnode = rel->rd_smgr->smgr_rnode;
+    
+    Assert (!RelFileNodeBackendIsTemp(rnode));
+
+    for (i = 0; i < NBuffers; i++)
+    {
+        BufferDesc *bufHdr = GetBufferDescriptor(i);
+        uint32        buf_state;
+
+        if (!RelFileNodeEquals(bufHdr->tag.rnode, rnode.node))
+            continue;
+
+        ReservePrivateRefCountEntry();
+
+        buf_state = LockBufHdr(bufHdr);
+
+        if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node))
+        {
+            ereport(LOG, (errmsg ("#%d: %d", i, (buf_state &  BM_PERMANENT) == 0), errhidestmt(true)));
+            if (permanent)
+            {
+                Assert ((buf_state &  BM_PERMANENT) == 0);
+                buf_state |= BM_PERMANENT;
+                pg_atomic_write_u32(&bufHdr->state, buf_state);
+
+                /* we flush this buffer when swithing to PERMANENT */
+                if ((buf_state & (BM_VALID | BM_DIRTY)) ==
+                    (BM_VALID | BM_DIRTY))
+                {
+                    PinBuffer_Locked(bufHdr);
+                    LWLockAcquire(BufferDescriptorGetContentLock(bufHdr),
+                                  LW_SHARED);
+                    FlushBuffer(bufHdr, rel->rd_smgr);
+                    LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
+                    UnpinBuffer(bufHdr, true);
+                }
+                else
+                    UnlockBufHdr(bufHdr, buf_state);
+            }
+            else
+            {
+                Assert ((buf_state &  BM_PERMANENT) != 0);
+                buf_state &= ~BM_PERMANENT;
+                UnlockBufHdr(bufHdr, buf_state);
+            }
+            ereport(LOG, (errmsg ("#%d: -> %d", i, (buf_state &  BM_PERMANENT) == 0), errhidestmt(true)));
+        }
+    }
+}
+
 /* ---------------------------------------------------------------------
  *        DropRelFileNodesAllBuffers
  *
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index dcc09df0c7..5eb9e97b3d 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -645,6 +645,12 @@ smgrimmedsync(SMgrRelation reln, ForkNumber forknum)
     smgrsw[reln->smgr_which].smgr_immedsync(reln, forknum);
 }
 
+void
+smgrunlink(SMgrRelation reln, ForkNumber forknum, bool isRedo)
+{
+    smgrsw[reln->smgr_which].smgr_unlink(reln->smgr_rnode, forknum, isRedo);
+}
+
 /*
  * AtEOXact_SMgr
  *
diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h
index 387eb34a61..1d19278a18 100644
--- a/src/include/access/tableam.h
+++ b/src/include/access/tableam.h
@@ -451,6 +451,15 @@ typedef struct TableAmRoutine
                                               TransactionId *freezeXid,
                                               MultiXactId *minmulti);
 
+    /*
+     * This callback needs to switch persistence of the relation between
+     * RELPERSISTENCE_PERMANENT and RELPERSISTENCE_UNLOGGED. Actual change on
+     * storage is performed elsewhere.
+     *
+     * See also table_relation_set_persistence().
+     */
+    void        (*relation_set_persistence) (Relation rel, char persistence);
+
     /*
      * This callback needs to remove all contents from `rel`'s current
      * relfilenode. No provisions for transactional behaviour need to be made.
@@ -1404,6 +1413,18 @@ table_relation_set_new_filenode(Relation rel,
                                                freezeXid, minmulti);
 }
 
+/*
+ * Switch storage persistence between RELPERSISTENCE_PERMANENT and
+ * RELPERSISTENCE_UNLOGGED.
+ *
+ * This is used during in-place persistence switching 
+ */
+static inline void
+table_relation_set_persistence(Relation rel, char persistence)
+{
+    rel->rd_tableam->relation_set_persistence(rel, persistence);
+}
+
 /*
  * Remove all table contents from `rel`, in a non-transactional manner.
  * Non-transactional meaning that there's no need to support rollbacks. This
diff --git a/src/include/catalog/storage.h b/src/include/catalog/storage.h
index 30c38e0ca6..43d2eb0fb4 100644
--- a/src/include/catalog/storage.h
+++ b/src/include/catalog/storage.h
@@ -23,6 +23,8 @@
 extern int    wal_skip_threshold;
 
 extern SMgrRelation RelationCreateStorage(RelFileNode rnode, char relpersistence);
+extern void RelationCreateInitFork(RelFileNode rel, bool isRedo);
+extern void RelationDropInitFork(RelFileNode rel);
 extern void RelationDropStorage(Relation rel);
 extern void RelationPreserveStorage(RelFileNode rnode, bool atCommit);
 extern void RelationPreTruncate(Relation rel);
diff --git a/src/include/catalog/storage_xlog.h b/src/include/catalog/storage_xlog.h
index 7b21cab2e0..73ad2ae89e 100644
--- a/src/include/catalog/storage_xlog.h
+++ b/src/include/catalog/storage_xlog.h
@@ -29,6 +29,7 @@
 /* XLOG gives us high 4 bits */
 #define XLOG_SMGR_CREATE    0x10
 #define XLOG_SMGR_TRUNCATE    0x20
+#define XLOG_SMGR_UNLINK    0x30
 
 typedef struct xl_smgr_create
 {
@@ -36,6 +37,12 @@ typedef struct xl_smgr_create
     ForkNumber    forkNum;
 } xl_smgr_create;
 
+typedef struct xl_smgr_unlink
+{
+    RelFileNode rnode;
+    ForkNumber    forkNum;
+} xl_smgr_unlink;
+
 /* flags for xl_smgr_truncate */
 #define SMGR_TRUNCATE_HEAP        0x0001
 #define SMGR_TRUNCATE_VM        0x0002
@@ -51,6 +58,7 @@ typedef struct xl_smgr_truncate
 } xl_smgr_truncate;
 
 extern void log_smgrcreate(const RelFileNode *rnode, ForkNumber forkNum);
+extern void log_smgrunlink(const RelFileNode *rnode, ForkNumber forkNum);
 
 extern void smgr_redo(XLogReaderState *record);
 extern void smgr_desc(StringInfo buf, XLogReaderState *record);
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index ee91b8fa26..f65a273999 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -205,6 +205,7 @@ extern void FlushRelationsAllBuffers(struct SMgrRelationData **smgrs, int nrels)
 extern void FlushDatabaseBuffers(Oid dbid);
 extern void DropRelFileNodeBuffers(RelFileNodeBackend rnode, ForkNumber *forkNum,
                                    int nforks, BlockNumber *firstDelBlock);
+extern void SetRelationBuffersPersistence(Relation rnode, bool permanent);
 extern void DropRelFileNodesAllBuffers(RelFileNodeBackend *rnodes, int nnodes);
 extern void DropDatabaseBuffers(Oid dbid);
 
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index f28a842401..5d74631006 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -86,6 +86,7 @@ extern void smgrclose(SMgrRelation reln);
 extern void smgrcloseall(void);
 extern void smgrclosenode(RelFileNodeBackend rnode);
 extern void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo);
+extern void smgrunlink(SMgrRelation reln, ForkNumber forknum, bool isRedo);
 extern void smgrdosyncall(SMgrRelation *rels, int nrels);
 extern void smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo);
 extern void smgrextend(SMgrRelation reln, ForkNumber forknum,

pgsql-hackers by date:

Previous
From: Bharath Rupireddy
Date:
Subject: Re: Skip ExecCheckRTPerms in CTAS with no data
Next
From: Kyotaro Horiguchi
Date:
Subject: Re: Disable WAL logging to speed up data loading