From 159cf437cab348a358332262b5f3944b0330b6e6 Mon Sep 17 00:00:00 2001 From: Dilip Kumar Date: Thu, 7 Jul 2022 16:34:45 +0530 Subject: [PATCH v7 4/4] Don't delay removing Tombstone file until next checkpoint Prior to making relfilenode to 56bit wider, we can not remove the unused relfilenode until the next checkpoint because if we remove them immediately then there is a risk of reusing the same relfilenode for two different relations during single checkpoint due to Oid wraparound. Now as part of the previous patch set we have made relfilenode 56 bit wider and removed the risk of wraparound so now we don't need to wait till the next checkpoint for removing the unused relation file and we can clean them up on commit. --- src/backend/access/transam/xlog.c | 5 -- src/backend/commands/tablecmds.c | 6 +- src/backend/storage/smgr/md.c | 154 +++++++++++--------------------------- src/backend/storage/sync/sync.c | 101 ------------------------- src/include/storage/sync.h | 2 - 5 files changed, 46 insertions(+), 222 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 28c3d31..73531c1 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -6641,11 +6641,6 @@ CreateCheckPoint(int flags) END_CRIT_SECTION(); /* - * Let smgr do post-checkpoint cleanup (eg, deleting old files). - */ - SyncPostCheckpoint(); - - /* * Update the average distance between checkpoints if the prior checkpoint * exists. */ diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 0678681..f5d653c 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -14372,9 +14372,9 @@ ATExecSetTableSpace(Oid tableOid, Oid newTableSpace, LOCKMODE lockmode) /* * Generate a new relfilenumber. We can not reuse the old relfilenumber - * because the unused relfilenumber files are not unlinked until the next - * checkpoint. 
So if move the relation to the old tablespace again, we - will get the conflicting relfilenumber file. + because the unused relfilenumber files are not unlinked until commit. + So if we move the relation back to the old tablespace within the same + transaction, we will get a conflicting relfilenumber file. */ newrelfilenumber = GetNewRelFileNumber(); AssertRelfileNumberFileNotExists(newTableSpace, diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index d9e72cf..7d15920 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -24,6 +24,7 @@ #include #include #include +#include #include "access/xlog.h" #include "access/xlogutils.h" @@ -126,8 +127,6 @@ static void mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forkNum, static MdfdVec *mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior); static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg); -static void register_unlink_segment(RelFileLocatorBackend rlocator, ForkNumber forknum, - BlockNumber segno); static void register_forget_request(RelFileLocatorBackend rlocator, ForkNumber forknum, BlockNumber segno); static void _fdvec_resize(SMgrRelation reln, @@ -240,41 +239,14 @@ mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo) * forkNum can be a fork number to delete a specific fork, or InvalidForkNumber * to delete all forks. * - * For regular relations, we don't unlink the first segment file of the rel, - * but just truncate it to zero length, and record a request to unlink it after - * the next checkpoint. Additional segments can be unlinked immediately, - * however. Leaving the empty file in place prevents that relfilenumber - * from being reused. The scenario this protects us from is: - * 1. We delete a relation (and commit, and actually remove its file). - * 2. 
We create a new relation, which by chance gets the same relfilenumber as - * the just-deleted one (OIDs must've wrapped around for that to happen). - * 3. We crash before another checkpoint occurs. - * During replay, we would delete the file and then recreate it, which is fine - * if the contents of the file were repopulated by subsequent WAL entries. - * But if we didn't WAL-log insertions, but instead relied on fsyncing the - * file after populating it (as we do at wal_level=minimal), the contents of - * the file would be lost forever. By leaving the empty file until after the - * next checkpoint, we prevent reassignment of the relfilenumber until it's - * safe, because relfilenumber assignment skips over any existing file. - * - * XXX although this all was true when the relfilenumbers were 32 bits wide but - * now the relfilenumbers are 56 bits wide so we don't have risk of - * relfilenumber being reused so in future we can immediately unlink the first - * segment as well. Although we can reuse the relfilenumber during createdb() - * using file copy method or during movedb() but the above scenario is only - * applicable when we create a new relation. - * - * We do not need to go through this dance for temp relations, though, because - * we never make WAL entries for temp rels, and so a temp rel poses no threat - * to the health of a regular rel that has taken over its relfilenumber. - * The fact that temp rels and regular rels have different file naming - * patterns provides additional safety. + * We do not carefully track whether other forks have been created or not, but + * just attempt to unlink them unconditionally; so we should never complain + * about ENOENT. * - * All the above applies only to the relation's main fork; other forks can - * just be removed immediately, since they are not needed to prevent the - * relfilenumber from being recycled. 
Also, we do not carefully - * track whether other forks have been created or not, but just attempt to - * unlink them unconditionally; so we should never complain about ENOENT. + * Note that now we can immediately unlink the first segment of the regular + * relation as well because the relfilenumber is 56 bits wide since PG 16. So + * we don't have to worry about relfilenumber getting reused for some unrelated + * relation file. * * If isRedo is true, it's unsurprising for the relation to be already gone. * Also, we should remove the file immediately instead of queuing a request @@ -325,90 +297,67 @@ static void mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forkNum, bool isRedo) { char *path; - int ret; + char *segpath; + int segno; + int lastsegment = -1; + struct stat statbuf; path = relpath(rlocator, forkNum); + segpath = (char *) palloc(strlen(path) + 12); - /* - * Delete or truncate the first segment. - */ - if (isRedo || forkNum != MAIN_FORKNUM || RelFileLocatorBackendIsTemp(rlocator)) + /* compute number of segments. */ + for (segno = 0;; segno++) { - if (!RelFileLocatorBackendIsTemp(rlocator)) - { - /* Prevent other backends' fds from holding on to the disk space */ - ret = do_truncate(path); - - /* Forget any pending sync requests for the first segment */ - register_forget_request(rlocator, forkNum, 0 /* first seg */ ); - } + if (segno == 0) + sprintf(segpath, "%s", path); else - ret = 0; + sprintf(segpath, "%s.%u", path, segno); - /* Next unlink the file, unless it was already found to be missing */ - if (ret == 0 || errno != ENOENT) + if (stat(segpath, &statbuf) != 0) { - ret = unlink(path); - if (ret < 0 && errno != ENOENT) - ereport(WARNING, - (errcode_for_file_access(), - errmsg("could not remove file \"%s\": %m", path))); + /* ENOENT is expected after the last segment... 
*/ + if (errno != ENOENT) elog(WARNING, "could not stat file \"%s\": %m", segpath); + break; } - } - else - { - /* Prevent other backends' fds from holding on to the disk space */ - ret = do_truncate(path); - - /* Register request to unlink first segment later */ - register_unlink_segment(rlocator, forkNum, 0 /* first seg */ ); + lastsegment = segno; } /* - * Delete any additional segments. + * Unlink segment files in descending order so that if there is any failure + * while deleting any of the segment files, we do not create any gaps in + * the segment file sequence. */ - if (ret >= 0) + for (segno = lastsegment; segno >= 0; segno--) { - char *segpath = (char *) palloc(strlen(path) + 12); - BlockNumber segno; - - /* - * Note that because we loop until getting ENOENT, we will correctly - * remove all inactive segments as well as active ones. - */ - for (segno = 1;; segno++) - { + if (segno == 0) + sprintf(segpath, "%s", path); + else sprintf(segpath, "%s.%u", path, segno); if (!RelFileLocatorBackendIsTemp(rlocator)) { /* - * Prevent other backends' fds from holding on to the disk + * Prevent other backends' fds from holding on to the disk * space. */ - if (do_truncate(segpath) < 0 && errno == ENOENT) - break; + do_truncate(segpath); - /* - * Forget any pending sync requests for this segment before we - * try to unlink. - */ + /* Forget any pending sync requests for this segment. */ register_forget_request(rlocator, forkNum, segno); } - if (unlink(segpath) < 0) - { - /* ENOENT is expected after the last segment... */ - if (errno != ENOENT) - ereport(WARNING, - (errcode_for_file_access(), - errmsg("could not remove file \"%s\": %m", segpath))); - break; - } - } - pfree(segpath); + /* + * Unlink the file; we have already checked for file existence in + * the above loop while computing the segments so we do not need to + * check for ENOENT. 
*/ + if (unlink(segpath) < 0) + ereport(WARNING, + (errcode_for_file_access(), + errmsg("could not remove file \"%s\": %m", segpath))); } + pfree(segpath); pfree(path); } @@ -1009,23 +958,6 @@ register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg) } /* - * register_unlink_segment() -- Schedule a file to be deleted after next checkpoint - */ -static void -register_unlink_segment(RelFileLocatorBackend rlocator, ForkNumber forknum, - BlockNumber segno) -{ - FileTag tag; - - INIT_MD_FILETAG(tag, rlocator.locator, forknum, segno); - - /* Should never be used with temp relations */ - Assert(!RelFileLocatorBackendIsTemp(rlocator)); - - RegisterSyncRequest(&tag, SYNC_UNLINK_REQUEST, true /* retryOnError */ ); -} - -/* * register_forget_request() -- forget any fsyncs for a relation fork's segment */ static void diff --git a/src/backend/storage/sync/sync.c b/src/backend/storage/sync/sync.c index e1fb631..9a4a31c 100644 --- a/src/backend/storage/sync/sync.c +++ b/src/backend/storage/sync/sync.c @@ -201,92 +201,6 @@ SyncPreCheckpoint(void) } /* - * SyncPostCheckpoint() -- Do post-checkpoint work - * - * Remove any lingering files that can now be safely removed. - */ -void -SyncPostCheckpoint(void) -{ - int absorb_counter; - ListCell *lc; - - absorb_counter = UNLINKS_PER_ABSORB; - foreach(lc, pendingUnlinks) - { - PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(lc); - char path[MAXPGPATH]; - - /* Skip over any canceled entries */ - if (entry->canceled) - continue; - - /* - * New entries are appended to the end, so if the entry is new we've - * reached the end of old entries. - * - * Note: if just the right number of consecutive checkpoints fail, we - * could be fooled here by cycle_ctr wraparound. However, the only - * consequence is that we'd delay unlinking for one more checkpoint, - * which is perfectly tolerable. 
- */ - if (entry->cycle_ctr == checkpoint_cycle_ctr) - break; - - /* Unlink the file */ - if (syncsw[entry->tag.handler].sync_unlinkfiletag(&entry->tag, - path) < 0) - { - /* - * There's a race condition, when the database is dropped at the - * same time that we process the pending unlink requests. If the - * DROP DATABASE deletes the file before we do, we will get ENOENT - * here. rmtree() also has to ignore ENOENT errors, to deal with - * the possibility that we delete the file first. - */ - if (errno != ENOENT) - ereport(WARNING, - (errcode_for_file_access(), - errmsg("could not remove file \"%s\": %m", path))); - } - - /* Mark the list entry as canceled, just in case */ - entry->canceled = true; - - /* - * As in ProcessSyncRequests, we don't want to stop absorbing fsync - * requests for a long time when there are many deletions to be done. - * We can safely call AbsorbSyncRequests() at this point in the loop. - */ - if (--absorb_counter <= 0) - { - AbsorbSyncRequests(); - absorb_counter = UNLINKS_PER_ABSORB; - } - } - - /* - * If we reached the end of the list, we can just remove the whole list - * (remembering to pfree all the PendingUnlinkEntry objects). Otherwise, - * we must keep the entries at or after "lc". - */ - if (lc == NULL) - { - list_free_deep(pendingUnlinks); - pendingUnlinks = NIL; - } - else - { - int ntodelete = list_cell_number(pendingUnlinks, lc); - - for (int i = 0; i < ntodelete; i++) - pfree(list_nth(pendingUnlinks, i)); - - pendingUnlinks = list_delete_first_n(pendingUnlinks, ntodelete); - } -} - -/* * ProcessSyncRequests() -- Process queued fsync requests. 
*/ void @@ -532,21 +446,6 @@ RememberSyncRequest(const FileTag *ftag, SyncRequestType type) entry->canceled = true; } } - else if (type == SYNC_UNLINK_REQUEST) - { - /* Unlink request: put it in the linked list */ - MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt); - PendingUnlinkEntry *entry; - - entry = palloc(sizeof(PendingUnlinkEntry)); - entry->tag = *ftag; - entry->cycle_ctr = checkpoint_cycle_ctr; - entry->canceled = false; - - pendingUnlinks = lappend(pendingUnlinks, entry); - - MemoryContextSwitchTo(oldcxt); - } else { /* Normal case: enter a request to fsync this segment */ diff --git a/src/include/storage/sync.h b/src/include/storage/sync.h index 049af87..2c0b812 100644 --- a/src/include/storage/sync.h +++ b/src/include/storage/sync.h @@ -23,7 +23,6 @@ typedef enum SyncRequestType { SYNC_REQUEST, /* schedule a call of sync function */ - SYNC_UNLINK_REQUEST, /* schedule a call of unlink function */ SYNC_FORGET_REQUEST, /* forget all calls for a tag */ SYNC_FILTER_REQUEST /* forget all calls satisfying match fn */ } SyncRequestType; @@ -57,7 +56,6 @@ typedef struct FileTag extern void InitSync(void); extern void SyncPreCheckpoint(void); -extern void SyncPostCheckpoint(void); extern void ProcessSyncRequests(void); extern void RememberSyncRequest(const FileTag *ftag, SyncRequestType type); extern bool RegisterSyncRequest(const FileTag *ftag, SyncRequestType type, -- 1.8.3.1