Re: Feedback on getting rid of VACUUM FULL - Mailing list pgsql-hackers

Hannu Krosing wrote:
> On Wed, 2009-09-16 at 21:23 +0300, Heikki Linnakangas wrote:
>> 2) Another utility that does something like UPDATE ... WHERE ctid > ? to
>> move tuples to lower pages. It will be different from current VACUUM
>> FULL in some ways. It won't require a table lock, for example, but it
>> won't be able to move update chains as nicely. But it would be trivial
>> to write one, so I think we should offer that as a contrib module.
>
> I have not checked, but I suspect pg_reorg may already be doing
> something similar http://pgfoundry.org/forum/forum.php?forum_id=1561

Hmm, AFAICT pg_reorg is much more complex, writing stuff to a temp table
and swapping relfilenodes afterwards. More like the VACUUM REWRITE
that's been discussed.

For the kicks, I looked at what it would take to write a utility like
that. It turns out to be quite trivial, patch attached. It uses the same
principle as VACUUM FULL, scans from the end, moving tuples to
lower-numbered pages until it can't do it anymore. It requires a small
change to heap_update(), to override the preference to store the new
tuple on the same page as the old one, but other than that, it's all in
the external module.

To test:

-- Create and populate test table
CREATE TABLE foo (id int4 PRIMARY KEY);
INSERT INTO foo SELECT a FROM generate_series(1,100000) a;

-- Delete a lot of tuples from the beginning. This creates the hole that
we want to compact out.
DELETE FROM foo WHERE id < 90000;

-- Vacuum to remove the dead tuples
VACUUM VERBOSE foo;

-- Run the utility to "move" the tuples
SELECT vacuumfull('foo');

-- Vacuum table again to remove the old tuple versions of the moved rows
and truncate the file.

VACUUM VERBOSE foo;

--
  Heikki Linnakangas
  EnterpriseDB   http://www.enterprisedb.com
diff --git a/contrib/Makefile b/contrib/Makefile
index 0afa149..59c9279 100644
--- a/contrib/Makefile
+++ b/contrib/Makefile
@@ -40,6 +40,7 @@ SUBDIRS = \
         test_parser    \
         tsearch2    \
         unaccent    \
+        vacuumfull    \
         vacuumlo

 ifeq ($(with_openssl),yes)
diff --git a/contrib/vacuumfull/Makefile b/contrib/vacuumfull/Makefile
new file mode 100644
index 0000000..925d2c4
--- /dev/null
+++ b/contrib/vacuumfull/Makefile
@@ -0,0 +1,24 @@
+#-------------------------------------------------------------------------
+#
+# vacuumfull Makefile
+#
+# $PostgreSQL$
+#
+#-------------------------------------------------------------------------
+
+MODULE_big    = vacuumfull
+OBJS        = vacuumfull.o
+DATA_built    = vacuumfull.sql
+DATA          = uninstall_vacuumfull.sql
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = contrib/vacuumfull
+top_builddir = ../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
+
diff --git a/contrib/vacuumfull/uninstall_vacuumfull.sql b/contrib/vacuumfull/uninstall_vacuumfull.sql
new file mode 100644
index 0000000..9ecab84
--- /dev/null
+++ b/contrib/vacuumfull/uninstall_vacuumfull.sql
@@ -0,0 +1,6 @@
+/* $PostgreSQL$ */
+
+-- Adjust this setting to control where the objects get dropped.
+SET search_path = public;
+
+DROP FUNCTION vacuumfull(regclass);
diff --git a/contrib/vacuumfull/vacuumfull.c b/contrib/vacuumfull/vacuumfull.c
new file mode 100644
index 0000000..07139ba
--- /dev/null
+++ b/contrib/vacuumfull/vacuumfull.c
@@ -0,0 +1,286 @@
+/*-------------------------------------------------------------------------
+ *
+ * vacuumfull.c
+ *      An utility to replace old VACUUM FULL
+ *
+ * XXX
+ *
+ * Copyright (c) 2007-2009, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *      $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/heapam.h"
+#include "access/xact.h"
+#include "executor/executor.h"
+#include "miscadmin.h"
+#include "storage/bufmgr.h"
+#include "storage/procarray.h"
+#include "utils/acl.h"
+#include "utils/tqual.h"
+#include "utils/inval.h"
+#include "utils/memutils.h"
+
+PG_MODULE_MAGIC;
+
+Datum        vacuumfull(PG_FUNCTION_ARGS);
+
+
+/*----------------------------------------------------------------------
+ * ExecContext:
+ *
+ * As these variables always appear together, we put them into one struct
+ * and pull initialization and cleanup into separate routines.
+ * ExecContext is used by repair_frag() and move_xxx_tuple().  More
+ * accurately:    It is *used* only in move_xxx_tuple(), but because this
+ * routine is called many times, we initialize the struct just once in
+ * repair_frag() and pass it on to move_xxx_tuple().
+ */
+typedef struct ExecContextData
+{
+    ResultRelInfo *resultRelInfo;
+    EState       *estate;
+    TupleTableSlot *slot;
+} ExecContextData;
+
+typedef ExecContextData *ExecContext;
+
+static void
+ExecContext_Init(ExecContext ec, Relation rel)
+{
+    TupleDesc    tupdesc = RelationGetDescr(rel);
+
+    /*
+     * We need a ResultRelInfo and an EState so we can use the regular
+     * executor's index-entry-making machinery.
+     */
+    ec->estate = CreateExecutorState();
+
+    ec->resultRelInfo = makeNode(ResultRelInfo);
+    ec->resultRelInfo->ri_RangeTableIndex = 1;    /* dummy */
+    ec->resultRelInfo->ri_RelationDesc = rel;
+    ec->resultRelInfo->ri_TrigDesc = NULL;        /* we don't fire triggers */
+
+    ExecOpenIndices(ec->resultRelInfo);
+
+    ec->estate->es_result_relations = ec->resultRelInfo;
+    ec->estate->es_num_result_relations = 1;
+    ec->estate->es_result_relation_info = ec->resultRelInfo;
+
+    /* Set up a tuple slot too */
+    ec->slot = MakeSingleTupleTableSlot(tupdesc);
+}
+
+static void
+ExecContext_Finish(ExecContext ec)
+{
+    ExecDropSingleTupleTableSlot(ec->slot);
+    ExecCloseIndices(ec->resultRelInfo);
+    FreeExecutorState(ec->estate);
+}
+
+/*
+ * End of ExecContext Implementation
+ *----------------------------------------------------------------------
+ */
+
+
+/*
+ * vacuumfull
+ */
+PG_FUNCTION_INFO_V1(vacuumfull);
+
+Datum
+vacuumfull(PG_FUNCTION_ARGS)
+{
+    Oid relid = PG_GETARG_OID(0);
+    Relation rel;
+    BlockNumber blk;
+    BlockNumber numblks;
+    BlockNumber minblk;
+    Buffer buf = InvalidBuffer;
+    TransactionId oldestxmin;
+    bool found_unmovable = false;
+    ExecContextData ec;
+    int tuplesmoved = 0;
+    BlockNumber pagescleared = 0;
+
+    rel = heap_open(relid, AccessShareLock);
+
+    /*
+     * Check permissions.
+     *
+     * We allow the user to vacuum a table if he is superuser, the table
+     * owner, or the database owner (but in the latter case, only if it's not
+     * a shared relation).    pg_class_ownercheck includes the superuser case.
+     */
+    if (!(pg_class_ownercheck(RelationGetRelid(rel), GetUserId()) ||
+          (pg_database_ownercheck(MyDatabaseId, GetUserId()) && !rel->rd_rel->relisshared)))
+    {
+        aclcheck_error(ACLCHECK_NOT_OWNER, ACL_KIND_CLASS,
+                       RelationGetRelationName(rel));
+
+    }
+
+    /* Check that this relation has storage */
+    if (rel->rd_rel->relkind == RELKIND_VIEW)
+        ereport(ERROR,
+                (errcode(ERRCODE_WRONG_OBJECT_TYPE),
+                 errmsg("\"%s\" is a view",
+                        RelationGetRelationName(rel))));
+
+    /*
+     * Reject attempts to read non-local temporary relations; we would be
+     * likely to get wrong data since we have no visibility into the owning
+     * session's local buffers.
+     */
+    if (RELATION_IS_OTHER_TEMP(rel))
+        ereport(ERROR,
+                (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                 errmsg("cannot access temporary tables of other sessions")));
+
+    oldestxmin = GetOldestXmin(false, true);
+
+    ExecContext_Init(&ec, rel);
+
+    /*
+     * Read the relation backwards from the end, updating tuples. Hopefully
+     * the new tuple versions will be stored on lower-numbered pages.
+     * Stop when we reach the last page where we stored a new tuple version
+     * (minblk).
+     */
+    numblks = RelationGetNumberOfBlocks(rel);
+    minblk = 0;
+    for (blk = numblks ? numblks - 1 : 0;
+         blk > minblk && !found_unmovable;
+         blk--)
+    {
+        Page page;
+        OffsetNumber off;
+        OffsetNumber maxoff;
+
+        buf = ReleaseAndReadBuffer(buf, rel, blk);
+
+        page = BufferGetPage(buf);
+        maxoff = PageGetMaxOffsetNumber(page);
+
+        for (off = FirstOffsetNumber; off <= maxoff && !found_unmovable; off++)
+        {
+            ItemId itemid;
+            HeapTupleData oldtuple;
+            bool move = false;
+
+            LockBuffer(buf, BUFFER_LOCK_SHARE);
+
+            itemid = PageGetItemId(page, off);
+            if (ItemIdIsNormal(itemid))
+            {
+                ItemPointerSet(&oldtuple.t_self, blk, off);
+                oldtuple.t_len = ItemIdGetLength(itemid);
+                oldtuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
+
+                switch(HeapTupleSatisfiesVacuum(oldtuple.t_data, oldestxmin, buf) != HEAPTUPLE_DEAD)
+                {
+                    case HEAPTUPLE_LIVE:
+                        /* let's move it to a lower-numbered page */
+                        move = true;
+                        break;
+                    case HEAPTUPLE_DEAD:
+                        /* dead tuples will go away with a regular vacuum */
+                        break;
+                    case HEAPTUPLE_RECENTLY_DEAD:
+                        /* can't remove this yet, but it will eventually be removable
+                         * by regular VACUUM
+                         */
+                        break;
+                    case HEAPTUPLE_INSERT_IN_PROGRESS:
+                        /* we could try to update this once the inserter commits,
+                         * but we would have to wait for i. */
+                        found_unmovable = true;
+                        break;
+                    case HEAPTUPLE_DELETE_IN_PROGRESS:
+                        /* assuming the deleter commits, this will become removable
+                         * by regular VACUUM.
+                         */
+                        break;
+                }
+            }
+
+            LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+
+            if (move)
+            {
+                HeapTuple newtuple = heap_copytuple(&oldtuple);
+                ItemPointerData errctid;
+                TransactionId update_xmax;
+                HTSU_Result result;
+
+                result = heap_update(rel, &oldtuple.t_self, newtuple, &errctid, &update_xmax,
+                                     GetCurrentCommandId(true), NULL, true, true);
+
+                if (result == HeapTupleMayBeUpdated)
+                {
+                    BlockNumber newblk;
+
+                    tuplesmoved++;
+
+                    CacheInvalidateHeapTuple(rel, &oldtuple);
+
+                    /* Create index entries for the moved tuple */
+                    if (ec.resultRelInfo->ri_NumIndices > 0)
+                    {
+                        ExecStoreTuple(newtuple, ec.slot, InvalidBuffer, false);
+                        ExecInsertIndexTuples(ec.slot, &(newtuple->t_self), ec.estate, true);
+                        ResetPerTupleExprContext(ec.estate);
+                    }
+
+                    newblk = ItemPointerGetBlockNumber(&newtuple->t_self);
+
+                    elog(DEBUG4, "moved tuple %u/%u to %u/%u", blk, off, newblk,
+                         ItemPointerGetOffsetNumber(&newtuple->t_self));
+
+                    if (newblk >= blk)
+                    {
+                        /*
+                         * Oops, the update moved the tuple higher in the
+                         * relation, not lower as we wanted. Let's stop now
+                         * before we do any more of that.
+                         */
+                        elog(WARNING, "moved tuple %u/%u to a higher page, stopping",
+                             blk, off);
+                        found_unmovable = true;
+                    }
+
+                    if(newblk > minblk)
+                        minblk = newblk;
+                }
+                else
+                {
+                    elog(DEBUG1, "failed to move tuple %u/%u to a lower page, stopping",
+                         blk, off);
+                    found_unmovable = true;
+                }
+            }
+        }
+        if (!found_unmovable)
+            pagescleared++;
+    }
+
+    elog(NOTICE, "%d tuples moved, there is now %d pages at the end of relation with only dead tuples. Please wait for
anyin-progress transactions to finsih and run VACUUM to reclaim the space", 
+         tuplesmoved, pagescleared);
+
+    if (BufferIsValid(buf))
+        ReleaseBuffer(buf);
+
+    ExecContext_Finish(&ec);
+
+    heap_close(rel, AccessShareLock);
+
+    PG_RETURN_VOID();
+}
+
diff --git a/contrib/vacuumfull/vacuumfull.sql.in b/contrib/vacuumfull/vacuumfull.sql.in
new file mode 100644
index 0000000..6017cc9
--- /dev/null
+++ b/contrib/vacuumfull/vacuumfull.sql.in
@@ -0,0 +1,12 @@
+/* $PostgreSQL$ */
+
+-- Adjust this setting to control where the objects get created.
+SET search_path = public;
+
+--
+-- vacuumfull()
+--
+CREATE OR REPLACE FUNCTION vacuumfull(regclass)
+RETURNS void
+AS '$libdir/vacuumfull', 'vacuumfull'
+LANGUAGE C STRICT;
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index b0a911e..56c42cf 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -2349,12 +2349,17 @@ simple_heap_delete(Relation relation, ItemPointer tid)
  *        cmax/cmin if successful)
  *    crosscheck - if not InvalidSnapshot, also check old tuple against this
  *    wait - true if should wait for any conflicting update to commit/abort
+ *  forcefsm - use FSM even if there's space on the same page.
  *
  * Normal, successful return value is HeapTupleMayBeUpdated, which
  * actually means we *did* update it.  Failure return codes are
  * HeapTupleSelfUpdated, HeapTupleUpdated, or HeapTupleBeingUpdated
  * (the last only possible if wait == false).
  *
+ * If 'forcefsm' is TRUE, and the new tuple couldn't be stored on a page
+ * with lower block number than the old page, the tuple is not updated and
+ * HeapTupleNoSpace is returned.
+ *
  * On success, the header fields of *newtup are updated to match the new
  * stored tuple; in particular, newtup->t_self is set to the TID where the
  * new tuple was inserted, and its HEAP_ONLY_TUPLE flag is set iff a HOT
@@ -2369,7 +2374,7 @@ simple_heap_delete(Relation relation, ItemPointer tid)
 HTSU_Result
 heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
             ItemPointer ctid, TransactionId *update_xmax,
-            CommandId cid, Snapshot crosscheck, bool wait)
+            CommandId cid, Snapshot crosscheck, bool wait, bool forcefsm)
 {
     HTSU_Result result;
     TransactionId xid = GetCurrentTransactionId();
@@ -2599,7 +2604,7 @@ l2:

     newtupsize = MAXALIGN(newtup->t_len);

-    if (need_toast || newtupsize > pagefree)
+    if (need_toast || newtupsize > pagefree || forcefsm)
     {
         /* Clear obsolete visibility flags ... */
         oldtup.t_data->t_infomask &= ~(HEAP_XMAX_COMMITTED |
@@ -2649,11 +2654,34 @@ l2:
          * while not holding the lock on the old page, and we must rely on it
          * to get the locks on both pages in the correct order.
          */
-        if (newtupsize > pagefree)
+        if (newtupsize > pagefree || forcefsm)
         {
             /* Assume there's no chance to put heaptup on same page. */
             newbuf = RelationGetBufferForTuple(relation, heaptup->t_len,
                                                buffer, 0, NULL);
+            if (BufferGetBlockNumber(newbuf) >= ItemPointerGetBlockNumber(otid))
+            {
+                /*
+                 * We couldn't put the new tuple version on any page before
+                 * the old version. The purpose of 'forcefsm' is to update
+                 * tuples so that they are stored on lower-numbered pages,
+                 * to allow the heap to be truncated later, so there's no
+                 * point in continuing if the new page is higher than the old
+                 * one.
+                 */
+
+                if (newbuf != buffer)
+                    UnlockReleaseBuffer(newbuf);
+
+                /* undo the xmax change. */
+                HeapTupleHeaderSetXmax(oldtup.t_data, InvalidTransactionId);
+
+                UnlockReleaseBuffer(buffer);
+                if (have_tuple_lock)
+                    UnlockTuple(relation, &(oldtup.t_self), ExclusiveLock);
+                bms_free(hot_attrs);
+                return HeapTupleNoSpace;
+            }
         }
         else
         {
@@ -2977,7 +3005,7 @@ simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup)
     result = heap_update(relation, otid, tup,
                          &update_ctid, &update_xmax,
                          GetCurrentCommandId(true), InvalidSnapshot,
-                         true /* wait for commit */ );
+                         true /* wait for commit */, false);
     switch (result)
     {
         case HeapTupleSelfUpdated:
diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c
index 55df5d0..7118dbb 100644
--- a/src/backend/executor/execMain.c
+++ b/src/backend/executor/execMain.c
@@ -2076,7 +2076,7 @@ lreplace:;
                          &update_ctid, &update_xmax,
                          estate->es_output_cid,
                          estate->es_crosscheck_snapshot,
-                         true /* wait for commit */ );
+                         true /* wait for commit */, false);
     switch (result)
     {
         case HeapTupleSelfUpdated:
diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h
index f8395fe..f88a029 100644
--- a/src/include/access/heapam.h
+++ b/src/include/access/heapam.h
@@ -102,7 +102,7 @@ extern HTSU_Result heap_delete(Relation relation, ItemPointer tid,
 extern HTSU_Result heap_update(Relation relation, ItemPointer otid,
             HeapTuple newtup,
             ItemPointer ctid, TransactionId *update_xmax,
-            CommandId cid, Snapshot crosscheck, bool wait);
+            CommandId cid, Snapshot crosscheck, bool wait, bool forcefsm);
 extern HTSU_Result heap_lock_tuple(Relation relation, HeapTuple tuple,
                 Buffer *buffer, ItemPointer ctid,
                 TransactionId *update_xmax, CommandId cid,
diff --git a/src/include/utils/snapshot.h b/src/include/utils/snapshot.h
index df79e3a..1aab53b 100644
--- a/src/include/utils/snapshot.h
+++ b/src/include/utils/snapshot.h
@@ -72,7 +72,8 @@ typedef enum
     HeapTupleInvisible,
     HeapTupleSelfUpdated,
     HeapTupleUpdated,
-    HeapTupleBeingUpdated
+    HeapTupleBeingUpdated,
+    HeapTupleNoSpace /* only used by heap_update(), with 'forcefsm'  */
 } HTSU_Result;

 #endif   /* SNAPSHOT_H */

pgsql-hackers by date:

Previous
From: Steve Prentice
Date:
Subject: Re: PATCH: make plpgsql IN args mutable (v1) [REVIEW]
Next
From: Hannu Krosing
Date:
Subject: Re: Feedback on getting rid of VACUUM FULL