Re: Feedback on getting rid of VACUUM FULL - Mailing list pgsql-hackers
From | Heikki Linnakangas |
---|---|
Subject | Re: Feedback on getting rid of VACUUM FULL |
Date | |
Msg-id | 4AB15065.1000607@enterprisedb.com Whole thread Raw |
In response to | Re: Feedback on getting rid of VACUUM FULL (Hannu Krosing <hannu@2ndQuadrant.com>) |
Responses |
Re: Feedback on getting rid of VACUUM FULL
Re: Feedback on getting rid of VACUUM FULL Re: Feedback on getting rid of VACUUM FULL Re: Feedback on getting rid of VACUUM FULL Re: Feedback on getting rid of VACUUM FULL |
List | pgsql-hackers |
Hannu Krosing wrote: > On Wed, 2009-09-16 at 21:23 +0300, Heikki Linnakangas wrote: >> 2) Another utility that does something like UPDATE ... WHERE ctid > ? to >> move tuples to lower pages. It will be different from current VACUUM >> FULL in some ways. It won't require a table lock, for example, but it >> won't be able to move update chains as nicely. But it would be trivial >> to write one, so I think we should offer that as a contrib module. > > I have not checked, but I suspect pg_reorg may already be doing > something similar http://pgfoundry.org/forum/forum.php?forum_id=1561 Hmm, AFAICT pg_reorg is much more complex, writing stuff to a temp table and swapping relfilenodes afterwards. More like the VACUUM REWRITE that's been discussed. For the kicks, I looked at what it would take to write a utility like that. It turns out to be quite trivial, patch attached. It uses the same principle as VACUUM FULL, scans from the end, moving tuples to lower-numbered pages until it can't do it anymore. It requires a small change to heap_update(), to override the preference to store the new tuple on the same page as the old one, but other than that, it's all in the external module. To test: -- Create and populate test table CREATE TABLE foo (id int4 PRIMARY KEY); INSERT INTO foo SELECT a FROM generate_series(1,100000) a; -- Delete a lot of tuples from the beginning. This creates the hole that we want to compact out. DELETE FROM foo WHERE id < 90000; -- Vacuum to remove the dead tuples VACUUM VERBOSE foo; -- Run the utility to "move" the tuples SELECT vacuumfull('foo'); -- Vacuum table again to remove the old tuple versions of the moved rows and truncate the file. VACUUM VERBOSE foo; -- Heikki Linnakangas EnterpriseDB http://www.enterprisedb.com diff --git a/contrib/Makefile b/contrib/Makefile index 0afa149..59c9279 100644 --- a/contrib/Makefile +++ b/contrib/Makefile @@ -40,6 +40,7 @@ SUBDIRS = \ test_parser \ tsearch2 \ unaccent \ + vacuumfull \ vacuumlo ifeq ($(with_openssl),yes) diff --git a/contrib/vacuumfull/Makefile b/contrib/vacuumfull/Makefile new file mode 100644 index 0000000..925d2c4 --- /dev/null +++ b/contrib/vacuumfull/Makefile @@ -0,0 +1,24 @@ +#------------------------------------------------------------------------- +# +# vacuumfull Makefile +# +# $PostgreSQL$ +# +#------------------------------------------------------------------------- + +MODULE_big = vacuumfull +OBJS = vacuumfull.o +DATA_built = vacuumfull.sql +DATA = uninstall_vacuumfull.sql + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = contrib/vacuumfull +top_builddir = ../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif + diff --git a/contrib/vacuumfull/uninstall_vacuumfull.sql b/contrib/vacuumfull/uninstall_vacuumfull.sql new file mode 100644 index 0000000..9ecab84 --- /dev/null +++ b/contrib/vacuumfull/uninstall_vacuumfull.sql @@ -0,0 +1,6 @@ +/* $PostgreSQL$ */ + +-- Adjust this setting to control where the objects get dropped. +SET search_path = public; + +DROP FUNCTION vacuumfull(regclass); diff --git a/contrib/vacuumfull/vacuumfull.c b/contrib/vacuumfull/vacuumfull.c new file mode 100644 index 0000000..07139ba --- /dev/null +++ b/contrib/vacuumfull/vacuumfull.c @@ -0,0 +1,286 @@ +/*------------------------------------------------------------------------- + * + * vacuumfull.c + * An utility to replace old VACUUM FULL + * + * XXX + * + * Copyright (c) 2007-2009, PostgreSQL Global Development Group + * + * IDENTIFICATION + * $PostgreSQL$ + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/heapam.h" +#include "access/xact.h" +#include "executor/executor.h" +#include "miscadmin.h" +#include "storage/bufmgr.h" +#include "storage/procarray.h" +#include "utils/acl.h" +#include "utils/tqual.h" +#include "utils/inval.h" +#include "utils/memutils.h" + +PG_MODULE_MAGIC; + +Datum vacuumfull(PG_FUNCTION_ARGS); + + +/*---------------------------------------------------------------------- + * ExecContext: + * + * As these variables always appear together, we put them into one struct + * and pull initialization and cleanup into separate routines. + * ExecContext is used by repair_frag() and move_xxx_tuple(). More + * accurately: It is *used* only in move_xxx_tuple(), but because this + * routine is called many times, we initialize the struct just once in + * repair_frag() and pass it on to move_xxx_tuple(). + */ +typedef struct ExecContextData +{ + ResultRelInfo *resultRelInfo; + EState *estate; + TupleTableSlot *slot; +} ExecContextData; + +typedef ExecContextData *ExecContext; + +static void +ExecContext_Init(ExecContext ec, Relation rel) +{ + TupleDesc tupdesc = RelationGetDescr(rel); + + /* + * We need a ResultRelInfo and an EState so we can use the regular + * executor's index-entry-making machinery. + */ + ec->estate = CreateExecutorState(); + + ec->resultRelInfo = makeNode(ResultRelInfo); + ec->resultRelInfo->ri_RangeTableIndex = 1; /* dummy */ + ec->resultRelInfo->ri_RelationDesc = rel; + ec->resultRelInfo->ri_TrigDesc = NULL; /* we don't fire triggers */ + + ExecOpenIndices(ec->resultRelInfo); + + ec->estate->es_result_relations = ec->resultRelInfo; + ec->estate->es_num_result_relations = 1; + ec->estate->es_result_relation_info = ec->resultRelInfo; + + /* Set up a tuple slot too */ + ec->slot = MakeSingleTupleTableSlot(tupdesc); +} + +static void +ExecContext_Finish(ExecContext ec) +{ + ExecDropSingleTupleTableSlot(ec->slot); + ExecCloseIndices(ec->resultRelInfo); + FreeExecutorState(ec->estate); +} + +/* + * End of ExecContext Implementation + *---------------------------------------------------------------------- + */ + + +/* + * vacuumfull + */ +PG_FUNCTION_INFO_V1(vacuumfull); + +Datum +vacuumfull(PG_FUNCTION_ARGS) +{ + Oid relid = PG_GETARG_OID(0); + Relation rel; + BlockNumber blk; + BlockNumber numblks; + BlockNumber minblk; + Buffer buf = InvalidBuffer; + TransactionId oldestxmin; + bool found_unmovable = false; + ExecContextData ec; + int tuplesmoved = 0; + BlockNumber pagescleared = 0; + + rel = heap_open(relid, AccessShareLock); + + /* + * Check permissions. + * + * We allow the user to vacuum a table if he is superuser, the table + * owner, or the database owner (but in the latter case, only if it's not + * a shared relation). pg_class_ownercheck includes the superuser case. + */ + if (!(pg_class_ownercheck(RelationGetRelid(rel), GetUserId()) || + (pg_database_ownercheck(MyDatabaseId, GetUserId()) && !rel->rd_rel->relisshared))) + { + aclcheck_error(ACLCHECK_NOT_OWNER, ACL_KIND_CLASS, + RelationGetRelationName(rel)); + + } + + /* Check that this relation has storage */ + if (rel->rd_rel->relkind == RELKIND_VIEW) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("\"%s\" is a view", + RelationGetRelationName(rel)))); + + /* + * Reject attempts to read non-local temporary relations; we would be + * likely to get wrong data since we have no visibility into the owning + * session's local buffers. + */ + if (RELATION_IS_OTHER_TEMP(rel)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot access temporary tables of other sessions"))); + + oldestxmin = GetOldestXmin(false, true); + + ExecContext_Init(&ec, rel); + + /* + * Read the relation backwards from the end, updating tuples. Hopefully + * the new tuple versions will be stored on lower-numbered pages. + * Stop when we reach the last page where we stored a new tuple version + * (minblk). + */ + numblks = RelationGetNumberOfBlocks(rel); + minblk = 0; + for (blk = numblks ? numblks - 1 : 0; + blk > minblk && !found_unmovable; + blk--) + { + Page page; + OffsetNumber off; + OffsetNumber maxoff; + + buf = ReleaseAndReadBuffer(buf, rel, blk); + + page = BufferGetPage(buf); + maxoff = PageGetMaxOffsetNumber(page); + + for (off = FirstOffsetNumber; off <= maxoff && !found_unmovable; off++) + { + ItemId itemid; + HeapTupleData oldtuple; + bool move = false; + + LockBuffer(buf, BUFFER_LOCK_SHARE); + + itemid = PageGetItemId(page, off); + if (ItemIdIsNormal(itemid)) + { + ItemPointerSet(&oldtuple.t_self, blk, off); + oldtuple.t_len = ItemIdGetLength(itemid); + oldtuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); + + switch(HeapTupleSatisfiesVacuum(oldtuple.t_data, oldestxmin, buf) != HEAPTUPLE_DEAD) + { + case HEAPTUPLE_LIVE: + /* let's move it to a lower-numbered page */ + move = true; + break; + case HEAPTUPLE_DEAD: + /* dead tuples will go away with a regular vacuum */ + break; + case HEAPTUPLE_RECENTLY_DEAD: + /* can't remove this yet, but it will eventually be removable + * by regular VACUUM + */ + break; + case HEAPTUPLE_INSERT_IN_PROGRESS: + /* we could try to update this once the inserter commits, + * but we would have to wait for i. */ + found_unmovable = true; + break; + case HEAPTUPLE_DELETE_IN_PROGRESS: + /* assuming the deleter commits, this will become removable + * by regular VACUUM. + */ + break; + } + } + + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + + if (move) + { + HeapTuple newtuple = heap_copytuple(&oldtuple); + ItemPointerData errctid; + TransactionId update_xmax; + HTSU_Result result; + + result = heap_update(rel, &oldtuple.t_self, newtuple, &errctid, &update_xmax, + GetCurrentCommandId(true), NULL, true, true); + + if (result == HeapTupleMayBeUpdated) + { + BlockNumber newblk; + + tuplesmoved++; + + CacheInvalidateHeapTuple(rel, &oldtuple); + + /* Create index entries for the moved tuple */ + if (ec.resultRelInfo->ri_NumIndices > 0) + { + ExecStoreTuple(newtuple, ec.slot, InvalidBuffer, false); + ExecInsertIndexTuples(ec.slot, &(newtuple->t_self), ec.estate, true); + ResetPerTupleExprContext(ec.estate); + } + + newblk = ItemPointerGetBlockNumber(&newtuple->t_self); + + elog(DEBUG4, "moved tuple %u/%u to %u/%u", blk, off, newblk, + ItemPointerGetOffsetNumber(&newtuple->t_self)); + + if (newblk >= blk) + { + /* + * Oops, the update moved the tuple higher in the + * relation, not lower as we wanted. Let's stop now + * before we do any more of that. + */ + elog(WARNING, "moved tuple %u/%u to a higher page, stopping", + blk, off); + found_unmovable = true; + } + + if(newblk > minblk) + minblk = newblk; + } + else + { + elog(DEBUG1, "failed to move tuple %u/%u to a lower page, stopping", + blk, off); + found_unmovable = true; + } + } + } + if (!found_unmovable) + pagescleared++; + } + + elog(NOTICE, "%d tuples moved, there is now %d pages at the end of relation with only dead tuples. Please wait for anyin-progress transactions to finsih and run VACUUM to reclaim the space", + tuplesmoved, pagescleared); + + if (BufferIsValid(buf)) + ReleaseBuffer(buf); + + ExecContext_Finish(&ec); + + heap_close(rel, AccessShareLock); + + PG_RETURN_VOID(); +} + diff --git a/contrib/vacuumfull/vacuumfull.sql.in b/contrib/vacuumfull/vacuumfull.sql.in new file mode 100644 index 0000000..6017cc9 --- /dev/null +++ b/contrib/vacuumfull/vacuumfull.sql.in @@ -0,0 +1,12 @@ +/* $PostgreSQL$ */ + +-- Adjust this setting to control where the objects get created. +SET search_path = public; + +-- +-- vacuumfull() +-- +CREATE OR REPLACE FUNCTION vacuumfull(regclass) +RETURNS void +AS '$libdir/vacuumfull', 'vacuumfull' +LANGUAGE C STRICT; diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index b0a911e..56c42cf 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -2349,12 +2349,17 @@ simple_heap_delete(Relation relation, ItemPointer tid) * cmax/cmin if successful) * crosscheck - if not InvalidSnapshot, also check old tuple against this * wait - true if should wait for any conflicting update to commit/abort + * forcefsm - use FSM even if there's space on the same page. * * Normal, successful return value is HeapTupleMayBeUpdated, which * actually means we *did* update it. Failure return codes are * HeapTupleSelfUpdated, HeapTupleUpdated, or HeapTupleBeingUpdated * (the last only possible if wait == false). * + * If 'forcefsm' is TRUE, and the new tuple couldn't be stored on a page + * with lower block number than the old page, the tuple is not updated and + * HeapTupleNoSpace is returned. + * * On success, the header fields of *newtup are updated to match the new * stored tuple; in particular, newtup->t_self is set to the TID where the * new tuple was inserted, and its HEAP_ONLY_TUPLE flag is set iff a HOT @@ -2369,7 +2374,7 @@ simple_heap_delete(Relation relation, ItemPointer tid) HTSU_Result heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, ItemPointer ctid, TransactionId *update_xmax, - CommandId cid, Snapshot crosscheck, bool wait) + CommandId cid, Snapshot crosscheck, bool wait, bool forcefsm) { HTSU_Result result; TransactionId xid = GetCurrentTransactionId(); @@ -2599,7 +2604,7 @@ l2: newtupsize = MAXALIGN(newtup->t_len); - if (need_toast || newtupsize > pagefree) + if (need_toast || newtupsize > pagefree || forcefsm) { /* Clear obsolete visibility flags ... */ oldtup.t_data->t_infomask &= ~(HEAP_XMAX_COMMITTED | @@ -2649,11 +2654,34 @@ l2: * while not holding the lock on the old page, and we must rely on it * to get the locks on both pages in the correct order. */ - if (newtupsize > pagefree) + if (newtupsize > pagefree || forcefsm) { /* Assume there's no chance to put heaptup on same page. */ newbuf = RelationGetBufferForTuple(relation, heaptup->t_len, buffer, 0, NULL); + if (BufferGetBlockNumber(newbuf) >= ItemPointerGetBlockNumber(otid)) + { + /* + * We couldn't put the new tuple version on any page before + * the old version. The purpose of 'forcefsm' is to update + * tuples so that they are stored on lower-numbered pages, + * to allow the heap to be truncated later, so there's no + * point in continuing if the new page is higher than the old + * one. + */ + + if (newbuf != buffer) + UnlockReleaseBuffer(newbuf); + + /* undo the xmax change. */ + HeapTupleHeaderSetXmax(oldtup.t_data, InvalidTransactionId); + + UnlockReleaseBuffer(buffer); + if (have_tuple_lock) + UnlockTuple(relation, &(oldtup.t_self), ExclusiveLock); + bms_free(hot_attrs); + return HeapTupleNoSpace; + } } else { @@ -2977,7 +3005,7 @@ simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup) result = heap_update(relation, otid, tup, &update_ctid, &update_xmax, GetCurrentCommandId(true), InvalidSnapshot, - true /* wait for commit */ ); + true /* wait for commit */, false); switch (result) { case HeapTupleSelfUpdated: diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index 55df5d0..7118dbb 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -2076,7 +2076,7 @@ lreplace:; &update_ctid, &update_xmax, estate->es_output_cid, estate->es_crosscheck_snapshot, - true /* wait for commit */ ); + true /* wait for commit */, false); switch (result) { case HeapTupleSelfUpdated: diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index f8395fe..f88a029 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -102,7 +102,7 @@ extern HTSU_Result heap_delete(Relation relation, ItemPointer tid, extern HTSU_Result heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, ItemPointer ctid, TransactionId *update_xmax, - CommandId cid, Snapshot crosscheck, bool wait); + CommandId cid, Snapshot crosscheck, bool wait, bool forcefsm); extern HTSU_Result heap_lock_tuple(Relation relation, HeapTuple tuple, Buffer *buffer, ItemPointer ctid, TransactionId *update_xmax, CommandId cid, diff --git a/src/include/utils/snapshot.h b/src/include/utils/snapshot.h index df79e3a..1aab53b 100644 --- a/src/include/utils/snapshot.h +++ b/src/include/utils/snapshot.h @@ -72,7 +72,8 @@ typedef enum HeapTupleInvisible, HeapTupleSelfUpdated, HeapTupleUpdated, - HeapTupleBeingUpdated + HeapTupleBeingUpdated, + HeapTupleNoSpace /* only used by heap_update(), with 'forcefsm' */ } HTSU_Result; #endif /* SNAPSHOT_H */
pgsql-hackers by date: