From 62fbe0ce5506e006b92dbfb07aee7414040d982f Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 2 Jan 2019 16:00:16 +0200 Subject: [PATCH 2/2] Delete pages during GiST VACUUM v18-heikki --- src/backend/access/gist/README | 14 +++ src/backend/access/gist/gist.c | 18 +++ src/backend/access/gist/gistutil.c | 3 +- src/backend/access/gist/gistvacuum.c | 152 ++++++++++++++++++++++++- src/backend/access/gist/gistxlog.c | 60 ++++++++++ src/backend/access/rmgrdesc/gistdesc.c | 3 + src/backend/nodes/bitmapset.c | 16 +++ src/include/access/gist.h | 3 + src/include/access/gist_private.h | 7 +- src/include/access/gistxlog.h | 10 +- src/include/nodes/bitmapset.h | 1 + src/test/regress/expected/gist.out | 4 +- src/test/regress/sql/gist.sql | 4 +- 13 files changed, 282 insertions(+), 13 deletions(-) diff --git a/src/backend/access/gist/README b/src/backend/access/gist/README index 02228662b81..c84359de310 100644 --- a/src/backend/access/gist/README +++ b/src/backend/access/gist/README @@ -413,6 +413,20 @@ emptied yet; tuples never move upwards in the tree. The final emptying loops through buffers at a given level until all buffers at that level have been emptied, and then moves down to the next level. +Bulk delete algorithm (VACUUM) +------------------------------ + +Function gistbulkdelete() is responsible for marking empty leaf pages as free +so that they can be used to allocate newly split pages. To find these pages, +the function scans the index in physical order. + +The physical scan reads the entire index from the first page to the last. It +maintains the information needed to collect the block numbers of internal +pages that need cleaning and the block numbers of empty leaf pages. + +After the scan, each internal page is visited under exclusive lock, and each +potentially free leaf page it points to is examined. gistbulkdelete() never +removes the last downlink from an internal page, so the tree stays balanced.
Authors: Teodor Sigaev diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c index d42e810c6b3..bbfd5a92b88 100644 --- a/src/backend/access/gist/gist.c +++ b/src/backend/access/gist/gist.c @@ -704,6 +704,11 @@ gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTInsertStack *item; OffsetNumber downlinkoffnum; + /* + * Currently internal pages are not deleted during vacuum, + * so we do not need to check whether this page is deleted. + */ + downlinkoffnum = gistchoose(state.r, stack->page, itup, giststate); iid = PageGetItemId(stack->page, downlinkoffnum); idxtuple = (IndexTuple) PageGetItem(stack->page, iid); @@ -838,6 +843,19 @@ gistdoinsert(Relation r, IndexTuple itup, Size freespace, } } + /* + * Leaf pages can be left deleted but still referenced + * until their space is reused. The downlink to this page may already have + * been removed from the internal page, but this scan can still possess it. + */ + if(GistPageIsDeleted(stack->page)) + { + UnlockReleaseBuffer(stack->buffer); + xlocked = false; + state.stack = stack = stack->parent; + continue; + } + /* now state.stack->(page, buffer and blkno) points to leaf page */ gistinserttuple(&state, stack, giststate, itup, diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c index 70627e5df66..adb316c6afa 100644 --- a/src/backend/access/gist/gistutil.c +++ b/src/backend/access/gist/gistutil.c @@ -23,6 +23,7 @@ #include "storage/lmgr.h" #include "utils/float.h" #include "utils/syscache.h" +#include "utils/snapmgr.h" #include "utils/lsyscache.h" @@ -807,7 +808,7 @@ gistNewBuffer(Relation r) gistcheckpage(r, buffer); - if (GistPageIsDeleted(page)) + if (GistPageIsDeleted(page) && TransactionIdPrecedes(GistPageGetDeleteXid(page), RecentGlobalDataXmin)) return buffer; /* OK to use */ LockBuffer(buffer, GIST_UNLOCK); diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c index 4fb32bf76bf..bac6b8c77af 100644 --- a/src/backend/access/gist/gistvacuum.c +++
b/src/backend/access/gist/gistvacuum.c @@ -16,8 +16,10 @@ #include "access/genam.h" #include "access/gist_private.h" +#include "access/transam.h" #include "commands/vacuum.h" #include "miscadmin.h" +#include "nodes/bitmapset.h" #include "storage/indexfsm.h" #include "storage/lmgr.h" @@ -30,6 +32,10 @@ typedef struct void *callback_state; GistNSN startNSN; BlockNumber totFreePages; /* true total # of free pages */ + BlockNumber emptyPages; + + Bitmapset *internalPagesMap; + Bitmapset *emptyLeafPagesMap; } GistVacState; static void gistvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, @@ -91,8 +97,6 @@ gistvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) * check invalid tuples left after upgrade. * The set of target tuples is specified via a callback routine that tells * whether any given heap tuple (identified by ItemPointer) is being deleted. - * - * Result: a palloc'd struct containing statistical info for VACUUM displays. */ static void gistvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, @@ -122,6 +126,9 @@ gistvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, else vstate.startNSN = gistGetFakeLSN(rel); vstate.totFreePages = 0; + vstate.emptyPages = 0; + vstate.internalPagesMap = NULL; + vstate.emptyLeafPagesMap = NULL; /* * Need lock unless it's local to this backend. 
@@ -166,6 +173,12 @@ gistvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, /* Quit if we've scanned the whole relation */ if (blkno >= num_pages) break; + + if (!vstate.internalPagesMap) + vstate.internalPagesMap = bms_make_empty(num_pages); + if (!vstate.emptyLeafPagesMap) + vstate.emptyLeafPagesMap = bms_make_empty(num_pages); + /* Iterate over pages, then loop back to recheck length */ for (; blkno < num_pages; blkno++) gistvacuumpage(&vstate, blkno, blkno); @@ -189,6 +202,126 @@ gistvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, /* update statistics */ stats->num_pages = num_pages; stats->pages_free = vstate.totFreePages; + + /* rescan inner pages that had empty child pages */ + if (vstate.emptyPages > 0) + { + int x; + + x = -1; + while (vstate.emptyPages > 0 && + (x = bms_next_member(vstate.internalPagesMap, x)) >= 0) + { + Buffer buffer; + Page page; + OffsetNumber off, + maxoff; + IndexTuple idxtuple; + ItemId iid; + OffsetNumber todelete[MaxOffsetNumber]; + Buffer buftodelete[MaxOffsetNumber]; + int ntodelete = 0; + + /* FIXME: 'x' is signed, so this will not work with indexes larger than 2^31 blocks */ + blkno = (BlockNumber) x; + + buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, + info->strategy); + + LockBuffer(buffer, GIST_EXCLUSIVE); + page = (Page) BufferGetPage(buffer); + if (PageIsNew(page) || GistPageIsDeleted(page) || GistPageIsLeaf(page)) + { + UnlockReleaseBuffer(buffer); + continue; + } + + maxoff = PageGetMaxOffsetNumber(page); + /* Check that leafs are still empty and decide what to delete */ + for (off = FirstOffsetNumber; off <= maxoff; off = OffsetNumberNext(off)) + { + Buffer leafBuffer; + Page leafPage; + BlockNumber leafBlockNo; + + iid = PageGetItemId(page, off); + idxtuple = (IndexTuple) PageGetItem(page, iid); + /* if this page was not empty in previous scan - we do not consider it */ + leafBlockNo = ItemPointerGetBlockNumber(&(idxtuple->t_tid)); + if (!bms_is_member(leafBlockNo, 
vstate.emptyLeafPagesMap)) + continue; + + leafBuffer = ReadBufferExtended(rel, MAIN_FORKNUM, leafBlockNo, + RBM_NORMAL, info->strategy); + LockBuffer(leafBuffer, GIST_EXCLUSIVE); + gistcheckpage(rel, leafBuffer); + leafPage = (Page) BufferGetPage(leafBuffer); + if (!GistPageIsLeaf(leafPage)) + { + UnlockReleaseBuffer(leafBuffer); + continue; + } + + if (PageGetMaxOffsetNumber(leafPage) == InvalidOffsetNumber /* Leaf has no tuples left */ + && !(GistFollowRight(leafPage) || GistPageGetNSN(page) < GistPageGetNSN(leafPage)) /* No follow-right */ + && ntodelete < maxoff-1) /* We must keep at least one downlink per internal page */ + { + buftodelete[ntodelete] = leafBuffer; + todelete[ntodelete++] = off; + } + else + UnlockReleaseBuffer(leafBuffer); + } + + if (ntodelete) + { + /* + * Like in _bt_unlink_halfdead_page we need an upper bound on xid + * that could hold downlinks to this page. We use + * ReadNewTransactionId() instead of GetCurrentTransactionId + * since we are in a VACUUM. + */ + TransactionId txid = ReadNewTransactionId(); + int i; + + START_CRIT_SECTION(); + + /* Mark leaf pages as deleted and drop their downlinks from the internal page */ + for (i = 0; i < ntodelete; i++) + { + Page leafPage = (Page) BufferGetPage(buftodelete[i]); + XLogRecPtr recptr; + + GistPageSetDeleteXid(leafPage,txid); + + GistPageSetDeleted(leafPage); + MarkBufferDirty(buftodelete[i]); + stats->pages_deleted++; + vstate.emptyPages--; + + MarkBufferDirty(buffer); + /* The i earlier deletions shifted this offset down, hence the "- i" */ + PageIndexTupleDelete(page, todelete[i] - i); + + if (RelationNeedsWAL(rel)) + recptr = gistXLogSetDeleted(rel->rd_node, buftodelete[i], + txid, buffer, todelete[i] - i); + else + recptr = gistGetFakeLSN(rel); + PageSetLSN(page, recptr); + PageSetLSN(leafPage, recptr); + + UnlockReleaseBuffer(buftodelete[i]); + } + END_CRIT_SECTION(); + } + + UnlockReleaseBuffer(buffer); + } + } + + bms_free(vstate.emptyLeafPagesMap); + bms_free(vstate.internalPagesMap); } /* @@
-242,6 +375,7 @@ restart: { OffsetNumber todelete[MaxOffsetNumber]; int ntodelete = 0; + int nremain; GISTPageOpaque opaque = GistPageGetOpaque(page); OffsetNumber maxoff = PageGetMaxOffsetNumber(page); @@ -309,8 +443,18 @@ restart: maxoff = PageGetMaxOffsetNumber(page); } - stats->num_index_tuples += maxoff - FirstOffsetNumber + 1; - + nremain = maxoff - FirstOffsetNumber + 1; + if (nremain == 0) + { + vstate->emptyLeafPagesMap = bms_add_member(vstate->emptyLeafPagesMap, blkno); + vstate->emptyPages++; + } + else + stats->num_index_tuples += nremain; + } + else + { + vstate->internalPagesMap = bms_add_member(vstate->internalPagesMap, blkno); } UnlockReleaseBuffer(buffer); diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c index 01e025d5fdb..bb0fa473f5e 100644 --- a/src/backend/access/gist/gistxlog.c +++ b/src/backend/access/gist/gistxlog.c @@ -64,6 +64,39 @@ gistRedoClearFollowRight(XLogReaderState *record, uint8 block_id) UnlockReleaseBuffer(buffer); } +static void +gistRedoPageSetDeleted(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + gistxlogPageDelete *xldata = (gistxlogPageDelete *) XLogRecGetData(record); + Buffer buffer; + Page page; + + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) + { + page = (Page) BufferGetPage(buffer); + + GistPageSetDeleteXid(page, xldata->deleteXid); + GistPageSetDeleted(page); + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); + + if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO) + { + page = (Page) BufferGetPage(buffer); + + PageIndexTupleDelete(page, xldata->downlinkOffset); + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); +} /* * redo any page update (except page split) */ @@ -116,6 +149,7 @@ gistRedoPageUpdateRecord(XLogReaderState *record) data += sizeof(OffsetNumber) * xldata->ntodelete; 
PageIndexMultiDelete(page, todelete, xldata->ntodelete); + if (GistPageIsLeaf(page)) GistMarkTuplesDeleted(page); } @@ -535,6 +569,9 @@ gist_redo(XLogReaderState *record) case XLOG_GIST_CREATE_INDEX: gistRedoCreateIndex(record); break; + case XLOG_GIST_PAGE_DELETE: + gistRedoPageSetDeleted(record); + break; default: elog(PANIC, "gist_redo: unknown op code %u", info); } @@ -653,6 +690,29 @@ gistXLogSplit(bool page_is_leaf, return recptr; } +/* + * Write XLOG record describing a page delete (block 0). This also includes + * removal of the downlink from the internal page (block 1). + */ +XLogRecPtr +gistXLogSetDeleted(RelFileNode node, Buffer buffer, TransactionId xid, + Buffer internalPageBuffer, OffsetNumber internalPageOffset) { + gistxlogPageDelete xlrec; + XLogRecPtr recptr; + + xlrec.deleteXid = xid; + xlrec.downlinkOffset = internalPageOffset; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(gistxlogPageDelete)); + + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); + XLogRegisterBuffer(1, internalPageBuffer, REGBUF_STANDARD); + /* insert the record covering both registered buffers */ + recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_DELETE); + return recptr; +} + /* * Write XLOG record describing a page update. The update can include any * number of deletions and/or insertions of tuples on a single index page.
diff --git a/src/backend/access/rmgrdesc/gistdesc.c b/src/backend/access/rmgrdesc/gistdesc.c index b79ed1dfdc8..f65335ba23a 100644 --- a/src/backend/access/rmgrdesc/gistdesc.c +++ b/src/backend/access/rmgrdesc/gistdesc.c @@ -76,6 +76,9 @@ gist_identify(uint8 info) case XLOG_GIST_CREATE_INDEX: id = "CREATE_INDEX"; break; + case XLOG_GIST_PAGE_DELETE: + id = "PAGE_DELETE"; + break; } return id; diff --git a/src/backend/nodes/bitmapset.c b/src/backend/nodes/bitmapset.c index 8ce253c88df..29cfcd78984 100644 --- a/src/backend/nodes/bitmapset.c +++ b/src/backend/nodes/bitmapset.c @@ -258,6 +258,23 @@ bms_make_singleton(int x) return result; } +/* + * bms_make_empty - preallocate an empty bitmapset for members 0 .. size-1 + */ +Bitmapset * +bms_make_empty(int size) +{ + Bitmapset *result; + int wordnum; + + if (size < 0) + elog(ERROR, "negative bitmapset size not allowed"); + wordnum = WORDNUM(size - 1); + result = (Bitmapset *) palloc0(BITMAPSET_SIZE(wordnum + 1)); + result->nwords = wordnum + 1; /* record the preallocated words so bms_add_member need not repalloc */ + return result; +} + /* * bms_free - free a bitmapset * diff --git a/src/include/access/gist.h b/src/include/access/gist.h index 827566dc6e7..0dd2bf47c8c 100644 --- a/src/include/access/gist.h +++ b/src/include/access/gist.h @@ -151,6 +151,9 @@ typedef struct GISTENTRY #define GistPageGetNSN(page) ( PageXLogRecPtrGet(GistPageGetOpaque(page)->nsn)) #define GistPageSetNSN(page, val) ( PageXLogRecPtrSet(GistPageGetOpaque(page)->nsn, val)) +#define GistPageGetDeleteXid(page) ( ((PageHeader) (page))->pd_prune_xid ) +#define GistPageSetDeleteXid(page, val) ( ((PageHeader) (page))->pd_prune_xid = (val)) + /* * Vector of GISTENTRY structs; user-defined methods union and picksplit * take it as one of their arguments diff --git a/src/include/access/gist_private.h b/src/include/access/gist_private.h index a73716d6eaa..5d02800dac6 100644 --- a/src/include/access/gist_private.h +++ b/src/include/access/gist_private.h @@ -412,12 +412,17 @@ extern bool gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate, extern SplitedPageLayout
*gistSplit(Relation r, Page page, IndexTuple *itup, int len, GISTSTATE *giststate); +/* gistxlog.c */ +extern XLogRecPtr gistXLogSetDeleted(RelFileNode node, Buffer buffer, + TransactionId xid, Buffer internalPageBuffer, + OffsetNumber internalPageOffset); + extern XLogRecPtr gistXLogUpdate(Buffer buffer, OffsetNumber *todelete, int ntodelete, IndexTuple *itup, int ntup, Buffer leftchild); -XLogRecPtr gistXLogDelete(Buffer buffer, OffsetNumber *todelete, +extern XLogRecPtr gistXLogDelete(Buffer buffer, OffsetNumber *todelete, int ntodelete, RelFileNode hnode); extern XLogRecPtr gistXLogSplit(bool page_is_leaf, diff --git a/src/include/access/gistxlog.h b/src/include/access/gistxlog.h index b67c7100500..3c71d0261a1 100644 --- a/src/include/access/gistxlog.h +++ b/src/include/access/gistxlog.h @@ -17,13 +17,15 @@ #include "access/xlogreader.h" #include "lib/stringinfo.h" +/* XLog stuff */ + #define XLOG_GIST_PAGE_UPDATE 0x00 #define XLOG_GIST_DELETE 0x10 /* delete leaf index tuples for a page */ /* #define XLOG_GIST_NEW_ROOT 0x20 */ /* not used anymore */ #define XLOG_GIST_PAGE_SPLIT 0x30 /* #define XLOG_GIST_INSERT_COMPLETE 0x40 */ /* not used anymore */ #define XLOG_GIST_CREATE_INDEX 0x50 - /* #define XLOG_GIST_PAGE_DELETE 0x60 */ /* not used anymore */ +#define XLOG_GIST_PAGE_DELETE 0x60 /* * Backup Blk 0: updated page. 
@@ -76,6 +78,12 @@ typedef struct gistxlogPageSplit */ } gistxlogPageSplit; +typedef struct gistxlogPageDelete +{ + TransactionId deleteXid; /* last Xid which could see page in scan */ + OffsetNumber downlinkOffset; /* Offset of the downlink referencing this page */ +} gistxlogPageDelete; + extern void gist_redo(XLogReaderState *record); extern void gist_desc(StringInfo buf, XLogReaderState *record); extern const char *gist_identify(uint8 info); diff --git a/src/include/nodes/bitmapset.h b/src/include/nodes/bitmapset.h index 433df8a46d0..55435f9ae64 100644 --- a/src/include/nodes/bitmapset.h +++ b/src/include/nodes/bitmapset.h @@ -79,6 +79,7 @@ extern Bitmapset *bms_copy(const Bitmapset *a); extern bool bms_equal(const Bitmapset *a, const Bitmapset *b); extern int bms_compare(const Bitmapset *a, const Bitmapset *b); extern Bitmapset *bms_make_singleton(int x); +extern Bitmapset *bms_make_empty(int size); extern void bms_free(Bitmapset *a); extern Bitmapset *bms_union(const Bitmapset *a, const Bitmapset *b); diff --git a/src/test/regress/expected/gist.out b/src/test/regress/expected/gist.out index f5a2993aaf2..5b92f08c747 100644 --- a/src/test/regress/expected/gist.out +++ b/src/test/regress/expected/gist.out @@ -27,9 +27,7 @@ insert into gist_point_tbl (id, p) select g+100000, point(g*10+1, g*10+1) from generate_series(1, 10000) g; -- To test vacuum, delete some entries from all over the index. delete from gist_point_tbl where id % 2 = 1; --- And also delete some concentration of values. (GiST doesn't currently --- attempt to delete pages even when they become empty, but if it did, this --- would exercise it) +-- And also delete some concentration of values. 
delete from gist_point_tbl where id < 10000; vacuum analyze gist_point_tbl; -- rebuild the index with a different fillfactor diff --git a/src/test/regress/sql/gist.sql b/src/test/regress/sql/gist.sql index bae722fe13c..e66396e851b 100644 --- a/src/test/regress/sql/gist.sql +++ b/src/test/regress/sql/gist.sql @@ -28,9 +28,7 @@ select g+100000, point(g*10+1, g*10+1) from generate_series(1, 10000) g; -- To test vacuum, delete some entries from all over the index. delete from gist_point_tbl where id % 2 = 1; --- And also delete some concentration of values. (GiST doesn't currently --- attempt to delete pages even when they become empty, but if it did, this --- would exercise it) +-- And also delete some concentration of values. delete from gist_point_tbl where id < 10000; vacuum analyze gist_point_tbl; -- 2.19.2