From 4a2ca3ec30b1743a669b21da80fbf589f6a23be8 Mon Sep 17 00:00:00 2001 From: Andrey Borodin Date: Thu, 19 Jul 2018 14:28:25 +0400 Subject: [PATCH 2/2] Delete pages during GiST VACUUM v11 --- src/backend/access/gist/README | 35 ++++++++ src/backend/access/gist/gist.c | 18 ++++ src/backend/access/gist/gistbuild.c | 5 -- src/backend/access/gist/gistutil.c | 3 +- src/backend/access/gist/gistvacuum.c | 154 +++++++++++++++++++++++++++++++++ src/backend/access/gist/gistxlog.c | 60 +++++++++++++ src/backend/access/rmgrdesc/gistdesc.c | 3 + src/include/access/gist.h | 3 + src/include/access/gist_private.h | 24 +++-- src/include/access/gistxlog.h | 17 +++- src/test/regress/expected/gist.out | 4 +- src/test/regress/sql/gist.sql | 4 +- 12 files changed, 310 insertions(+), 20 deletions(-) diff --git a/src/backend/access/gist/README b/src/backend/access/gist/README index 02228662b8..9548872be8 100644 --- a/src/backend/access/gist/README +++ b/src/backend/access/gist/README @@ -413,6 +413,41 @@ emptied yet; tuples never move upwards in the tree. The final emptying loops through buffers at a given level until all buffers at that level have been emptied, and then moves down to the next level. +Bulk delete algorithm (VACUUM) +------------------------------ + +Function gistbulkdelete() is responsible for marking empty leaf pages as free +so that they can be used to allocate newly split pages. To find these pages the +function can choose between two strategies: logical scan or physical scan. + +Physical scan reads the entire index from the first page to the last. This scan +maintains a graph structure in a palloc'ed array to collect block numbers of +internal pages that need cleansing from references to empty leafs. Also, the +array contains offsets on the internal page to potentially free leaf pages. This +scan method is chosen when maintenance work memory is sufficient to hold the +necessary graph structure. 
+ +The logical scan is chosen when there is not enough maintenance memory to +execute the physical scan. Logical scan traverses the GiST index in DFS order, +looking up into incomplete split branches. The logical scan can be slower on +hard disk drives. + +The result of both scans is the same: the stack of block numbers of internal +pages with the list of offsets potentially referencing empty leaf pages. After +the scan, for each internal page under exclusive lock, each potentially free +leaf page is examined. gistbulkdelete() never deletes the last remaining reference +from an internal page to preserve balanced tree properties. + +The physical scan can return empty leaf page offsets unordered. Thus, before +executing PageIndexMultiDelete, offsets (already locked and checked) are sorted. +This step is not necessary for the logical scan. + +Both scans hold only one lock at a time. Physical scan grabs exclusive lock +instantly, while logical scan takes shared lock and then swaps it to exclusive. +This is done because the amount of work on an internal page done by physical scan +is lower and the number of internal pages is relatively low compared to the +number of leaf pages. 
+ Authors: Teodor Sigaev diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c index 8a42effdf7..3a6b5c7ed3 100644 --- a/src/backend/access/gist/gist.c +++ b/src/backend/access/gist/gist.c @@ -700,6 +700,11 @@ gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate) GISTInsertStack *item; OffsetNumber downlinkoffnum; + /* + * Currently internal pages are not deleted during vacuum, + * so we do not need to check if the page is deleted here. + */ + downlinkoffnum = gistchoose(state.r, stack->page, itup, giststate); iid = PageGetItemId(stack->page, downlinkoffnum); idxtuple = (IndexTuple) PageGetItem(stack->page, iid); @@ -834,6 +839,19 @@ gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate) } } + /* + * A leaf page can be marked deleted but still be referenced + * until its space is reused. The downlink to it may already be + * removed from the internal page, but this descent can still possess it. + */ + if(GistPageIsDeleted(stack->page)) + { + UnlockReleaseBuffer(stack->buffer); + xlocked = false; + state.stack = stack = stack->parent; + continue; + } + /* now state.stack->(page, buffer and blkno) points to leaf page */ gistinserttuple(&state, stack, giststate, itup, diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c index 434f15f014..f26f139a9e 100644 --- a/src/backend/access/gist/gistbuild.c +++ b/src/backend/access/gist/gistbuild.c @@ -1126,11 +1126,6 @@ gistGetMaxLevel(Relation index) * but will be added there the first time we visit them. 
*/ -typedef struct -{ - BlockNumber childblkno; /* hash key */ - BlockNumber parentblkno; -} ParentMapEntry; static void gistInitParentMap(GISTBuildState *buildstate) diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c index 12804c321c..41978bb5e5 100644 --- a/src/backend/access/gist/gistutil.c +++ b/src/backend/access/gist/gistutil.c @@ -23,6 +23,7 @@ #include "storage/lmgr.h" #include "utils/builtins.h" #include "utils/syscache.h" +#include "utils/snapmgr.h" /* @@ -806,7 +807,7 @@ gistNewBuffer(Relation r) gistcheckpage(r, buffer); - if (GistPageIsDeleted(page)) + if (GistPageIsDeleted(page) && TransactionIdPrecedes(GistPageGetDeleteXid(page), RecentGlobalDataXmin)) return buffer; /* OK to use */ LockBuffer(buffer, GIST_UNLOCK); diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c index ff86f4491f..26ac4dd30e 100644 --- a/src/backend/access/gist/gistvacuum.c +++ b/src/backend/access/gist/gistvacuum.c @@ -16,6 +16,7 @@ #include "access/genam.h" #include "access/gist_private.h" +#include "access/transam.h" #include "commands/vacuum.h" #include "miscadmin.h" #include "storage/indexfsm.h" @@ -104,6 +105,27 @@ typedef struct GistBDItem struct GistBDItem *next; } GistBDItem; +static void gistmapset(char *map, BlockNumber blkno) +{ + map[blkno / 8] |= 1 << (blkno % 8); +} +static bool gistmapget(char *map, BlockNumber blkno) +{ + return (map[blkno / 8] & 1 << (blkno % 8)) != 0; +} + +/* + * qsort comparator that orders OffsetNumbers ascending. + * When employing physical scan, rescan offsets are not ordered. + */ +static int +compare_offsetnumber(const void *x, const void *y) +{ + OffsetNumber a = *((OffsetNumber *)x); + OffsetNumber b = *((OffsetNumber *)y); + return a - b; +} + /* * Bulk deletion of all index entries pointing to a set of heap tuples and * check invalid tuples left after upgrade. 
@@ -121,6 +143,8 @@ gistbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, bool needLock; BlockNumber blkno; GistNSN startNSN = GetInsertRecPtr(); + void *internals; + void *emptyLeafs; /* first time through? */ if (stats == NULL) @@ -140,6 +164,9 @@ gistbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, npages = RelationGetNumberOfBlocks(rel); if (needLock) UnlockRelationForExtension(rel, ExclusiveLock); + + internals = palloc0(npages / 8 + 1); + emptyLeafs = palloc0(npages / 8 + 1); for (blkno = GIST_ROOT_BLKNO; blkno < npages; blkno++) { @@ -248,7 +275,14 @@ gistbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, END_CRIT_SECTION(); } + /* The page is completely empty */ + if (ntodelete == maxoff) + { + gistmapset(emptyLeafs, nextBlock); + } } + else + gistmapset(internals, nextBlock); /* We should not unlock buffer if we are going to jump left */ if (needScan) @@ -269,5 +303,125 @@ gistbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, } } + for (blkno = GIST_ROOT_BLKNO; blkno < npages; blkno++) + { + Buffer buffer; + Page page; + OffsetNumber i, + maxoff; + IndexTuple idxtuple; + ItemId iid; + OffsetNumber todelete[MaxOffsetNumber]; + Buffer buftodelete[MaxOffsetNumber]; + int ntodelete = 0; + + if (!gistmapget(internals, blkno)) + continue; /* second scan is for internal pages */ + + + buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, + info->strategy); + + LockBuffer(buffer, GIST_EXCLUSIVE); + page = (Page) BufferGetPage(buffer); + /* Currently block of internal page cannot become leaf */ + Assert(!GistPageIsLeaf(page)); + + if (PageIsNew(page) || GistPageIsDeleted(page)) + { + UnlockReleaseBuffer(buffer); + /* TODO: Should not we record free page here? 
*/ + continue; + } + + maxoff = PageGetMaxOffsetNumber(page); + /* Check that leafs are still empty and decide what to delete */ + for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) + { + Buffer leafBuffer; + Page leafPage; + BlockNumber leafBlkno; + + iid = PageGetItemId(page, i); + idxtuple = (IndexTuple) PageGetItem(page, iid); + leafBlkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid)); + /* emptyLeafs is keyed by block number and covers only the npages blocks seen by the first scan */ + if (leafBlkno >= npages || !gistmapget(emptyLeafs, leafBlkno)) + continue; + + leafBuffer = ReadBufferExtended(rel, MAIN_FORKNUM, leafBlkno, + RBM_NORMAL, info->strategy); + LockBuffer(leafBuffer, GIST_EXCLUSIVE); + gistcheckpage(rel, leafBuffer); + leafPage = (Page) BufferGetPage(leafBuffer); + Assert(GistPageIsLeaf(leafPage)); + + if (PageGetMaxOffsetNumber(leafPage) == InvalidOffsetNumber /* Nothing left to split */ + && !(GistFollowRight(leafPage) || GistPageGetNSN(page) < GistPageGetNSN(leafPage)) /* No follow-right */ + && ntodelete < maxoff-1) /* We must keep at least one leaf page per each */ + { + buftodelete[ntodelete] = leafBuffer; + todelete[ntodelete++] = i; + } + else + UnlockReleaseBuffer(leafBuffer); + } + + + if (ntodelete) + { + /* + * Like in _bt_unlink_halfdead_page we need an upper bound on xid + * that could hold downlinks to this page. We use + * ReadNewTransactionId() instead of GetCurrentTransactionId + * since we are in a VACUUM. 
+ */ + TransactionId txid = ReadNewTransactionId(); + + /* todelete[] is already ascending and buftodelete[] is paired with it by index */ + qsort(todelete, ntodelete, sizeof(OffsetNumber), compare_offsetnumber); + + START_CRIT_SECTION(); + + /* Mark pages as deleted dropping references from internal pages */ + for (i = 0; i < ntodelete; i++) + { + Page leafPage = (Page)BufferGetPage(buftodelete[i]); + + GistPageSetDeleteXid(leafPage,txid); + + GistPageSetDeleted(leafPage); + MarkBufferDirty(buftodelete[i]); + stats->pages_deleted++; + + MarkBufferDirty(buffer); + /* Each earlier delete shifted the remaining offsets down by one, hence "- i" */ + PageIndexTupleDelete(page, todelete[i] - i); + + if (RelationNeedsWAL(rel)) + { + XLogRecPtr recptr = + gistXLogSetDeleted(rel->rd_node, buftodelete[i], + txid, buffer, todelete[i] - i); + PageSetLSN(page, recptr); + PageSetLSN(leafPage, recptr); + } + else + { + PageSetLSN(page, gistGetFakeLSN(rel)); + PageSetLSN(leafPage, gistGetFakeLSN(rel)); + } + + UnlockReleaseBuffer(buftodelete[i]); + } + END_CRIT_SECTION(); + } + + UnlockReleaseBuffer(buffer); + } + + pfree(internals); + pfree(emptyLeafs); + return stats; } diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c index 1e09126978..80108f6bfb 100644 --- a/src/backend/access/gist/gistxlog.c +++ b/src/backend/access/gist/gistxlog.c @@ -60,6 +60,39 @@ gistRedoClearFollowRight(XLogReaderState *record, uint8 block_id) UnlockReleaseBuffer(buffer); } +static void +gistRedoPageSetDeleted(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + gistxlogPageDelete *xldata = (gistxlogPageDelete *) XLogRecGetData(record); + Buffer buffer; + Page page; + + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) + { + page = (Page) BufferGetPage(buffer); + + GistPageSetDeleteXid(page, xldata->deleteXid); + GistPageSetDeleted(page); + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); + + if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO) + { + page = (Page) BufferGetPage(buffer); + + PageIndexTupleDelete(page, 
xldata->downlinkOffset); + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); +} /* * redo any page update (except page split) */ @@ -112,6 +145,7 @@ gistRedoPageUpdateRecord(XLogReaderState *record) data += sizeof(OffsetNumber) * xldata->ntodelete; PageIndexMultiDelete(page, todelete, xldata->ntodelete); + if (GistPageIsLeaf(page)) GistMarkTuplesDeleted(page); } @@ -324,6 +358,9 @@ gist_redo(XLogReaderState *record) case XLOG_GIST_CREATE_INDEX: gistRedoCreateIndex(record); break; + case XLOG_GIST_PAGE_DELETE: + gistRedoPageSetDeleted(record); + break; default: elog(PANIC, "gist_redo: unknown op code %u", info); } @@ -442,6 +479,29 @@ gistXLogSplit(bool page_is_leaf, return recptr; } +/* + * Write XLOG record describing a page delete. This also includes removal of + * downlink from internal page. + */ +XLogRecPtr +gistXLogSetDeleted(RelFileNode node, Buffer buffer, TransactionId xid, + Buffer internalPageBuffer, OffsetNumber internalPageOffset) { + gistxlogPageDelete xlrec; + XLogRecPtr recptr; + + xlrec.deleteXid = xid; + xlrec.downlinkOffset = internalPageOffset; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(gistxlogPageDelete)); + + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); + XLogRegisterBuffer(1, internalPageBuffer, REGBUF_STANDARD); + /* a single record covers both the leaf and the internal page */ + recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_DELETE); + return recptr; +} + /* * Write XLOG record describing a page update. The update can include any * number of deletions and/or insertions of tuples on a single index page. 
diff --git a/src/backend/access/rmgrdesc/gistdesc.c b/src/backend/access/rmgrdesc/gistdesc.c index e5e925e0c5..f494db63f6 100644 --- a/src/backend/access/rmgrdesc/gistdesc.c +++ b/src/backend/access/rmgrdesc/gistdesc.c @@ -65,6 +65,9 @@ gist_identify(uint8 info) case XLOG_GIST_CREATE_INDEX: id = "CREATE_INDEX"; break; + case XLOG_GIST_PAGE_DELETE: + id = "PAGE_DELETE"; + break; } return id; diff --git a/src/include/access/gist.h b/src/include/access/gist.h index 827566dc6e..0dd2bf47c8 100644 --- a/src/include/access/gist.h +++ b/src/include/access/gist.h @@ -151,6 +151,9 @@ typedef struct GISTENTRY #define GistPageGetNSN(page) ( PageXLogRecPtrGet(GistPageGetOpaque(page)->nsn)) #define GistPageSetNSN(page, val) ( PageXLogRecPtrSet(GistPageGetOpaque(page)->nsn, val)) +#define GistPageGetDeleteXid(page) ( ((PageHeader) (page))->pd_prune_xid ) +#define GistPageSetDeleteXid(page, val) ( ((PageHeader) (page))->pd_prune_xid = val) + /* * Vector of GISTENTRY structs; user-defined methods union and picksplit * take it as one of their arguments diff --git a/src/include/access/gist_private.h b/src/include/access/gist_private.h index 36ed7244ba..1f82695b1d 100644 --- a/src/include/access/gist_private.h +++ b/src/include/access/gist_private.h @@ -16,6 +16,7 @@ #include "access/amapi.h" #include "access/gist.h" +#include "access/gistxlog.h" #include "access/itup.h" #include "fmgr.h" #include "lib/pairingheap.h" @@ -51,6 +52,11 @@ typedef struct char tupledata[FLEXIBLE_ARRAY_MEMBER]; } GISTNodeBufferPage; +typedef struct +{ + BlockNumber childblkno; /* hash key */ + BlockNumber parentblkno; +} ParentMapEntry; #define BUFFER_PAGE_DATA_OFFSET MAXALIGN(offsetof(GISTNodeBufferPage, tupledata)) /* Returns free space in node buffer page */ #define PAGE_FREE_SPACE(nbp) (nbp->freespace) @@ -176,13 +182,6 @@ typedef struct GISTScanOpaqueData typedef GISTScanOpaqueData *GISTScanOpaque; -/* despite the name, gistxlogPage is not part of any xlog record */ -typedef struct gistxlogPage -{ - 
BlockNumber blkno; - int num; /* number of index tuples following */ -} gistxlogPage; - /* SplitedPageLayout - gistSplit function result */ typedef struct SplitedPageLayout { @@ -409,6 +408,17 @@ extern bool gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate, extern SplitedPageLayout *gistSplit(Relation r, Page page, IndexTuple *itup, int len, GISTSTATE *giststate); +/* gistxlog.c */ +extern void gist_redo(XLogReaderState *record); +extern void gist_desc(StringInfo buf, XLogReaderState *record); +extern const char *gist_identify(uint8 info); +extern void gist_xlog_startup(void); +extern void gist_xlog_cleanup(void); + +extern XLogRecPtr gistXLogSetDeleted(RelFileNode node, Buffer buffer, + TransactionId xid, Buffer internalPageBuffer, + OffsetNumber internalPageOffset); + extern XLogRecPtr gistXLogUpdate(Buffer buffer, OffsetNumber *todelete, int ntodelete, IndexTuple *itup, int ntup, diff --git a/src/include/access/gistxlog.h b/src/include/access/gistxlog.h index 1a2b9496d0..ad0b742dbb 100644 --- a/src/include/access/gistxlog.h +++ b/src/include/access/gistxlog.h @@ -17,12 +17,14 @@ #include "access/xlogreader.h" #include "lib/stringinfo.h" +/* XLOG record type codes for GiST */ + #define XLOG_GIST_PAGE_UPDATE 0x00 /* #define XLOG_GIST_NEW_ROOT 0x20 */ /* not used anymore */ #define XLOG_GIST_PAGE_SPLIT 0x30 /* #define XLOG_GIST_INSERT_COMPLETE 0x40 */ /* not used anymore */ #define XLOG_GIST_CREATE_INDEX 0x50 - /* #define XLOG_GIST_PAGE_DELETE 0x60 */ /* not used anymore */ +#define XLOG_GIST_PAGE_DELETE 0x60 /* * Backup Blk 0: updated page. 
@@ -59,6 +61,19 @@ typedef struct gistxlogPageSplit */ } gistxlogPageSplit; +typedef struct gistxlogPageDelete +{ + TransactionId deleteXid; /* xid stamped on the deleted page; last xid which could see it in a scan */ + OffsetNumber downlinkOffset; /* offset, on the internal page, of the downlink referencing this page */ +} gistxlogPageDelete; + +/* despite the name, gistxlogPage is not part of any xlog record */ +typedef struct gistxlogPage +{ + BlockNumber blkno; + int num; /* number of index tuples following */ +} gistxlogPage; + extern void gist_redo(XLogReaderState *record); extern void gist_desc(StringInfo buf, XLogReaderState *record); extern const char *gist_identify(uint8 info); diff --git a/src/test/regress/expected/gist.out b/src/test/regress/expected/gist.out index f5a2993aaf..5b92f08c74 100644 --- a/src/test/regress/expected/gist.out +++ b/src/test/regress/expected/gist.out @@ -27,9 +27,7 @@ insert into gist_point_tbl (id, p) select g+100000, point(g*10+1, g*10+1) from generate_series(1, 10000) g; -- To test vacuum, delete some entries from all over the index. delete from gist_point_tbl where id % 2 = 1; --- And also delete some concentration of values. (GiST doesn't currently --- attempt to delete pages even when they become empty, but if it did, this --- would exercise it) +-- And also delete some concentration of values. delete from gist_point_tbl where id < 10000; vacuum analyze gist_point_tbl; -- rebuild the index with a different fillfactor diff --git a/src/test/regress/sql/gist.sql b/src/test/regress/sql/gist.sql index bae722fe13..e66396e851 100644 --- a/src/test/regress/sql/gist.sql +++ b/src/test/regress/sql/gist.sql @@ -28,9 +28,7 @@ select g+100000, point(g*10+1, g*10+1) from generate_series(1, 10000) g; -- To test vacuum, delete some entries from all over the index. delete from gist_point_tbl where id % 2 = 1; --- And also delete some concentration of values. 
(GiST doesn't currently --- attempt to delete pages even when they become empty, but if it did, this --- would exercise it) +-- And also delete some concentration of values. delete from gist_point_tbl where id < 10000; vacuum analyze gist_point_tbl; -- 2.15.2 (Apple Git-101.1)